From eac520bc8ec027e9efb29254a6a0dbfa6491b5d5 Mon Sep 17 00:00:00 2001 From: dev Date: Mon, 18 May 2026 17:24:27 +0200 Subject: [PATCH] feat: use EMAC zone API for paginated genre scraping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously only ~17 IDs were extracted from initial HTML per genre page (before the "voir plus" button). The EMAC API at api.arte.tv exposes all concerts with full pagination (e.g. 131 Metal, 187 Pop Rock). Also reuses metadata from EMAC response (title, subtitle, thumbnail, expiry) — skipping redundant player API calls for genre concerts. Co-Authored-By: Claude Sonnet 4.6 --- arte_api.py | 86 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 74 insertions(+), 12 deletions(-) diff --git a/arte_api.py b/arte_api.py index f43cedc..65ec286 100644 --- a/arte_api.py +++ b/arte_api.py @@ -110,12 +110,54 @@ def _fetch_url(url: str, headers: dict | None = None) -> str: return r.read().decode("utf-8", errors="replace") -def _prog_ids_from_page(url: str) -> set[str]: +EMAC_ZONE_API = "https://api.arte.tv/api/emac/v4/fr/web/zones/{zone_id}/content?page={page}" +_EMAC_HEADERS = {"User-Agent": _HEADERS["User-Agent"], "Origin": "https://www.arte.tv"} +_ZONE_RE = re.compile(r"/api/emac/v4/fr/web/zones/([a-f0-9-]+)/content\?page=1") + + +def _zone_ids_from_page(url: str) -> list[str]: try: - return set(_PROG_RE.findall(_fetch_url(url))) + html = _fetch_url(url) + return _ZONE_RE.findall(html) except Exception as ex: logger.warning("Failed to fetch %s: %s", url, ex) - return set() + return [] + + +def _fetch_zone_concerts(zone_id: str, category: str) -> list[dict]: + """Fetch all concerts from a single EMAC zone (paginated).""" + concerts = [] + page = 1 + while True: + url = EMAC_ZONE_API.format(zone_id=zone_id, page=page) + try: + raw = _fetch_url(url, headers=_EMAC_HEADERS) + data = json.loads(raw) + except Exception as ex: + logger.warning("EMAC zone %s page %d failed: %s", zone_id, page, ex) + break + for item in data.get("data", []): + pid = item.get("programId") or "" + if not pid: + continue + img = item.get("mainImage") or {} + avail = item.get("availability") or {} + concerts.append({ + "id": pid, + "title": item.get("title") or "", + "subtitle": item.get("subtitle") or "", + "url": item.get("url") or f"https://www.arte.tv/fr/videos/{pid}/", + "thumbnail": img.get("url") or "", + "duration": item.get("duration"), + "description": item.get("shortDescription") or item.get("teaserText") or "", + "expiry": avail.get("end") or "", + }) + pagination = data.get("pagination", {}) + if page >= pagination.get("pages", 1): + break + page += 1 + logger.info(" %s (zone %s…) → %d concerts", category, zone_id[:8], len(concerts)) + return concerts def _metadata_for_pid(pid: str) -> dict | None: @@ -150,21 +192,41 @@ def _metadata_for_pid(pid: str) -> dict | None: def _fetch_all_sync() -> list[dict]: + by_id: dict[str, dict] = {} id_cats: dict[str, list[str]] = {} + + # Genre pages: use EMAC zone API (paginated) — gets ALL concerts, not just initial HTML for name, url in GENRE_PAGES: - ids = _prog_ids_from_page(url) - logger.info(" %s → %d IDs", name, len(ids)) - for pid in ids: + zone_ids = _zone_ids_from_page(url) + if not zone_ids: + logger.warning("No zone IDs found for %s (%s)", name, url) + continue + # All zones on a page are identical copies; use the first one + concerts = _fetch_zone_concerts(zone_ids[0], name) + for c in concerts: + pid = c["id"] id_cats.setdefault(pid, []).append(name) + if pid not in by_id: + by_id[pid] = c - all_ids: set[str] = set(id_cats) + # Extra pages: fall back to regex (no EMAC zones) for url in EXTRA_PAGES: - ids = _prog_ids_from_page(url) - logger.info(" %s → %d IDs", url.split("/fr/")[1], len(ids)) - all_ids |= ids - logger.info("Total unique programme IDs: %d", len(all_ids)) + try: + extra_ids = set(_PROG_RE.findall(_fetch_url(url))) + except Exception as ex: + logger.warning("Failed to fetch %s: %s", url, ex) + extra_ids = set() + new_ids = extra_ids - set(by_id) + logger.info(" %s → %d new IDs", url.split("/fr/")[1], len(new_ids)) + if new_ids: + with ThreadPoolExecutor(max_workers=10) as pool: + results = list(pool.map(_metadata_for_pid, sorted(new_ids))) + for meta in results: + if meta and meta.get("title"): + by_id[meta["id"]] = meta - concerts = _resolve_ids(all_ids) + logger.info("Total unique programme IDs: %d", len(by_id)) + concerts = list(by_id.values()) for c in concerts: c["categories"] = id_cats.get(c["id"], [])