feat: use EMAC zone API for paginated genre scraping

Previously only ~17 IDs were extracted from the initial HTML of each genre page (the items shown before the "voir plus" button). The EMAC API at api.arte.tv exposes all concerts with full pagination (e.g. 131 Metal, 187 Pop Rock).

Also reuses metadata from the EMAC response (title, subtitle, thumbnail, expiry), skipping redundant player API calls for genre concerts.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-18 17:24:27 +02:00
parent 56a210b4b7
commit eac520bc8e
1 changed files with 74 additions and 12 deletions
@@ -110,12 +110,54 @@ def _fetch_url(url: str, headers: dict | None = None) -> str:
        return r.read().decode("utf-8", errors="replace")


-def _prog_ids_from_page(url: str) -> set[str]:
+EMAC_ZONE_API = "https://api.arte.tv/api/emac/v4/fr/web/zones/{zone_id}/content?page={page}"  # URL template; zone_id and page are filled in with str.format
+_EMAC_HEADERS = {"User-Agent": _HEADERS["User-Agent"], "Origin": "https://www.arte.tv"}  # reuses the User-Agent from _HEADERS and adds an Origin header
+_ZONE_RE = re.compile(r"/api/emac/v4/fr/web/zones/([a-f0-9-]+)/content\?page=1")  # captures the zone id from page-1 zone content URLs
+
+
+def _zone_ids_from_page(url: str) -> list[str]:  # returns the zone ids captured by _ZONE_RE in the page fetched from url
    try:
-        return set(_PROG_RE.findall(_fetch_url(url)))
+        html = _fetch_url(url)
+        return _ZONE_RE.findall(html)  # one entry per match, in page order; duplicates are kept
    except Exception as ex:
        logger.warning("Failed to fetch %s: %s", url, ex)
-        return set()
+        return []  # best effort: a failed fetch is logged and yields no zones
+
+
+def _fetch_zone_concerts(zone_id: str, category: str) -> list[dict]:
+    """Fetch all concerts of one EMAC zone, page by page; if a page fails, stop and return what was collected."""
+    concerts = []  # one metadata dict per item that carries a programId
+    page = 1
+    while True:
+        url = EMAC_ZONE_API.format(zone_id=zone_id, page=page)
+        try:
+            raw = _fetch_url(url, headers=_EMAC_HEADERS)
+            data = json.loads(raw)
+        except Exception as ex:
+            logger.warning("EMAC zone %s page %d failed: %s", zone_id, page, ex)
+            break  # best effort: keep the pages already fetched
+        for item in data.get("data") or []:  # `or []` also covers an explicit JSON null, not only a missing key
+            pid = item.get("programId") or ""
+            if not pid:
+                continue  # entries without a programme id are skipped
+            img = item.get("mainImage") or {}
+            avail = item.get("availability") or {}
+            concerts.append({
+                "id": pid,
+                "title": item.get("title") or "",
+                "subtitle": item.get("subtitle") or "",
+                "url": item.get("url") or f"https://www.arte.tv/fr/videos/{pid}/",
+                "thumbnail": img.get("url") or "",
+                "duration": item.get("duration"),
+                "description": item.get("shortDescription") or item.get("teaserText") or "",
+                "expiry": avail.get("end") or "",
+            })
+        pagination = data.get("pagination") or {}  # a null "pagination" would otherwise raise AttributeError on .get
+        if page >= (pagination.get("pages") or 1):  # missing or null page count is treated as a single page
+            break
+        page += 1
+    logger.info("  %s (zone %s…) → %d concerts", category, zone_id[:8], len(concerts))  # category is only used for this log line
+    return concerts


 def _metadata_for_pid(pid: str) -> dict | None:
@@ -150,21 +192,41 @@ def _metadata_for_pid(pid: str) -> dict | None:


 def _fetch_all_sync() -> list[dict]:
+    by_id: dict[str, dict] = {}
    id_cats: dict[str, list[str]] = {}
+
+    # Genre pages: use EMAC zone API (paginated) — gets ALL concerts, not just initial HTML
    for name, url in GENRE_PAGES:
-        ids = _prog_ids_from_page(url)
-        logger.info("  %s → %d IDs", name, len(ids))
-        for pid in ids:
+        zone_ids = _zone_ids_from_page(url)
+        if not zone_ids:
+            logger.warning("No zone IDs found for %s (%s)", name, url)
+            continue
+        # All zones on a page are identical copies; use the first one
+        concerts = _fetch_zone_concerts(zone_ids[0], name)
+        for c in concerts:
+            pid = c["id"]
            id_cats.setdefault(pid, []).append(name)
+            if pid not in by_id:
+                by_id[pid] = c

-    all_ids: set[str] = set(id_cats)
+    # Extra pages: fall back to regex (no EMAC zones)
    for url in EXTRA_PAGES:
-        ids = _prog_ids_from_page(url)
-        logger.info("  %s → %d IDs", url.split("/fr/")[1], len(ids))
-        all_ids |= ids
-    logger.info("Total unique programme IDs: %d", len(all_ids))
+        try:
+            extra_ids = set(_PROG_RE.findall(_fetch_url(url)))
+        except Exception as ex:
+            logger.warning("Failed to fetch %s: %s", url, ex)
+            extra_ids = set()
+        new_ids = extra_ids - set(by_id)
+        logger.info("  %s → %d new IDs", url.split("/fr/")[1], len(new_ids))
+        if new_ids:
+            with ThreadPoolExecutor(max_workers=10) as pool:
+                results = list(pool.map(_metadata_for_pid, sorted(new_ids)))
+            for meta in results:
+                if meta and meta.get("title"):
+                    by_id[meta["id"]] = meta

-    concerts = _resolve_ids(all_ids)
+    logger.info("Total unique programme IDs: %d", len(by_id))
+    concerts = list(by_id.values())
    for c in concerts:
        c["categories"] = id_cats.get(c["id"], [])