feat: use EMAC zone API for paginated genre scraping
Docker / docker (push) Successful in 1m29s

Previously only ~17 IDs were extracted from initial HTML per genre page
(before the "voir plus" button). The EMAC API at api.arte.tv exposes all
concerts with full pagination (e.g. 131 Metal, 187 Pop Rock).

Also reuses metadata from EMAC response (title, subtitle, thumbnail,
expiry) — skipping redundant player API calls for genre concerts.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
dev
2026-05-18 17:24:27 +02:00
parent 56a210b4b7
commit eac520bc8e
+74 -12
View File
@@ -110,12 +110,54 @@ def _fetch_url(url: str, headers: dict | None = None) -> str:
return r.read().decode("utf-8", errors="replace")
# NOTE(review): this span appears to be a diff rendered without +/- markers or
# indentation. The `_prog_ids_from_page` header and the two `set`-returning
# lines look like the pre-change version of the function; the
# `_zone_ids_from_page` header and the `list`-returning lines look like the
# post-change version — confirm against the repository before relying on it.
def _prog_ids_from_page(url: str) -> set[str]:
# URL template for one page of an EMAC zone's content; `zone_id` and `page`
# are filled in with str.format by the caller.
EMAC_ZONE_API = "https://api.arte.tv/api/emac/v4/fr/web/zones/{zone_id}/content?page={page}"
# Headers for EMAC requests: reuses the User-Agent from `_HEADERS` and adds an
# Origin header pointing at www.arte.tv.
_EMAC_HEADERS = {"User-Agent": _HEADERS["User-Agent"], "Origin": "https://www.arte.tv"}
# Captures the zone id (lowercase hex digits and dashes) from every first-page
# (`page=1`) EMAC zone content path embedded in a page's text.
_ZONE_RE = re.compile(r"/api/emac/v4/fr/web/zones/([a-f0-9-]+)/content\?page=1")
# Return every zone id that `_ZONE_RE` finds in the text fetched from `url`,
# in match order with duplicates kept (re.findall semantics).
def _zone_ids_from_page(url: str) -> list[str]:
try:
return set(_PROG_RE.findall(_fetch_url(url)))
html = _fetch_url(url)
return _ZONE_RE.findall(html)
# Best-effort: any failure while fetching or matching is logged as a warning
# and an empty result is returned instead of propagating the exception.
except Exception as ex:
logger.warning("Failed to fetch %s: %s", url, ex)
return set()
return []
def _concert_from_item(item: dict) -> dict | None:
    """Map one EMAC zone item to a concert dict, or None if it has no programId."""
    pid = item.get("programId") or ""
    if not pid:
        return None
    # These fields may be present but null in the payload; fall back to an
    # empty mapping so the lookups below never hit None.
    img = item.get("mainImage") or {}
    avail = item.get("availability") or {}
    return {
        "id": pid,
        "title": item.get("title") or "",
        "subtitle": item.get("subtitle") or "",
        "url": item.get("url") or f"https://www.arte.tv/fr/videos/{pid}/",
        "thumbnail": img.get("url") or "",
        "duration": item.get("duration"),
        "description": item.get("shortDescription") or item.get("teaserText") or "",
        "expiry": avail.get("end") or "",
    }


def _fetch_zone_concerts(zone_id: str, category: str) -> list[dict]:
    """Fetch all concerts from a single EMAC zone (paginated).

    Pages are requested one at a time, starting at page 1, until the page
    count reported in the response's ``pagination.pages`` field is reached.
    The function is best-effort: a failed request, invalid JSON, or an
    unexpected payload shape stops pagination with a warning, and whatever
    was collected so far is returned.

    Args:
        zone_id: Zone identifier substituted into ``EMAC_ZONE_API``.
        category: Label used only in the summary log line.

    Returns:
        One dict per item that carries a ``programId``, with the keys
        ``id``, ``title``, ``subtitle``, ``url``, ``thumbnail``,
        ``duration``, ``description`` and ``expiry``.
    """
    concerts: list[dict] = []
    page = 1
    while True:
        url = EMAC_ZONE_API.format(zone_id=zone_id, page=page)
        try:
            raw = _fetch_url(url, headers=_EMAC_HEADERS)
            data = json.loads(raw)
        except Exception as ex:
            logger.warning("EMAC zone %s page %d failed: %s", zone_id, page, ex)
            break
        if not isinstance(data, dict):
            # Valid JSON but not an object: nothing usable on this page.
            logger.warning(
                "EMAC zone %s page %d failed: %s",
                zone_id, page, "unexpected payload type",
            )
            break

        # `or []` covers a key that is present but null, which a plain
        # .get() default would let through as None.
        items = data.get("data") or []
        if not isinstance(items, list):
            items = []
        for item in items:
            if not isinstance(item, dict):
                continue
            concert = _concert_from_item(item)
            if concert is not None:
                concerts.append(concert)

        pagination = data.get("pagination") or {}
        if not isinstance(pagination, dict):
            pagination = {}
        total_pages = pagination.get("pages") or 1
        # A missing, null or non-integer page count is treated as a single page.
        if not isinstance(total_pages, int) or page >= total_pages:
            break
        page += 1

    logger.info(" %s (zone %s…) → %d concerts", category, zone_id[:8], len(concerts))
    return concerts
def _metadata_for_pid(pid: str) -> dict | None:
@@ -150,21 +192,41 @@ def _metadata_for_pid(pid: str) -> dict | None:
def _fetch_all_sync() -> list[dict]:
by_id: dict[str, dict] = {}
id_cats: dict[str, list[str]] = {}
# Genre pages: use EMAC zone API (paginated) — gets ALL concerts, not just initial HTML
for name, url in GENRE_PAGES:
ids = _prog_ids_from_page(url)
logger.info(" %s%d IDs", name, len(ids))
for pid in ids:
zone_ids = _zone_ids_from_page(url)
if not zone_ids:
logger.warning("No zone IDs found for %s (%s)", name, url)
continue
# All zones on a page are identical copies; use the first one
concerts = _fetch_zone_concerts(zone_ids[0], name)
for c in concerts:
pid = c["id"]
id_cats.setdefault(pid, []).append(name)
if pid not in by_id:
by_id[pid] = c
all_ids: set[str] = set(id_cats)
# Extra pages: fall back to regex (no EMAC zones)
for url in EXTRA_PAGES:
ids = _prog_ids_from_page(url)
logger.info(" %s%d IDs", url.split("/fr/")[1], len(ids))
all_ids |= ids
logger.info("Total unique programme IDs: %d", len(all_ids))
try:
extra_ids = set(_PROG_RE.findall(_fetch_url(url)))
except Exception as ex:
logger.warning("Failed to fetch %s: %s", url, ex)
extra_ids = set()
new_ids = extra_ids - set(by_id)
logger.info(" %s%d new IDs", url.split("/fr/")[1], len(new_ids))
if new_ids:
with ThreadPoolExecutor(max_workers=10) as pool:
results = list(pool.map(_metadata_for_pid, sorted(new_ids)))
for meta in results:
if meta and meta.get("title"):
by_id[meta["id"]] = meta
concerts = _resolve_ids(all_ids)
logger.info("Total unique programme IDs: %d", len(by_id))
concerts = list(by_id.values())
for c in concerts:
c["categories"] = id_cats.get(c["id"], [])