diff --git a/arte_api.py b/arte_api.py index f43cedc..65ec286 100644 --- a/arte_api.py +++ b/arte_api.py @@ -110,12 +110,54 @@ def _fetch_url(url: str, headers: dict | None = None) -> str: return r.read().decode("utf-8", errors="replace") -def _prog_ids_from_page(url: str) -> set[str]: +EMAC_ZONE_API = "https://api.arte.tv/api/emac/v4/fr/web/zones/{zone_id}/content?page={page}" +_EMAC_HEADERS = {"User-Agent": _HEADERS["User-Agent"], "Origin": "https://www.arte.tv"} +_ZONE_RE = re.compile(r"/api/emac/v4/fr/web/zones/([a-f0-9-]+)/content\?page=1") + + +def _zone_ids_from_page(url: str) -> list[str]: try: - return set(_PROG_RE.findall(_fetch_url(url))) + html = _fetch_url(url) + return _ZONE_RE.findall(html) except Exception as ex: logger.warning("Failed to fetch %s: %s", url, ex) - return set() + return [] + + +def _fetch_zone_concerts(zone_id: str, category: str) -> list[dict]: + """Fetch all concerts from a single EMAC zone (paginated).""" + concerts = [] + page = 1 + while True: + url = EMAC_ZONE_API.format(zone_id=zone_id, page=page) + try: + raw = _fetch_url(url, headers=_EMAC_HEADERS) + data = json.loads(raw) + except Exception as ex: + logger.warning("EMAC zone %s page %d failed: %s", zone_id, page, ex) + break + for item in data.get("data", []): + pid = item.get("programId") or "" + if not pid: + continue + img = item.get("mainImage") or {} + avail = item.get("availability") or {} + concerts.append({ + "id": pid, + "title": item.get("title") or "", + "subtitle": item.get("subtitle") or "", + "url": item.get("url") or f"https://www.arte.tv/fr/videos/{pid}/", + "thumbnail": img.get("url") or "", + "duration": item.get("duration"), + "description": item.get("shortDescription") or item.get("teaserText") or "", + "expiry": avail.get("end") or "", + }) + pagination = data.get("pagination", {}) + if page >= pagination.get("pages", 1): + break + page += 1 + logger.info(" %s (zone %s…) → %d concerts", category, zone_id[:8], len(concerts)) + return concerts def _metadata_for_pid(pid: str) -> dict | None: @@ -150,21 +192,41 @@ def _metadata_for_pid(pid: str) -> dict | None: def _fetch_all_sync() -> list[dict]: + by_id: dict[str, dict] = {} id_cats: dict[str, list[str]] = {} + + # Genre pages: use EMAC zone API (paginated) — gets ALL concerts, not just initial HTML for name, url in GENRE_PAGES: - ids = _prog_ids_from_page(url) - logger.info(" %s → %d IDs", name, len(ids)) - for pid in ids: + zone_ids = _zone_ids_from_page(url) + if not zone_ids: + logger.warning("No zone IDs found for %s (%s)", name, url) + continue + # All zones on a page are identical copies; use the first one + concerts = _fetch_zone_concerts(zone_ids[0], name) + for c in concerts: + pid = c["id"] id_cats.setdefault(pid, []).append(name) + if pid not in by_id: + by_id[pid] = c - all_ids: set[str] = set(id_cats) + # Extra pages: fall back to regex (no EMAC zones) for url in EXTRA_PAGES: - ids = _prog_ids_from_page(url) - logger.info(" %s → %d IDs", url.split("/fr/")[1], len(ids)) - all_ids |= ids - logger.info("Total unique programme IDs: %d", len(all_ids)) + try: + extra_ids = set(_PROG_RE.findall(_fetch_url(url))) + except Exception as ex: + logger.warning("Failed to fetch %s: %s", url, ex) + extra_ids = set() + new_ids = extra_ids - set(by_id) + logger.info(" %s → %d new IDs", url.split("/fr/")[1], len(new_ids)) + if new_ids: + with ThreadPoolExecutor(max_workers=10) as pool: + results = list(pool.map(_metadata_for_pid, sorted(new_ids))) + for meta in results: + if meta and meta.get("title"): + by_id[meta["id"]] = meta - concerts = _resolve_ids(all_ids) + logger.info("Total unique programme IDs: %d", len(by_id)) + concerts = list(by_id.values()) for c in concerts: c["categories"] = id_cats.get(c["id"], [])