diff --git a/arte_api.py b/arte_api.py index a865e12..cdb7883 100644 --- a/arte_api.py +++ b/arte_api.py @@ -1,95 +1,116 @@ -import asyncio -import logging +import re import time -import yt_dlp +import logging +import asyncio +import urllib.request +import json +from concurrent.futures import ThreadPoolExecutor logger = logging.getLogger(__name__) CACHE_TTL = 6 * 3600 _cache: dict = {"data": [], "ts": 0} -ARTE_CONCERT_URL = "https://www.arte.tv/fr/videos/RC-014034/arte-concert/" +PLAYER_API = "https://api.arte.tv/api/player/v2/config/fr/{pid}" + +GENRE_PAGES = [ + ("classique", "https://www.arte.tv/fr/arte-concert/classique/"), + ("jazz", "https://www.arte.tv/fr/arte-concert/jazz/"), + ("rock-pop", "https://www.arte.tv/fr/arte-concert/rock-pop/"), + ("opéra", "https://www.arte.tv/fr/arte-concert/opera/"), + ("musique du monde","https://www.arte.tv/fr/arte-concert/musique-du-monde/"), + ("électronique", "https://www.arte.tv/fr/arte-concert/electronica/"), + ("agenda", "https://www.arte.tv/fr/arte-concert/agenda/"), + ("", "https://www.arte.tv/fr/arte-concert/"), +] + +_HEADERS = { + "User-Agent": ( + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " + "Chrome/120.0.0.0 Safari/537.36" + ), + "RSC": "1", +} + +_PROG_RE = re.compile(r"\b(\d{6}-\d{3}-[A-Z])\b") -def _best_thumbnail(entry: dict) -> str: - thumbs = entry.get("thumbnails") or [] - if thumbs: - # prefer largest - sorted_thumbs = sorted(thumbs, key=lambda t: t.get("width", 0), reverse=True) - return sorted_thumbs[0].get("url", "") - return entry.get("thumbnail", "") +def _fetch_url(url: str, headers: dict | None = None) -> str: + req = urllib.request.Request(url, headers=headers or _HEADERS) + with urllib.request.urlopen(req, timeout=15) as r: + return r.read().decode("utf-8", errors="replace") -def _normalize(e: dict) -> dict | None: - if not e or not e.get("id"): +def _prog_ids_from_page(url: str) -> set[str]: + try: + html = _fetch_url(url) + return set(_PROG_RE.findall(html)) + except Exception as ex: + logger.warning("Failed to fetch %s: %s", url, ex) + return set() + + +def _metadata_for_pid(pid: str) -> dict | None: + try: + raw = _fetch_url( + PLAYER_API.format(pid=pid), + headers={"User-Agent": _HEADERS["User-Agent"], "Accept": "application/json"}, + ) + data = json.loads(raw) + attrs = data["data"]["attributes"] + meta = attrs["metadata"] + + url = (meta.get("link") or {}).get("url") or f"https://www.arte.tv/fr/videos/{pid}/" + imgs = meta.get("images") or [] + thumbnail = imgs[0]["url"] if imgs else "" + duration_s = (meta.get("duration") or {}).get("seconds") + rights = attrs.get("rights") or {} + + return { + "id": pid, + "title": meta.get("title", ""), + "subtitle": meta.get("subtitle", ""), + "url": url, + "thumbnail": thumbnail, + "duration": duration_s, + "description": meta.get("description", "") or "", + "expiry": rights.get("end", ""), + } + except Exception as ex: + logger.debug("Failed to get metadata for %s: %s", pid, ex) return None - video_id = e.get("id", "") - url = ( - e.get("url") - or e.get("webpage_url") - or f"https://www.arte.tv/fr/videos/{video_id}/" - ) - return { - "id": video_id, - "title": e.get("title", ""), - "url": url, - "thumbnail": _best_thumbnail(e), - "duration": e.get("duration"), - "description": e.get("description", ""), - "upload_date": e.get("upload_date", ""), - "release_timestamp": e.get("release_timestamp"), - } -def _fetch_sync() -> list: - concerts: list = [] - seen: set = set() +def _fetch_all_sync() -> list[dict]: + # 1 — collect programme IDs across all genre pages + all_ids: set[str] = set() + for _genre, url in GENRE_PAGES: + ids = _prog_ids_from_page(url) + logger.info(" %s → %d IDs", url.split("/fr/")[1], len(ids)) + all_ids |= ids - ydl_opts = { - "quiet": True, - "no_warnings": True, - "extract_flat": True, - "ignoreerrors": True, - } + logger.info("Total unique programme IDs: %d", len(all_ids)) - def _collect(entries: list, ydl, depth: int = 0): - for e in entries or []: - if not e: - continue - etype = e.get("_type", "") - # sub-collection → recurse one level - if etype in ("playlist", "url_transparent") and depth < 1: - sub_url = e.get("url") or e.get("webpage_url") - if sub_url: - try: - info = ydl.extract_info(sub_url, download=False) - if info: - _collect(info.get("entries", []), ydl, depth + 1) - except Exception as ex: - logger.debug("sub-collection error: %s", ex) - continue - entry = _normalize(e) - if entry and entry["id"] not in seen: - seen.add(entry["id"]) - concerts.append(entry) + # 2 — fetch metadata concurrently + concerts: list[dict] = [] + with ThreadPoolExecutor(max_workers=10) as pool: + results = list(pool.map(_metadata_for_pid, sorted(all_ids))) - with yt_dlp.YoutubeDL(ydl_opts) as ydl: - try: - info = ydl.extract_info(ARTE_CONCERT_URL, download=False) - if info: - _collect(info.get("entries", []), ydl) - except Exception as ex: - logger.error("fetch error: %s", ex) + for c in results: + if c and c["title"]: + concerts.append(c) + concerts.sort(key=lambda c: c.get("expiry", ""), reverse=True) return concerts -async def get_all_concerts() -> list: +async def get_all_concerts() -> list[dict]: now = time.time() if _cache["data"] and now - _cache["ts"] < CACHE_TTL: return _cache["data"] + loop = asyncio.get_event_loop() - data = await loop.run_in_executor(None, _fetch_sync) + data = await loop.run_in_executor(None, _fetch_all_sync) if data: _cache["data"] = data _cache["ts"] = now @@ -101,11 +122,15 @@ async def fetch_concerts(page: int = 1, search: str = "", page_size: int = 24) - filtered = all_c if search: q = search.lower() - filtered = [c for c in all_c if q in c["title"].lower() or q in c["description"].lower()] + filtered = [ + c for c in all_c + if q in c["title"].lower() + or q in c.get("subtitle", "").lower() + or q in c.get("description", "").lower() + ] start = (page - 1) * page_size - page_data = filtered[start : start + page_size] return { - "concerts": page_data, + "concerts": filtered[start : start + page_size], "total": len(filtered), "page": page, "page_size": page_size,