fix: réécrire arte_api avec l'API player Arte et les pages RSC

Abandon de l'approche playlist yt-dlp (URL non supportée). Le module scrape désormais les pages de genre Arte Concert au format RSC pour en extraire les identifiants de programme, puis récupère les métadonnées (titre, thumbnail, durée, expiration) via l'API player v2 en parallèle (10 workers). 96 concerts disponibles.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-25 18:47:36 +02:00
parent eadc242173
commit 6bf6af4c73
1 changed file with 95 additions and 70 deletions
@@ -1,95 +1,116 @@
-import asyncio
+import re
 import logging
 import time
-import yt_dlp
+import logging
 import asyncio
 import urllib.request
 import json
 from concurrent.futures import ThreadPoolExecutor
 logger = logging.getLogger(__name__)
 CACHE_TTL = 6 * 3600
 _cache: dict = {"data": [], "ts": 0}
-ARTE_CONCERT_URL = "https://www.arte.tv/fr/videos/RC-014034/arte-concert/"
+PLAYER_API = "https://api.arte.tv/api/player/v2/config/fr/{pid}"
 GENRE_PAGES = [
    ("classique",       "https://www.arte.tv/fr/arte-concert/classique/"),
    ("jazz",            "https://www.arte.tv/fr/arte-concert/jazz/"),
    ("rock-pop",        "https://www.arte.tv/fr/arte-concert/rock-pop/"),
    ("opéra",          "https://www.arte.tv/fr/arte-concert/opera/"),
    ("musique du monde","https://www.arte.tv/fr/arte-concert/musique-du-monde/"),
    ("électronique",   "https://www.arte.tv/fr/arte-concert/electronica/"),
    ("agenda",          "https://www.arte.tv/fr/arte-concert/agenda/"),
    ("",                "https://www.arte.tv/fr/arte-concert/"),
 ]
 _HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "RSC": "1",
 }
 _PROG_RE = re.compile(r"\b(\d{6}-\d{3}-[A-Z])\b")
-def _best_thumbnail(entry: dict) -> str:
+def _fetch_url(url: str, headers: dict | None = None) -> str:
-    thumbs = entry.get("thumbnails") or []
+    req = urllib.request.Request(url, headers=headers or _HEADERS)
-    if thumbs:
+    with urllib.request.urlopen(req, timeout=15) as r:
-        # prefer largest
+        return r.read().decode("utf-8", errors="replace")
        sorted_thumbs = sorted(thumbs, key=lambda t: t.get("width", 0), reverse=True)
        return sorted_thumbs[0].get("url", "")
    return entry.get("thumbnail", "")
-def _normalize(e: dict) -> dict | None:
+def _prog_ids_from_page(url: str) -> set[str]:
-    if not e or not e.get("id"):
+    try:
        html = _fetch_url(url)
        return set(_PROG_RE.findall(html))
    except Exception as ex:
        logger.warning("Failed to fetch %s: %s", url, ex)
        return set()
 def _metadata_for_pid(pid: str) -> dict | None:
    try:
        raw = _fetch_url(
            PLAYER_API.format(pid=pid),
            headers={"User-Agent": _HEADERS["User-Agent"], "Accept": "application/json"},
        )
        data = json.loads(raw)
        attrs = data["data"]["attributes"]
        meta = attrs["metadata"]
        url = (meta.get("link") or {}).get("url") or f"https://www.arte.tv/fr/videos/{pid}/"
        imgs = meta.get("images") or []
        thumbnail = imgs[0]["url"] if imgs else ""
        duration_s = (meta.get("duration") or {}).get("seconds")
        rights = attrs.get("rights") or {}
        return {
            "id": pid,
            "title": meta.get("title", ""),
            "subtitle": meta.get("subtitle", ""),
            "url": url,
            "thumbnail": thumbnail,
            "duration": duration_s,
            "description": meta.get("description", "") or "",
            "expiry": rights.get("end", ""),
        }
    except Exception as ex:
        logger.debug("Failed to get metadata for %s: %s", pid, ex)
        return None
    video_id = e.get("id", "")
    url = (
        e.get("url")
        or e.get("webpage_url")
        or f"https://www.arte.tv/fr/videos/{video_id}/"
    )
    return {
        "id": video_id,
        "title": e.get("title", ""),
        "url": url,
        "thumbnail": _best_thumbnail(e),
        "duration": e.get("duration"),
        "description": e.get("description", ""),
        "upload_date": e.get("upload_date", ""),
        "release_timestamp": e.get("release_timestamp"),
    }
-def _fetch_sync() -> list:
+def _fetch_all_sync() -> list[dict]:
-    concerts: list = []
+    # 1 — collect programme IDs across all genre pages
-    seen: set = set()
+    all_ids: set[str] = set()
    for _genre, url in GENRE_PAGES:
        ids = _prog_ids_from_page(url)
        logger.info("  %s → %d IDs", url.split("/fr/")[1], len(ids))
        all_ids |= ids
-    ydl_opts = {
+    logger.info("Total unique programme IDs: %d", len(all_ids))
        "quiet": True,
        "no_warnings": True,
        "extract_flat": True,
        "ignoreerrors": True,
    }
-    def _collect(entries: list, ydl, depth: int = 0):
+    # 2 — fetch metadata concurrently
-        for e in entries or []:
+    concerts: list[dict] = []
-            if not e:
+    with ThreadPoolExecutor(max_workers=10) as pool:
-                continue
+        results = list(pool.map(_metadata_for_pid, sorted(all_ids)))
            etype = e.get("_type", "")
            # sub-collection → recurse one level
            if etype in ("playlist", "url_transparent") and depth < 1:
                sub_url = e.get("url") or e.get("webpage_url")
                if sub_url:
                    try:
                        info = ydl.extract_info(sub_url, download=False)
                        if info:
                            _collect(info.get("entries", []), ydl, depth + 1)
                    except Exception as ex:
                        logger.debug("sub-collection error: %s", ex)
                continue
            entry = _normalize(e)
            if entry and entry["id"] not in seen:
                seen.add(entry["id"])
                concerts.append(entry)
-    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+    for c in results:
-        try:
+        if c and c["title"]:
-            info = ydl.extract_info(ARTE_CONCERT_URL, download=False)
+            concerts.append(c)
            if info:
                _collect(info.get("entries", []), ydl)
        except Exception as ex:
            logger.error("fetch error: %s", ex)
    concerts.sort(key=lambda c: c.get("expiry", ""), reverse=True)
    return concerts
-async def get_all_concerts() -> list:
+async def get_all_concerts() -> list[dict]:
    now = time.time()
    if _cache["data"] and now - _cache["ts"] < CACHE_TTL:
        return _cache["data"]
    loop = asyncio.get_event_loop()
-    data = await loop.run_in_executor(None, _fetch_sync)
+    data = await loop.run_in_executor(None, _fetch_all_sync)
    if data:
        _cache["data"] = data
        _cache["ts"] = now
@@ -101,11 +122,15 @@ async def fetch_concerts(page: int = 1, search: str = "", page_size: int = 24) -
    filtered = all_c
    if search:
        q = search.lower()
-        filtered = [c for c in all_c if q in c["title"].lower() or q in c["description"].lower()]
+        filtered = [
            c for c in all_c
            if q in c["title"].lower()
            or q in c.get("subtitle", "").lower()
            or q in c.get("description", "").lower()
        ]
    start = (page - 1) * page_size
    page_data = filtered[start : start + page_size]
    return {
-        "concerts": page_data,
+        "concerts": filtered[start : start + page_size],
        "total": len(filtered),
        "page": page,
        "page_size": page_size,