From 6bf6af4c734924ce55163e7e238c6ce17e0fbae3 Mon Sep 17 00:00:00 2001 From: dev Date: Sat, 25 Apr 2026 18:47:36 +0200 Subject: [PATCH] =?UTF-8?q?fix:=20r=C3=A9=C3=A9crire=20arte=5Fapi=20avec?= =?UTF-8?q?=20l'API=20player=20Arte=20et=20les=20pages=20RSC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Abandon de l'approche yt-dlp playlist (URL non supportée). Scrape les pages genre Arte Concert en RSC pour extraire les programme IDs, puis fetch les métadonnées (titre, thumbnail, durée, expiry) via l'API player v2 en parallèle (10 workers). 96 concerts disponibles. Co-Authored-By: Claude Sonnet 4.6 --- arte_api.py | 165 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 95 insertions(+), 70 deletions(-) diff --git a/arte_api.py b/arte_api.py index a865e12..cdb7883 100644 --- a/arte_api.py +++ b/arte_api.py @@ -1,95 +1,116 @@ -import asyncio -import logging +import re import time -import yt_dlp +import logging +import asyncio +import urllib.request +import json +from concurrent.futures import ThreadPoolExecutor logger = logging.getLogger(__name__) CACHE_TTL = 6 * 3600 _cache: dict = {"data": [], "ts": 0} -ARTE_CONCERT_URL = "https://www.arte.tv/fr/videos/RC-014034/arte-concert/" +PLAYER_API = "https://api.arte.tv/api/player/v2/config/fr/{pid}" + +GENRE_PAGES = [ + ("classique", "https://www.arte.tv/fr/arte-concert/classique/"), + ("jazz", "https://www.arte.tv/fr/arte-concert/jazz/"), + ("rock-pop", "https://www.arte.tv/fr/arte-concert/rock-pop/"), + ("opéra", "https://www.arte.tv/fr/arte-concert/opera/"), + ("musique du monde","https://www.arte.tv/fr/arte-concert/musique-du-monde/"), + ("électronique", "https://www.arte.tv/fr/arte-concert/electronica/"), + ("agenda", "https://www.arte.tv/fr/arte-concert/agenda/"), + ("", "https://www.arte.tv/fr/arte-concert/"), +] + +_HEADERS = { + "User-Agent": ( + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " + "Chrome/120.0.0.0 Safari/537.36" + ), + "RSC": "1", +} + +_PROG_RE = re.compile(r"\b(\d{6}-\d{3}-[A-Z])\b") -def _best_thumbnail(entry: dict) -> str: - thumbs = entry.get("thumbnails") or [] - if thumbs: - # prefer largest - sorted_thumbs = sorted(thumbs, key=lambda t: t.get("width", 0), reverse=True) - return sorted_thumbs[0].get("url", "") - return entry.get("thumbnail", "") +def _fetch_url(url: str, headers: dict | None = None) -> str: + req = urllib.request.Request(url, headers=headers or _HEADERS) + with urllib.request.urlopen(req, timeout=15) as r: + return r.read().decode("utf-8", errors="replace") -def _normalize(e: dict) -> dict | None: - if not e or not e.get("id"): +def _prog_ids_from_page(url: str) -> set[str]: + try: + html = _fetch_url(url) + return set(_PROG_RE.findall(html)) + except Exception as ex: + logger.warning("Failed to fetch %s: %s", url, ex) + return set() + + +def _metadata_for_pid(pid: str) -> dict | None: + try: + raw = _fetch_url( + PLAYER_API.format(pid=pid), + headers={"User-Agent": _HEADERS["User-Agent"], "Accept": "application/json"}, + ) + data = json.loads(raw) + attrs = data["data"]["attributes"] + meta = attrs["metadata"] + + url = (meta.get("link") or {}).get("url") or f"https://www.arte.tv/fr/videos/{pid}/" + imgs = meta.get("images") or [] + thumbnail = imgs[0]["url"] if imgs else "" + duration_s = (meta.get("duration") or {}).get("seconds") + rights = attrs.get("rights") or {} + + return { + "id": pid, + "title": meta.get("title", ""), + "subtitle": meta.get("subtitle", ""), + "url": url, + "thumbnail": thumbnail, + "duration": duration_s, + "description": meta.get("description", "") or "", + "expiry": rights.get("end", ""), + } + except Exception as ex: + logger.debug("Failed to get metadata for %s: %s", pid, ex) return None - video_id = e.get("id", "") - url = ( - e.get("url") - or e.get("webpage_url") - or f"https://www.arte.tv/fr/videos/{video_id}/" - ) - return { - "id": video_id, - "title": e.get("title", ""), - "url": url, - "thumbnail": _best_thumbnail(e), - "duration": e.get("duration"), - "description": e.get("description", ""), - "upload_date": e.get("upload_date", ""), - "release_timestamp": e.get("release_timestamp"), - } -def _fetch_sync() -> list: - concerts: list = [] - seen: set = set() +def _fetch_all_sync() -> list[dict]: + # 1 — collect programme IDs across all genre pages + all_ids: set[str] = set() + for _genre, url in GENRE_PAGES: + ids = _prog_ids_from_page(url) + logger.info(" %s → %d IDs", url.split("/fr/")[1], len(ids)) + all_ids |= ids - ydl_opts = { - "quiet": True, - "no_warnings": True, - "extract_flat": True, - "ignoreerrors": True, - } + logger.info("Total unique programme IDs: %d", len(all_ids)) - def _collect(entries: list, ydl, depth: int = 0): - for e in entries or []: - if not e: - continue - etype = e.get("_type", "") - # sub-collection → recurse one level - if etype in ("playlist", "url_transparent") and depth < 1: - sub_url = e.get("url") or e.get("webpage_url") - if sub_url: - try: - info = ydl.extract_info(sub_url, download=False) - if info: - _collect(info.get("entries", []), ydl, depth + 1) - except Exception as ex: - logger.debug("sub-collection error: %s", ex) - continue - entry = _normalize(e) - if entry and entry["id"] not in seen: - seen.add(entry["id"]) - concerts.append(entry) + # 2 — fetch metadata concurrently + concerts: list[dict] = [] + with ThreadPoolExecutor(max_workers=10) as pool: + results = list(pool.map(_metadata_for_pid, sorted(all_ids))) - with yt_dlp.YoutubeDL(ydl_opts) as ydl: - try: - info = ydl.extract_info(ARTE_CONCERT_URL, download=False) - if info: - _collect(info.get("entries", []), ydl) - except Exception as ex: - logger.error("fetch error: %s", ex) + for c in results: + if c and c["title"]: + concerts.append(c) + concerts.sort(key=lambda c: c.get("expiry", ""), reverse=True) return concerts -async def get_all_concerts() -> list: +async def get_all_concerts() -> list[dict]: now = time.time() if _cache["data"] and now - _cache["ts"] < CACHE_TTL: return _cache["data"] + loop = asyncio.get_event_loop() - data = await loop.run_in_executor(None, _fetch_sync) + data = await loop.run_in_executor(None, _fetch_all_sync) if data: _cache["data"] = data _cache["ts"] = now @@ -101,11 +122,15 @@ async def fetch_concerts(page: int = 1, search: str = "", page_size: int = 24) - filtered = all_c if search: q = search.lower() - filtered = [c for c in all_c if q in c["title"].lower() or q in c["description"].lower()] + filtered = [ + c for c in all_c + if q in c["title"].lower() + or q in c.get("subtitle", "").lower() + or q in c.get("description", "").lower() + ] start = (page - 1) * page_size - page_data = filtered[start : start + page_size] return { - "concerts": page_data, + "concerts": filtered[start : start + page_size], "total": len(filtered), "page": page, "page_size": page_size,