fix: réécrire arte_api avec l'API player Arte et les pages RSC
Docker / docker (push) Successful in 1m21s

Abandon de l'approche yt-dlp playlist (URL non supportée).
Scrape les pages genre Arte Concert en RSC pour extraire les
programme IDs, puis fetch les métadonnées (titre, thumbnail,
durée, expiry) via l'API player v2 en parallèle (10 workers).
96 concerts disponibles.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
dev
2026-04-25 18:47:36 +02:00
parent eadc242173
commit 6bf6af4c73
+95 -70
View File
@@ -1,95 +1,116 @@
import asyncio
import logging
import re
import time
import yt_dlp
import logging
import asyncio
import urllib.request
import json
from concurrent.futures import ThreadPoolExecutor
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Lifetime of the concert cache, in seconds (6 hours); compared against
# time.time() differences in get_all_concerts.
CACHE_TTL = 6 * 3600
# In-memory cache: "data" holds the last non-empty concert list, "ts" the
# time.time() value taken when that refresh started.
_cache: dict = {"data": [], "ts": 0}
# Collection URL handed to yt-dlp's extract_info by _fetch_sync.
ARTE_CONCERT_URL = "https://www.arte.tv/fr/videos/RC-014034/arte-concert/"
# URL template of the player config endpoint; {pid} is filled with a
# programme ID by _metadata_for_pid.
PLAYER_API = "https://api.arte.tv/api/player/v2/config/fr/{pid}"
# (label, page URL) pairs scraped for programme IDs. _fetch_all_sync only
# uses the URL; the label is ignored there.
GENRE_PAGES = [
("classique", "https://www.arte.tv/fr/arte-concert/classique/"),
("jazz", "https://www.arte.tv/fr/arte-concert/jazz/"),
("rock-pop", "https://www.arte.tv/fr/arte-concert/rock-pop/"),
("opéra", "https://www.arte.tv/fr/arte-concert/opera/"),
("musique du monde","https://www.arte.tv/fr/arte-concert/musique-du-monde/"),
("électronique", "https://www.arte.tv/fr/arte-concert/electronica/"),
("agenda", "https://www.arte.tv/fr/arte-concert/agenda/"),
("", "https://www.arte.tv/fr/arte-concert/"),
]
# Default request headers, sent by _fetch_url when the caller supplies none.
# NOTE(review): the "RSC" header presumably asks the site for its React
# Server Components payload instead of full HTML — confirm against the site.
_HEADERS = {
"User-Agent": (
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
"Chrome/120.0.0.0 Safari/537.36"
),
"RSC": "1",
}
# Programme ID pattern: six digits, three digits and one uppercase letter,
# joined by hyphens and delimited by word boundaries (e.g. "123456-001-A").
_PROG_RE = re.compile(r"\b(\d{6}-\d{3}-[A-Z])\b")
def _best_thumbnail(entry: dict) -> str:
thumbs = entry.get("thumbnails") or []
if thumbs:
# prefer largest
sorted_thumbs = sorted(thumbs, key=lambda t: t.get("width", 0), reverse=True)
return sorted_thumbs[0].get("url", "")
return entry.get("thumbnail", "")
def _fetch_url(url: str, headers: dict | None = None) -> str:
req = urllib.request.Request(url, headers=headers or _HEADERS)
with urllib.request.urlopen(req, timeout=15) as r:
return r.read().decode("utf-8", errors="replace")
def _normalize(e: dict) -> dict | None:
if not e or not e.get("id"):
return None
video_id = e.get("id", "")
url = (
e.get("url")
or e.get("webpage_url")
or f"https://www.arte.tv/fr/videos/{video_id}/"
def _prog_ids_from_page(url: str) -> set[str]:
    """Collect the programme IDs that appear in the page at *url*.

    The page is downloaded with ``_fetch_url`` and every match of
    ``_PROG_RE`` in its text is kept, de-duplicated.

    Returns:
        The set of IDs found. If fetching or scanning raises any
        ``Exception``, a warning is logged and an empty set is returned.
    """
    try:
        page_text = _fetch_url(url)
        matches = _PROG_RE.findall(page_text)
    except Exception as ex:
        logger.warning("Failed to fetch %s: %s", url, ex)
        return set()
    return set(matches)
def _metadata_for_pid(pid: str) -> dict | None:
try:
raw = _fetch_url(
PLAYER_API.format(pid=pid),
headers={"User-Agent": _HEADERS["User-Agent"], "Accept": "application/json"},
)
data = json.loads(raw)
attrs = data["data"]["attributes"]
meta = attrs["metadata"]
url = (meta.get("link") or {}).get("url") or f"https://www.arte.tv/fr/videos/{pid}/"
imgs = meta.get("images") or []
thumbnail = imgs[0]["url"] if imgs else ""
duration_s = (meta.get("duration") or {}).get("seconds")
rights = attrs.get("rights") or {}
return {
"id": video_id,
"title": e.get("title", ""),
"id": pid,
"title": meta.get("title", ""),
"subtitle": meta.get("subtitle", ""),
"url": url,
"thumbnail": _best_thumbnail(e),
"duration": e.get("duration"),
"description": e.get("description", ""),
"upload_date": e.get("upload_date", ""),
"release_timestamp": e.get("release_timestamp"),
"thumbnail": thumbnail,
"duration": duration_s,
"description": meta.get("description", "") or "",
"expiry": rights.get("end", ""),
}
def _fetch_sync() -> list:
concerts: list = []
seen: set = set()
ydl_opts = {
"quiet": True,
"no_warnings": True,
"extract_flat": True,
"ignoreerrors": True,
}
def _collect(entries: list, ydl, depth: int = 0):
for e in entries or []:
if not e:
continue
etype = e.get("_type", "")
# sub-collection → recurse one level
if etype in ("playlist", "url_transparent") and depth < 1:
sub_url = e.get("url") or e.get("webpage_url")
if sub_url:
try:
info = ydl.extract_info(sub_url, download=False)
if info:
_collect(info.get("entries", []), ydl, depth + 1)
except Exception as ex:
logger.debug("sub-collection error: %s", ex)
continue
entry = _normalize(e)
if entry and entry["id"] not in seen:
seen.add(entry["id"])
concerts.append(entry)
logger.debug("Failed to get metadata for %s: %s", pid, ex)
return None
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
try:
info = ydl.extract_info(ARTE_CONCERT_URL, download=False)
if info:
_collect(info.get("entries", []), ydl)
except Exception as ex:
logger.error("fetch error: %s", ex)
def _fetch_all_sync() -> list[dict]:
    """Scrape every genre page and return metadata for the concerts found.

    Blocking: performs all network requests synchronously (metadata lookups
    run on a 10-thread pool). ``get_all_concerts`` calls it through an
    executor.

    Returns:
        The dicts produced by ``_metadata_for_pid`` for every programme ID
        found on ``GENRE_PAGES``, keeping only those with a non-empty title,
        sorted by ``"expiry"`` in descending order. Entries whose expiry is
        missing, empty or null sort last.
    """
    # 1 — collect programme IDs across all genre pages
    all_ids: set[str] = set()
    for _genre, url in GENRE_PAGES:
        ids = _prog_ids_from_page(url)
        # Log only the path after "/fr/"; fall back to the whole URL when
        # that marker is absent instead of raising IndexError.
        logger.info("  %s: %d IDs", url.partition("/fr/")[2] or url, len(ids))
        all_ids |= ids
    logger.info("Total unique programme IDs: %d", len(all_ids))

    # 2 — fetch metadata concurrently; sorting the IDs keeps the request
    # order (and thus the pre-sort result order) deterministic.
    with ThreadPoolExecutor(max_workers=10) as pool:
        results = list(pool.map(_metadata_for_pid, sorted(all_ids)))

    # Drop failed lookups (None) and entries without a title.
    concerts = [c for c in results if c and c.get("title")]

    # "expiry" may be present but null; coerce to "" so the sort never
    # compares None with str.
    concerts.sort(key=lambda c: c.get("expiry") or "", reverse=True)
    return concerts
async def get_all_concerts() -> list:
async def get_all_concerts() -> list[dict]:
now = time.time()
if _cache["data"] and now - _cache["ts"] < CACHE_TTL:
return _cache["data"]
loop = asyncio.get_event_loop()
data = await loop.run_in_executor(None, _fetch_sync)
data = await loop.run_in_executor(None, _fetch_all_sync)
if data:
_cache["data"] = data
_cache["ts"] = now
@@ -101,11 +122,15 @@ async def fetch_concerts(page: int = 1, search: str = "", page_size: int = 24) -
filtered = all_c
if search:
q = search.lower()
filtered = [c for c in all_c if q in c["title"].lower() or q in c["description"].lower()]
filtered = [
c for c in all_c
if q in c["title"].lower()
or q in c.get("subtitle", "").lower()
or q in c.get("description", "").lower()
]
start = (page - 1) * page_size
page_data = filtered[start : start + page_size]
return {
"concerts": page_data,
"concerts": filtered[start : start + page_size],
"total": len(filtered),
"page": page,
"page_size": page_size,