fix: réécrire arte_api avec l'API player Arte et les pages RSC
Docker / docker (push) Successful in 1m21s
Docker / docker (push) Successful in 1m21s
Abandon de l'approche playlist yt-dlp (URL non supportée). Le module scrape désormais les pages de genre d'Arte Concert (réponses RSC) pour en extraire les identifiants de programme, puis récupère les métadonnées (titre, thumbnail, durée, expiry) via l'API player v2 en parallèle (10 workers). 96 concerts disponibles.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+95
-70
@@ -1,95 +1,116 @@
|
|||||||
import asyncio
|
import re
|
||||||
import logging
|
|
||||||
import time
|
import time
|
||||||
import yt_dlp
|
import logging
|
||||||
|
import asyncio
|
||||||
|
import urllib.request
|
||||||
|
import json
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
CACHE_TTL = 6 * 3600
|
CACHE_TTL = 6 * 3600
|
||||||
_cache: dict = {"data": [], "ts": 0}
|
_cache: dict = {"data": [], "ts": 0}
|
||||||
|
|
||||||
ARTE_CONCERT_URL = "https://www.arte.tv/fr/videos/RC-014034/arte-concert/"
|
PLAYER_API = "https://api.arte.tv/api/player/v2/config/fr/{pid}"
|
||||||
|
|
||||||
|
GENRE_PAGES = [
|
||||||
|
("classique", "https://www.arte.tv/fr/arte-concert/classique/"),
|
||||||
|
("jazz", "https://www.arte.tv/fr/arte-concert/jazz/"),
|
||||||
|
("rock-pop", "https://www.arte.tv/fr/arte-concert/rock-pop/"),
|
||||||
|
("opéra", "https://www.arte.tv/fr/arte-concert/opera/"),
|
||||||
|
("musique du monde","https://www.arte.tv/fr/arte-concert/musique-du-monde/"),
|
||||||
|
("électronique", "https://www.arte.tv/fr/arte-concert/electronica/"),
|
||||||
|
("agenda", "https://www.arte.tv/fr/arte-concert/agenda/"),
|
||||||
|
("", "https://www.arte.tv/fr/arte-concert/"),
|
||||||
|
]
|
||||||
|
|
||||||
|
_HEADERS = {
|
||||||
|
"User-Agent": (
|
||||||
|
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
|
||||||
|
"Chrome/120.0.0.0 Safari/537.36"
|
||||||
|
),
|
||||||
|
"RSC": "1",
|
||||||
|
}
|
||||||
|
|
||||||
|
_PROG_RE = re.compile(r"\b(\d{6}-\d{3}-[A-Z])\b")
|
||||||
|
|
||||||
|
|
||||||
def _best_thumbnail(entry: dict) -> str:
|
def _fetch_url(url: str, headers: dict | None = None) -> str:
|
||||||
thumbs = entry.get("thumbnails") or []
|
req = urllib.request.Request(url, headers=headers or _HEADERS)
|
||||||
if thumbs:
|
with urllib.request.urlopen(req, timeout=15) as r:
|
||||||
# prefer largest
|
return r.read().decode("utf-8", errors="replace")
|
||||||
sorted_thumbs = sorted(thumbs, key=lambda t: t.get("width", 0), reverse=True)
|
|
||||||
return sorted_thumbs[0].get("url", "")
|
|
||||||
return entry.get("thumbnail", "")
|
|
||||||
|
|
||||||
|
|
||||||
def _normalize(e: dict) -> dict | None:
|
def _prog_ids_from_page(url: str) -> set[str]:
|
||||||
if not e or not e.get("id"):
|
try:
|
||||||
return None
|
html = _fetch_url(url)
|
||||||
video_id = e.get("id", "")
|
return set(_PROG_RE.findall(html))
|
||||||
url = (
|
except Exception as ex:
|
||||||
e.get("url")
|
logger.warning("Failed to fetch %s: %s", url, ex)
|
||||||
or e.get("webpage_url")
|
return set()
|
||||||
or f"https://www.arte.tv/fr/videos/{video_id}/"
|
|
||||||
|
|
||||||
|
def _metadata_for_pid(pid: str) -> dict | None:
|
||||||
|
try:
|
||||||
|
raw = _fetch_url(
|
||||||
|
PLAYER_API.format(pid=pid),
|
||||||
|
headers={"User-Agent": _HEADERS["User-Agent"], "Accept": "application/json"},
|
||||||
)
|
)
|
||||||
|
data = json.loads(raw)
|
||||||
|
attrs = data["data"]["attributes"]
|
||||||
|
meta = attrs["metadata"]
|
||||||
|
|
||||||
|
url = (meta.get("link") or {}).get("url") or f"https://www.arte.tv/fr/videos/{pid}/"
|
||||||
|
imgs = meta.get("images") or []
|
||||||
|
thumbnail = imgs[0]["url"] if imgs else ""
|
||||||
|
duration_s = (meta.get("duration") or {}).get("seconds")
|
||||||
|
rights = attrs.get("rights") or {}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"id": video_id,
|
"id": pid,
|
||||||
"title": e.get("title", ""),
|
"title": meta.get("title", ""),
|
||||||
|
"subtitle": meta.get("subtitle", ""),
|
||||||
"url": url,
|
"url": url,
|
||||||
"thumbnail": _best_thumbnail(e),
|
"thumbnail": thumbnail,
|
||||||
"duration": e.get("duration"),
|
"duration": duration_s,
|
||||||
"description": e.get("description", ""),
|
"description": meta.get("description", "") or "",
|
||||||
"upload_date": e.get("upload_date", ""),
|
"expiry": rights.get("end", ""),
|
||||||
"release_timestamp": e.get("release_timestamp"),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def _fetch_sync() -> list:
|
|
||||||
concerts: list = []
|
|
||||||
seen: set = set()
|
|
||||||
|
|
||||||
ydl_opts = {
|
|
||||||
"quiet": True,
|
|
||||||
"no_warnings": True,
|
|
||||||
"extract_flat": True,
|
|
||||||
"ignoreerrors": True,
|
|
||||||
}
|
|
||||||
|
|
||||||
def _collect(entries: list, ydl, depth: int = 0):
|
|
||||||
for e in entries or []:
|
|
||||||
if not e:
|
|
||||||
continue
|
|
||||||
etype = e.get("_type", "")
|
|
||||||
# sub-collection → recurse one level
|
|
||||||
if etype in ("playlist", "url_transparent") and depth < 1:
|
|
||||||
sub_url = e.get("url") or e.get("webpage_url")
|
|
||||||
if sub_url:
|
|
||||||
try:
|
|
||||||
info = ydl.extract_info(sub_url, download=False)
|
|
||||||
if info:
|
|
||||||
_collect(info.get("entries", []), ydl, depth + 1)
|
|
||||||
except Exception as ex:
|
except Exception as ex:
|
||||||
logger.debug("sub-collection error: %s", ex)
|
logger.debug("Failed to get metadata for %s: %s", pid, ex)
|
||||||
continue
|
return None
|
||||||
entry = _normalize(e)
|
|
||||||
if entry and entry["id"] not in seen:
|
|
||||||
seen.add(entry["id"])
|
|
||||||
concerts.append(entry)
|
|
||||||
|
|
||||||
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
|
||||||
try:
|
|
||||||
info = ydl.extract_info(ARTE_CONCERT_URL, download=False)
|
|
||||||
if info:
|
|
||||||
_collect(info.get("entries", []), ydl)
|
|
||||||
except Exception as ex:
|
|
||||||
logger.error("fetch error: %s", ex)
|
|
||||||
|
|
||||||
|
def _fetch_all_sync() -> list[dict]:
|
||||||
|
# 1 — collect programme IDs across all genre pages
|
||||||
|
all_ids: set[str] = set()
|
||||||
|
for _genre, url in GENRE_PAGES:
|
||||||
|
ids = _prog_ids_from_page(url)
|
||||||
|
logger.info(" %s → %d IDs", url.split("/fr/")[1], len(ids))
|
||||||
|
all_ids |= ids
|
||||||
|
|
||||||
|
logger.info("Total unique programme IDs: %d", len(all_ids))
|
||||||
|
|
||||||
|
# 2 — fetch metadata concurrently
|
||||||
|
concerts: list[dict] = []
|
||||||
|
with ThreadPoolExecutor(max_workers=10) as pool:
|
||||||
|
results = list(pool.map(_metadata_for_pid, sorted(all_ids)))
|
||||||
|
|
||||||
|
for c in results:
|
||||||
|
if c and c["title"]:
|
||||||
|
concerts.append(c)
|
||||||
|
|
||||||
|
concerts.sort(key=lambda c: c.get("expiry", ""), reverse=True)
|
||||||
return concerts
|
return concerts
|
||||||
|
|
||||||
|
|
||||||
async def get_all_concerts() -> list:
|
async def get_all_concerts() -> list[dict]:
|
||||||
now = time.time()
|
now = time.time()
|
||||||
if _cache["data"] and now - _cache["ts"] < CACHE_TTL:
|
if _cache["data"] and now - _cache["ts"] < CACHE_TTL:
|
||||||
return _cache["data"]
|
return _cache["data"]
|
||||||
|
|
||||||
loop = asyncio.get_event_loop()
|
loop = asyncio.get_event_loop()
|
||||||
data = await loop.run_in_executor(None, _fetch_sync)
|
data = await loop.run_in_executor(None, _fetch_all_sync)
|
||||||
if data:
|
if data:
|
||||||
_cache["data"] = data
|
_cache["data"] = data
|
||||||
_cache["ts"] = now
|
_cache["ts"] = now
|
||||||
@@ -101,11 +122,15 @@ async def fetch_concerts(page: int = 1, search: str = "", page_size: int = 24) -
|
|||||||
filtered = all_c
|
filtered = all_c
|
||||||
if search:
|
if search:
|
||||||
q = search.lower()
|
q = search.lower()
|
||||||
filtered = [c for c in all_c if q in c["title"].lower() or q in c["description"].lower()]
|
filtered = [
|
||||||
|
c for c in all_c
|
||||||
|
if q in c["title"].lower()
|
||||||
|
or q in c.get("subtitle", "").lower()
|
||||||
|
or q in c.get("description", "").lower()
|
||||||
|
]
|
||||||
start = (page - 1) * page_size
|
start = (page - 1) * page_size
|
||||||
page_data = filtered[start : start + page_size]
|
|
||||||
return {
|
return {
|
||||||
"concerts": page_data,
|
"concerts": filtered[start : start + page_size],
|
||||||
"total": len(filtered),
|
"total": len(filtered),
|
||||||
"page": page,
|
"page": page,
|
||||||
"page_size": page_size,
|
"page_size": page_size,
|
||||||
|
|||||||
Reference in New Issue
Block a user