6bf6af4c73
Docker / docker (push) Successful in 1m21s
Abandon de l'approche yt-dlp playlist (URL non supportée). Scrape les pages genre Arte Concert en RSC pour extraire les programme IDs, puis fetch les métadonnées (titre, thumbnail, durée, expiry) via l'API player v2 en parallèle (10 workers). 96 concerts disponibles. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
145 lines
4.4 KiB
Python
145 lines
4.4 KiB
Python
import re
|
|
import time
|
|
import logging
|
|
import asyncio
|
|
import urllib.request
|
|
import json
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
|
|
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Lifetime, in seconds, of a cached concert list (six hours).
CACHE_TTL = 6 * 60 * 60

# In-memory cache: "data" is the concert list, "ts" the time.time() value
# recorded when it was stored (0 means "never filled / force refresh").
_cache: dict = {"data": [], "ts": 0}

# Player API endpoint template; "{pid}" is filled in with a programme ID.
PLAYER_API = "https://api.arte.tv/api/player/v2/config/fr/{pid}"

# (label, page URL) pairs for every Arte Concert page scanned for programme IDs.
GENRE_PAGES = [
    ("classique", "https://www.arte.tv/fr/arte-concert/classique/"),
    ("jazz", "https://www.arte.tv/fr/arte-concert/jazz/"),
    ("rock-pop", "https://www.arte.tv/fr/arte-concert/rock-pop/"),
    ("opéra", "https://www.arte.tv/fr/arte-concert/opera/"),
    ("musique du monde", "https://www.arte.tv/fr/arte-concert/musique-du-monde/"),
    ("électronique", "https://www.arte.tv/fr/arte-concert/electronica/"),
    ("agenda", "https://www.arte.tv/fr/arte-concert/agenda/"),
    ("", "https://www.arte.tv/fr/arte-concert/"),
]

# Default request headers used when a caller supplies none.
# NOTE(review): "RSC": "1" presumably asks the site for its RSC payload
# instead of a full HTML document — confirm against the site's behaviour.
_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "RSC": "1",
}

# Programme ID shape: six digits, three digits and one capital letter,
# hyphen-separated (e.g. "123456-001-A").
_PROG_RE = re.compile(r"\b(\d{6}-\d{3}-[A-Z])\b")
|
|
|
|
|
|
def _fetch_url(url: str, headers: dict | None = None) -> str:
    """Download *url* and return the response body as text.

    Args:
        url: Address to request.
        headers: Request headers. When omitted (or empty), the module-level
            ``_HEADERS`` are sent instead.

    Returns:
        The body decoded as UTF-8; undecodable bytes are replaced rather
        than raising.

    Any error raised by ``urllib.request.urlopen`` (including the 15-second
    timeout) propagates to the caller.
    """
    request = urllib.request.Request(url, headers=headers or _HEADERS)
    with urllib.request.urlopen(request, timeout=15) as response:
        payload = response.read()
    return payload.decode("utf-8", errors="replace")
|
|
|
|
|
|
def _prog_ids_from_page(url: str) -> set[str]:
    """Return the distinct programme IDs found in the page at *url*.

    Best effort: if fetching or scanning the page fails for any reason, the
    failure is logged as a warning and an empty set is returned.
    """
    found: set[str] = set()
    try:
        found.update(_PROG_RE.findall(_fetch_url(url)))
    except Exception as ex:
        logger.warning("Failed to fetch %s: %s", url, ex)
    return found
|
|
|
|
|
|
def _metadata_for_pid(pid: str) -> dict | None:
    """Fetch player-API metadata for one programme and flatten it to a dict.

    Args:
        pid: Programme ID substituted into ``PLAYER_API``.

    Returns:
        A dict with the keys ``id``, ``title``, ``subtitle``, ``url``,
        ``thumbnail``, ``duration``, ``description`` and ``expiry``, or
        ``None`` when the request, the JSON decoding or the payload lookup
        fails (the failure is logged at debug level).

        Every text field is always a ``str``: a JSON ``null`` is coerced to
        ``""`` so that callers can call ``.lower()`` on it or sort by it
        without hitting ``None``. ``duration`` is passed through unchanged
        and may be ``None``.
    """
    try:
        raw = _fetch_url(
            PLAYER_API.format(pid=pid),
            headers={"User-Agent": _HEADERS["User-Agent"], "Accept": "application/json"},
        )
        attrs = json.loads(raw)["data"]["attributes"]
        meta = attrs["metadata"]

        # Each "x or {}" / "x or []" covers both a missing key and a null value.
        link_url = (meta.get("link") or {}).get("url")
        images = meta.get("images") or []
        rights = attrs.get("rights") or {}

        return {
            "id": pid,
            "title": meta.get("title") or "",
            "subtitle": meta.get("subtitle") or "",
            # Fall back to the public video page when the API gives no link.
            "url": link_url or f"https://www.arte.tv/fr/videos/{pid}/",
            # A first image without a URL yields an empty thumbnail instead
            # of discarding the whole programme.
            "thumbnail": (images[0].get("url") or "") if images else "",
            "duration": (meta.get("duration") or {}).get("seconds"),
            "description": meta.get("description") or "",
            "expiry": rights.get("end") or "",
        }
    except Exception as ex:
        # Best effort: one bad programme must not abort the whole batch.
        logger.debug("Failed to get metadata for %s: %s", pid, ex)
        return None
|
|
|
|
|
|
def _fetch_all_sync() -> list[dict]:
    """Scrape every genre page and return metadata for all concerts found.

    Blocking: performs network requests, so async callers should run it in
    an executor.

    Returns:
        Concert dicts as produced by ``_metadata_for_pid``, restricted to
        those with a non-empty title and sorted by their ``expiry`` value in
        descending order (entries without an expiry come last).
    """
    # 1 — collect programme IDs across all genre pages
    all_ids: set[str] = set()
    for _genre, url in GENRE_PAGES:
        ids = _prog_ids_from_page(url)
        logger.info(" %s → %d IDs", url.split("/fr/")[1], len(ids))
        all_ids |= ids

    logger.info("Total unique programme IDs: %d", len(all_ids))
    if not all_ids:
        # Nothing to look up; avoid spinning up a thread pool for no work.
        return []

    # 2 — fetch metadata concurrently (sorted for a deterministic request order)
    with ThreadPoolExecutor(max_workers=10) as pool:
        results = list(pool.map(_metadata_for_pid, sorted(all_ids)))

    # Drop failed lookups (None) and entries without a title.
    concerts = [c for c in results if c and c["title"]]

    # "or ''" keeps the sort key a str even when an expiry is missing or
    # None; comparing None with str would raise TypeError and abort the run.
    concerts.sort(key=lambda c: c.get("expiry") or "", reverse=True)
    return concerts
|
|
|
|
|
|
async def get_all_concerts() -> list[dict]:
    """Return the full concert list, refreshing the cache when it is stale.

    The cached list is served while it is non-empty and younger than
    ``CACHE_TTL`` seconds. Otherwise ``_fetch_all_sync`` is run in the
    default executor so the event loop is not blocked. A refresh that
    returns no concerts leaves the previous cache contents (and timestamp)
    untouched, so stale data is preferred over an empty list.

    Returns:
        The list stored in the cache after the optional refresh.
    """
    now = time.time()
    if _cache["data"] and now - _cache["ts"] < CACHE_TTL:
        return _cache["data"]

    # get_running_loop() is the supported way to obtain the loop from inside
    # a coroutine; get_event_loop() is the legacy form.
    loop = asyncio.get_running_loop()
    data = await loop.run_in_executor(None, _fetch_all_sync)
    if data:
        _cache["data"] = data
        _cache["ts"] = now
    return _cache["data"]
|
|
|
|
|
|
async def fetch_concerts(page: int = 1, search: str = "", page_size: int = 24) -> dict:
    """Return one page of concerts, optionally filtered by a search string.

    Args:
        page: 1-based page number; values below 1 are treated as 1.
        search: Case-insensitive substring matched against each concert's
            title, subtitle and description. Empty means "no filter".
        page_size: Concerts per page; values below 1 are treated as 1.

    Returns:
        A dict with ``concerts`` (the page slice), ``total`` (number of
        matching concerts), ``page`` and ``page_size`` (the effective values
        after clamping) and ``pages`` (page count, at least 1).
    """
    # A non-positive page would yield a negative slice start (wrong or empty
    # results) and a zero page_size would divide by zero below.
    page = max(1, page)
    page_size = max(1, page_size)

    all_c = await get_all_concerts()
    filtered = all_c
    if search:
        q = search.lower()
        searched_fields = ("title", "subtitle", "description")
        filtered = [
            c for c in all_c
            # "or ''" guards against fields that are absent or None.
            if any(q in (c.get(field) or "").lower() for field in searched_fields)
        ]

    total = len(filtered)
    start = (page - 1) * page_size
    return {
        "concerts": filtered[start : start + page_size],
        "total": total,
        "page": page,
        "page_size": page_size,
        # Ceiling division, but never report fewer than one page.
        "pages": max(1, (total + page_size - 1) // page_size),
    }
|
|
|
|
|
|
async def invalidate_cache() -> int:
    """Expire the concert cache, reload it, and return the concert count.

    Returns:
        The length of the list returned by ``get_all_concerts()`` after the
        cache timestamp has been reset.
    """
    # A zero timestamp makes the cached list look older than CACHE_TTL.
    _cache.update(ts=0)
    concerts = await get_all_concerts()
    return len(concerts)
|