feat: recherche hybride — cache local + API search Arte
Docker / docker (push) Successful in 1m18s

Lors d'une recherche, complète les résultats du cache local (qui peut
être incomplet, ex : rock-pop chargé côté client) en interrogeant la
page de recherche d'Arte, récupère les IDs absents du cache et les
résout via le player API en parallèle.
Permet de trouver n'importe quel concert présent sur arte-concert.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
dev
2026-04-25 18:53:17 +02:00
parent 47ba12b8ec
commit ca3ab37f19
+56 -30
View File
@@ -5,6 +5,7 @@ import asyncio
import urllib.request import urllib.request
import json import json
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from urllib.parse import quote_plus
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -12,16 +13,17 @@ CACHE_TTL = 6 * 3600
_cache: dict = {"data": [], "ts": 0} _cache: dict = {"data": [], "ts": 0}
PLAYER_API = "https://api.arte.tv/api/player/v2/config/fr/{pid}" PLAYER_API = "https://api.arte.tv/api/player/v2/config/fr/{pid}"
SEARCH_URL = "https://www.arte.tv/fr/search/?q={q}"
GENRE_PAGES = [ GENRE_PAGES = [
("classique", "https://www.arte.tv/fr/arte-concert/classique/"), "https://www.arte.tv/fr/arte-concert/classique/",
("jazz", "https://www.arte.tv/fr/arte-concert/jazz/"), "https://www.arte.tv/fr/arte-concert/jazz/",
("rock-pop", "https://www.arte.tv/fr/arte-concert/rock-pop/"), "https://www.arte.tv/fr/arte-concert/rock-pop/",
("opéra", "https://www.arte.tv/fr/arte-concert/opera/"), "https://www.arte.tv/fr/arte-concert/opera/",
("musique du monde","https://www.arte.tv/fr/arte-concert/musique-du-monde/"), "https://www.arte.tv/fr/arte-concert/musique-du-monde/",
("électronique", "https://www.arte.tv/fr/arte-concert/electronica/"), "https://www.arte.tv/fr/arte-concert/electronica/",
("agenda", "https://www.arte.tv/fr/arte-concert/agenda/"), "https://www.arte.tv/fr/arte-concert/agenda/",
("", "https://www.arte.tv/fr/arte-concert/"), "https://www.arte.tv/fr/arte-concert/",
] ]
_HEADERS = { _HEADERS = {
@@ -43,8 +45,7 @@ def _fetch_url(url: str, headers: dict | None = None) -> str:
def _prog_ids_from_page(url: str) -> set[str]: def _prog_ids_from_page(url: str) -> set[str]:
try: try:
html = _fetch_url(url) return set(_PROG_RE.findall(_fetch_url(url)))
return set(_PROG_RE.findall(html))
except Exception as ex: except Exception as ex:
logger.warning("Failed to fetch %s: %s", url, ex) logger.warning("Failed to fetch %s: %s", url, ex)
return set() return set()
@@ -68,13 +69,13 @@ def _metadata_for_pid(pid: str) -> dict | None:
return { return {
"id": pid, "id": pid,
"title": meta.get("title", ""), "title": meta.get("title") or "",
"subtitle": meta.get("subtitle", ""), "subtitle": meta.get("subtitle") or "",
"url": url, "url": url,
"thumbnail": thumbnail, "thumbnail": thumbnail,
"duration": duration_s, "duration": duration_s,
"description": meta.get("description", "") or "", "description": meta.get("description") or "",
"expiry": rights.get("end", ""), "expiry": rights.get("end") or "",
} }
except Exception as ex: except Exception as ex:
logger.debug("Failed to get metadata for %s: %s", pid, ex) logger.debug("Failed to get metadata for %s: %s", pid, ex)
@@ -82,33 +83,41 @@ def _metadata_for_pid(pid: str) -> dict | None:
def _fetch_all_sync() -> list[dict]: def _fetch_all_sync() -> list[dict]:
# 1 — collect programme IDs across all genre pages
all_ids: set[str] = set() all_ids: set[str] = set()
for _genre, url in GENRE_PAGES: for url in GENRE_PAGES:
ids = _prog_ids_from_page(url) ids = _prog_ids_from_page(url)
logger.info(" %s%d IDs", url.split("/fr/")[1], len(ids)) logger.info(" %s%d IDs", url.split("/fr/")[1], len(ids))
all_ids |= ids all_ids |= ids
logger.info("Total unique programme IDs: %d", len(all_ids)) logger.info("Total unique programme IDs: %d", len(all_ids))
# 2 — fetch metadata concurrently concerts = _resolve_ids(all_ids)
concerts: list[dict] = [] concerts.sort(key=lambda c: c.get("expiry") or "", reverse=True)
with ThreadPoolExecutor(max_workers=10) as pool:
results = list(pool.map(_metadata_for_pid, sorted(all_ids)))
for c in results:
if c and c["title"]:
concerts.append(c)
concerts.sort(key=lambda c: c.get("expiry", ""), reverse=True)
return concerts return concerts
def _resolve_ids(ids: set[str], exclude: set[str] | None = None) -> list[dict]:
to_fetch = ids - (exclude or set())
with ThreadPoolExecutor(max_workers=10) as pool:
results = list(pool.map(_metadata_for_pid, sorted(to_fetch)))
return [c for c in results if c and c.get("title")]
def _search_sync(query: str) -> set[str]:
    """Return the programme IDs found on Arte's search page for *query*.

    The page is retrieved through ``_fetch_url`` and scanned with
    ``_PROG_RE``. This is a synchronous helper; the async caller runs it
    in an executor.

    Args:
        query: Free-text search terms. They are URL-encoded with
            ``quote_plus`` before being inserted into ``SEARCH_URL``.

    Returns:
        The set of matched programme IDs. Empty when the query is blank
        or when fetching/parsing the page raises.
    """
    if not query or not query.strip():
        # A blank query carries no search terms; skip the remote request.
        return set()
    url = SEARCH_URL.format(q=quote_plus(query))
    try:
        return set(_PROG_RE.findall(_fetch_url(url)))
    except Exception as ex:
        # Best-effort lookup: a failed remote search must not break the
        # caller, which still has its locally cached results.
        logger.warning("Search failed for %r: %s", query, ex)
        return set()
# ── public API ────────────────────────────────────────────────────────────────
async def get_all_concerts() -> list[dict]: async def get_all_concerts() -> list[dict]:
now = time.time() now = time.time()
if _cache["data"] and now - _cache["ts"] < CACHE_TTL: if _cache["data"] and now - _cache["ts"] < CACHE_TTL:
return _cache["data"] return _cache["data"]
loop = asyncio.get_event_loop() loop = asyncio.get_event_loop()
data = await loop.run_in_executor(None, _fetch_all_sync) data = await loop.run_in_executor(None, _fetch_all_sync)
if data: if data:
@@ -119,15 +128,32 @@ async def get_all_concerts() -> list[dict]:
async def fetch_concerts(page: int = 1, search: str = "", page_size: int = 24) -> dict: async def fetch_concerts(page: int = 1, search: str = "", page_size: int = 24) -> dict:
all_c = await get_all_concerts() all_c = await get_all_concerts()
filtered = all_c cached_ids = {c["id"] for c in all_c}
if search: if search:
q = search.lower() q = search.lower()
filtered = [ # local filter
local = [
c for c in all_c c for c in all_c
if q in (c.get("title") or "").lower() if q in (c.get("title") or "").lower()
or q in (c.get("subtitle") or "").lower() or q in (c.get("subtitle") or "").lower()
or q in (c.get("description") or "").lower() or q in (c.get("description") or "").lower()
] ]
# Arte search for IDs not in cache
loop = asyncio.get_event_loop()
remote_ids = await loop.run_in_executor(None, _search_sync, search)
new_ids = remote_ids - cached_ids
if new_ids:
extra = await loop.run_in_executor(None, _resolve_ids, new_ids, None)
# merge: local results first, then extras not already present
local_ids = {c["id"] for c in local}
for c in extra:
if c["id"] not in local_ids:
local.append(c)
filtered = local
else:
filtered = all_c
start = (page - 1) * page_size start = (page - 1) * page_size
return { return {
"concerts": filtered[start : start + page_size], "concerts": filtered[start : start + page_size],