From ca3ab37f199a77ac43ae45eb36655d1b54cb8fb4 Mon Sep 17 00:00:00 2001
From: dev
Date: Sat, 25 Apr 2026 18:53:17 +0200
Subject: [PATCH] =?UTF-8?q?feat:=20recherche=20hybride=20=E2=80=94=20cache?=
 =?UTF-8?q?=20local=20+=20API=20search=20Arte?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Quand la recherche ne trouve rien en local (ex: rock-pop chargé côté client),
interroge l'API search d'Arte, récupère les IDs manquants et les résout
via le player API en parallèle. Permet de trouver n'importe quel concert
présent sur arte-concert.

Co-Authored-By: Claude Sonnet 4.6
---
 arte_api.py | 86 ++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 56 insertions(+), 30 deletions(-)

diff --git a/arte_api.py b/arte_api.py
index 9559f54..0f7141c 100644
--- a/arte_api.py
+++ b/arte_api.py
@@ -5,6 +5,7 @@ import asyncio
 import urllib.request
 import json
 from concurrent.futures import ThreadPoolExecutor
+from urllib.parse import quote_plus
 
 logger = logging.getLogger(__name__)
 
@@ -12,16 +13,17 @@ CACHE_TTL = 6 * 3600
 _cache: dict = {"data": [], "ts": 0}
 
 PLAYER_API = "https://api.arte.tv/api/player/v2/config/fr/{pid}"
+SEARCH_URL = "https://www.arte.tv/fr/search/?q={q}"
 
 GENRE_PAGES = [
-    ("classique",       "https://www.arte.tv/fr/arte-concert/classique/"),
-    ("jazz",            "https://www.arte.tv/fr/arte-concert/jazz/"),
-    ("rock-pop",        "https://www.arte.tv/fr/arte-concert/rock-pop/"),
-    ("opéra",           "https://www.arte.tv/fr/arte-concert/opera/"),
-    ("musique du monde","https://www.arte.tv/fr/arte-concert/musique-du-monde/"),
-    ("électronique",    "https://www.arte.tv/fr/arte-concert/electronica/"),
-    ("agenda",          "https://www.arte.tv/fr/arte-concert/agenda/"),
-    ("",                "https://www.arte.tv/fr/arte-concert/"),
+    "https://www.arte.tv/fr/arte-concert/classique/",
+    "https://www.arte.tv/fr/arte-concert/jazz/",
+    "https://www.arte.tv/fr/arte-concert/rock-pop/",
+    "https://www.arte.tv/fr/arte-concert/opera/",
+    "https://www.arte.tv/fr/arte-concert/musique-du-monde/",
+    "https://www.arte.tv/fr/arte-concert/electronica/",
+    "https://www.arte.tv/fr/arte-concert/agenda/",
+    "https://www.arte.tv/fr/arte-concert/",
 ]
 
 _HEADERS = {
@@ -43,8 +45,7 @@ def _fetch_url(url: str, headers: dict | None = None) -> str:
 
 def _prog_ids_from_page(url: str) -> set[str]:
     try:
-        html = _fetch_url(url)
-        return set(_PROG_RE.findall(html))
+        return set(_PROG_RE.findall(_fetch_url(url)))
     except Exception as ex:
         logger.warning("Failed to fetch %s: %s", url, ex)
         return set()
@@ -68,13 +69,13 @@ def _metadata_for_pid(pid: str) -> dict | None:
 
         return {
             "id": pid,
-            "title": meta.get("title", ""),
-            "subtitle": meta.get("subtitle", ""),
+            "title": meta.get("title") or "",
+            "subtitle": meta.get("subtitle") or "",
             "url": url,
             "thumbnail": thumbnail,
             "duration": duration_s,
-            "description": meta.get("description", "") or "",
-            "expiry": rights.get("end", ""),
+            "description": meta.get("description") or "",
+            "expiry": rights.get("end") or "",
         }
     except Exception as ex:
         logger.debug("Failed to get metadata for %s: %s", pid, ex)
@@ -82,33 +83,41 @@ def _metadata_for_pid(pid: str) -> dict | None:
 
 
 def _fetch_all_sync() -> list[dict]:
-    # 1 — collect programme IDs across all genre pages
     all_ids: set[str] = set()
-    for _genre, url in GENRE_PAGES:
+    for url in GENRE_PAGES:
         ids = _prog_ids_from_page(url)
         logger.info(" %s → %d IDs", url.split("/fr/")[1], len(ids))
         all_ids |= ids
-
     logger.info("Total unique programme IDs: %d", len(all_ids))
 
-    # 2 — fetch metadata concurrently
-    concerts: list[dict] = []
-    with ThreadPoolExecutor(max_workers=10) as pool:
-        results = list(pool.map(_metadata_for_pid, sorted(all_ids)))
-
-    for c in results:
-        if c and c["title"]:
-            concerts.append(c)
-
-    concerts.sort(key=lambda c: c.get("expiry", ""), reverse=True)
+    concerts = _resolve_ids(all_ids)
+    concerts.sort(key=lambda c: c.get("expiry") or "", reverse=True)
     return concerts
 
 
+def _resolve_ids(ids: set[str], exclude: set[str] | None = None) -> list[dict]:
+    to_fetch = ids - (exclude or set())
+    with ThreadPoolExecutor(max_workers=10) as pool:
+        results = list(pool.map(_metadata_for_pid, sorted(to_fetch)))
+    return [c for c in results if c and c.get("title")]
+
+
+def _search_sync(query: str) -> set[str]:
+    url = SEARCH_URL.format(q=quote_plus(query))
+    try:
+        html = _fetch_url(url)
+        return set(_PROG_RE.findall(html))
+    except Exception as ex:
+        logger.warning("Search failed for %r: %s", query, ex)
+        return set()
+
+
+# ── public API ────────────────────────────────────────────────────────────────
+
 async def get_all_concerts() -> list[dict]:
     now = time.time()
     if _cache["data"] and now - _cache["ts"] < CACHE_TTL:
         return _cache["data"]
-
     loop = asyncio.get_event_loop()
     data = await loop.run_in_executor(None, _fetch_all_sync)
     if data:
@@ -119,15 +128,32 @@ async def get_all_concerts() -> list[dict]:
 
 async def fetch_concerts(page: int = 1, search: str = "", page_size: int = 24) -> dict:
     all_c = await get_all_concerts()
-    filtered = all_c
+    cached_ids = {c["id"] for c in all_c}
+
     if search:
         q = search.lower()
-        filtered = [
+        # local filter
+        local = [
             c for c in all_c
             if q in (c.get("title") or "").lower()
             or q in (c.get("subtitle") or "").lower()
             or q in (c.get("description") or "").lower()
         ]
+        # Arte search for IDs not in cache
+        loop = asyncio.get_event_loop()
+        remote_ids = await loop.run_in_executor(None, _search_sync, search)
+        new_ids = remote_ids - cached_ids
+        if new_ids:
+            extra = await loop.run_in_executor(None, _resolve_ids, new_ids, None)
+            # merge: local results first, then extras not already present
+            local_ids = {c["id"] for c in local}
+            for c in extra:
+                if c["id"] not in local_ids:
+                    local.append(c)
+        filtered = local
+    else:
+        filtered = all_c
+
     start = (page - 1) * page_size
     return {
         "concerts": filtered[start : start + page_size],