From ca3ab37f199a77ac43ae45eb36655d1b54cb8fb4 Mon Sep 17 00:00:00 2001
From: dev <dev@remuxarr>
Date: Sat, 25 Apr 2026 18:53:17 +0200
Subject: [PATCH] =?UTF-8?q?feat:=20recherche=20hybride=20=E2=80=94=20cache?=
 =?UTF-8?q?=20local=20+=20API=20search=20Arte?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Lors d'une recherche, en plus du filtre sur le cache local (qui peut
être incomplet, ex: rock-pop chargé côté client), interroge la
recherche d'Arte, récupère les IDs absents du cache et les résout
via le player API en parallèle.
Permet de trouver n'importe quel concert présent sur arte-concert.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 arte_api.py | 86 ++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 56 insertions(+), 30 deletions(-)

diff --git a/arte_api.py b/arte_api.py
index 9559f54..0f7141c 100644
--- a/arte_api.py
+++ b/arte_api.py
@@ -5,6 +5,7 @@ import asyncio
 import urllib.request
 import json
 from concurrent.futures import ThreadPoolExecutor
+from urllib.parse import quote_plus
 
 logger = logging.getLogger(__name__)
 
@@ -12,16 +13,17 @@ CACHE_TTL = 6 * 3600
 _cache: dict = {"data": [], "ts": 0}
 
 PLAYER_API = "https://api.arte.tv/api/player/v2/config/fr/{pid}"
+SEARCH_URL = "https://www.arte.tv/fr/search/?q={q}"
 
 GENRE_PAGES = [
-    ("classique",       "https://www.arte.tv/fr/arte-concert/classique/"),
-    ("jazz",            "https://www.arte.tv/fr/arte-concert/jazz/"),
-    ("rock-pop",        "https://www.arte.tv/fr/arte-concert/rock-pop/"),
-    ("opéra",          "https://www.arte.tv/fr/arte-concert/opera/"),
-    ("musique du monde","https://www.arte.tv/fr/arte-concert/musique-du-monde/"),
-    ("électronique",   "https://www.arte.tv/fr/arte-concert/electronica/"),
-    ("agenda",          "https://www.arte.tv/fr/arte-concert/agenda/"),
-    ("",                "https://www.arte.tv/fr/arte-concert/"),
+    "https://www.arte.tv/fr/arte-concert/classique/",
+    "https://www.arte.tv/fr/arte-concert/jazz/",
+    "https://www.arte.tv/fr/arte-concert/rock-pop/",
+    "https://www.arte.tv/fr/arte-concert/opera/",
+    "https://www.arte.tv/fr/arte-concert/musique-du-monde/",
+    "https://www.arte.tv/fr/arte-concert/electronica/",
+    "https://www.arte.tv/fr/arte-concert/agenda/",
+    "https://www.arte.tv/fr/arte-concert/",
 ]
 
 _HEADERS = {
@@ -43,8 +45,7 @@ def _fetch_url(url: str, headers: dict | None = None) -> str:
 
 def _prog_ids_from_page(url: str) -> set[str]:
     try:
-        html = _fetch_url(url)
-        return set(_PROG_RE.findall(html))
+        return set(_PROG_RE.findall(_fetch_url(url)))
     except Exception as ex:
         logger.warning("Failed to fetch %s: %s", url, ex)
         return set()
@@ -68,13 +69,13 @@ def _metadata_for_pid(pid: str) -> dict | None:
 
         return {
             "id": pid,
-            "title": meta.get("title", ""),
-            "subtitle": meta.get("subtitle", ""),
+            "title": meta.get("title") or "",
+            "subtitle": meta.get("subtitle") or "",
             "url": url,
             "thumbnail": thumbnail,
             "duration": duration_s,
-            "description": meta.get("description", "") or "",
-            "expiry": rights.get("end", ""),
+            "description": meta.get("description") or "",
+            "expiry": rights.get("end") or "",
         }
     except Exception as ex:
         logger.debug("Failed to get metadata for %s: %s", pid, ex)
@@ -82,33 +83,41 @@ def _metadata_for_pid(pid: str) -> dict | None:
 
 
 def _fetch_all_sync() -> list[dict]:
-    # 1 — collect programme IDs across all genre pages
     all_ids: set[str] = set()
-    for _genre, url in GENRE_PAGES:
+    for url in GENRE_PAGES:
         ids = _prog_ids_from_page(url)
         logger.info("  %s → %d IDs", url.split("/fr/")[1], len(ids))
         all_ids |= ids
-
     logger.info("Total unique programme IDs: %d", len(all_ids))
 
-    # 2 — fetch metadata concurrently
-    concerts: list[dict] = []
-    with ThreadPoolExecutor(max_workers=10) as pool:
-        results = list(pool.map(_metadata_for_pid, sorted(all_ids)))
-
-    for c in results:
-        if c and c["title"]:
-            concerts.append(c)
-
-    concerts.sort(key=lambda c: c.get("expiry", ""), reverse=True)
+    concerts = _resolve_ids(all_ids)
+    concerts.sort(key=lambda c: c.get("expiry") or "", reverse=True)
     return concerts
 
 
+def _resolve_ids(ids: set[str], exclude: set[str] | None = None) -> list[dict]:
+    """Fetch metadata for ``ids - exclude`` concurrently; keep only titled results."""
+    wanted = sorted(ids - (exclude or set()))
+    with ThreadPoolExecutor(max_workers=10) as pool:
+        return [m for m in pool.map(_metadata_for_pid, wanted) if m and m.get("title")]
+
+
+def _search_sync(query: str) -> set[str]:
+    """Return programme IDs scraped from Arte's search page; empty set on error."""
+    search_url = SEARCH_URL.format(q=quote_plus(query))
+    try:
+        return set(_PROG_RE.findall(_fetch_url(search_url)))
+    except Exception as ex:
+        logger.warning("Search failed for %r: %s", query, ex)
+        return set()
+
+
+# ── public API ────────────────────────────────────────────────────────────────
+
 async def get_all_concerts() -> list[dict]:
     now = time.time()
     if _cache["data"] and now - _cache["ts"] < CACHE_TTL:
         return _cache["data"]
-
     loop = asyncio.get_event_loop()
     data = await loop.run_in_executor(None, _fetch_all_sync)
     if data:
@@ -119,15 +128,32 @@ async def get_all_concerts() -> list[dict]:
 
 async def fetch_concerts(page: int = 1, search: str = "", page_size: int = 24) -> dict:
     all_c = await get_all_concerts()
-    filtered = all_c
+    cached_ids = {c["id"] for c in all_c}
+
     if search:
         q = search.lower()
-        filtered = [
+        # local filter
+        local = [
             c for c in all_c
             if q in (c.get("title") or "").lower()
             or q in (c.get("subtitle") or "").lower()
             or q in (c.get("description") or "").lower()
         ]
+        # Arte search for IDs not in cache
+        loop = asyncio.get_event_loop()
+        remote_ids = await loop.run_in_executor(None, _search_sync, search)
+        new_ids = remote_ids - cached_ids
+        if new_ids:
+            extra = await loop.run_in_executor(None, _resolve_ids, new_ids, None)
+            # merge: local results first, then extras not already present
+            local_ids = {c["id"] for c in local}
+            for c in extra:
+                if c["id"] not in local_ids:
+                    local.append(c)
+        filtered = local
+    else:
+        filtered = all_c
+
     start = (page - 1) * page_size
     return {
         "concerts": filtered[start : start + page_size],