feat: fuzzy matching Picnic ↔ Leclerc + page /matches dans le dashboard

Nouvelle table product_matches (status : pending/validated/rejected). Matching via RapidFuzz token_sort_ratio, seuil configurable (défaut 85 %). Workflow : 1. python -m tickettracker.cli match [--threshold 85] → calcule et stocke les paires candidates ; 2. http://localhost:8000/matches → l'utilisateur valide ou rejette chaque paire ; 3. la comparaison de prix est enrichie avec les paires validées. Nouvelles dépendances : rapidfuzz, watchdog (requirements.txt). 10 tests ajoutés (test_matcher.py), tous passent. Suite complète : 129 passent, 1 xfail, 0 échec. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-25 18:02:48 +01:00
parent f360332626
commit be4d4a7076
13 changed files with 804 additions and 23 deletions
--- a/tickettracker/web/queries.py
+++ b/tickettracker/web/queries.py
@@ -83,13 +83,18 @@ def get_monthly_spending(conn: sqlite3.Connection) -> list[dict]:
 def get_compare_prices(conn: sqlite3.Connection) -> list[dict]:
    """Comparaison de prix entre Picnic et Leclerc pour les produits communs.

-    Utilise la vue price_history. Ne retourne que les produits présents
-    dans les deux enseignes. Trié par écart décroissant (le plus cher en premier).
+    Combine deux sources :
+      - Correspondances exactes (même name_normalized dans les deux enseignes)
+      - Correspondances fuzzy validées dans product_matches (status='validated')
+
+    Les doublons éventuels (un produit déjà en exact ET en fuzzy) sont éliminés
+    en excluant des paires fuzzy tout nom Picnic déjà présent en exact (NOT IN).

    Returns:
-        Liste de dicts {name, price_picnic, price_leclerc, diff, diff_pct}.
-        diff    = price_leclerc - price_picnic  (positif = Leclerc plus cher)
-        diff_pct = diff / MIN(price_picnic, price_leclerc) * 100
+        Liste de dicts {name, price_picnic, price_leclerc, diff, diff_pct, match_type}.
+        diff       = price_leclerc - price_picnic  (positif = Leclerc plus cher)
+        diff_pct   = diff / MIN(price_picnic, price_leclerc) * 100
+        match_type = 'exact' ou 'fuzzy'
    """
    rows = conn.execute(
        """
@@ -101,32 +106,67 @@ def get_compare_prices(conn: sqlite3.Connection) -> list[dict]:
            FROM price_history
            WHERE name_normalized IS NOT NULL
            GROUP BY name_normalized, store
+        ),
+        exact_matches AS (
+            SELECT
+                a.name_normalized                             AS name,
+                a.name_normalized                             AS name_display,
+                a.avg_price                                   AS price_picnic,
+                b.avg_price                                   AS price_leclerc,
+                ROUND(b.avg_price - a.avg_price, 2)           AS diff,
+                ROUND(
+                    (b.avg_price - a.avg_price)
+                    / MIN(a.avg_price, b.avg_price) * 100
+                , 1)                                          AS diff_pct,
+                'exact'                                       AS match_type
+            FROM avg_by_store a
+            JOIN avg_by_store b
+              ON a.name_normalized = b.name_normalized
+             AND a.store = 'picnic'
+             AND b.store = 'leclerc'
+        ),
+        fuzzy_matches AS (
+            SELECT
+                pm.name_picnic                                AS name,
+                pm.name_picnic || ' ≈ ' || pm.name_leclerc  AS name_display,
+                ap_p.avg_price                                AS price_picnic,
+                ap_l.avg_price                                AS price_leclerc,
+                ROUND(ap_l.avg_price - ap_p.avg_price, 2)    AS diff,
+                ROUND(
+                    (ap_l.avg_price - ap_p.avg_price)
+                    / MIN(ap_p.avg_price, ap_l.avg_price) * 100
+                , 1)                                          AS diff_pct,
+                'fuzzy'                                       AS match_type
+            FROM product_matches pm
+            JOIN avg_by_store ap_p
+              ON ap_p.name_normalized = pm.name_picnic  AND ap_p.store = 'picnic'
+            JOIN avg_by_store ap_l
+              ON ap_l.name_normalized = pm.name_leclerc AND ap_l.store = 'leclerc'
+            WHERE pm.status = 'validated'
+              -- Exclure si déjà présent en exact match
+              AND pm.name_picnic NOT IN (SELECT name FROM exact_matches)
        )
-        SELECT
-            a.name_normalized                             AS name,
-            a.avg_price                                   AS price_picnic,
-            b.avg_price                                   AS price_leclerc,
-            ROUND(b.avg_price - a.avg_price, 2)           AS diff,
-            ROUND(
-                (b.avg_price - a.avg_price)
-                / MIN(a.avg_price, b.avg_price) * 100
-            , 1)                                          AS diff_pct
-        FROM avg_by_store a
-        JOIN avg_by_store b
-          ON a.name_normalized = b.name_normalized
-         AND a.store = 'picnic'
-         AND b.store = 'leclerc'
-        ORDER BY ABS(b.avg_price - a.avg_price) DESC
+        SELECT name, name_display, price_picnic, price_leclerc, diff, diff_pct, match_type
+        FROM (
+            SELECT name, name_display, price_picnic, price_leclerc, diff, diff_pct, match_type
+            FROM exact_matches
+            UNION ALL
+            SELECT name, name_display, price_picnic, price_leclerc, diff, diff_pct, match_type
+            FROM fuzzy_matches
+        )
+        ORDER BY ABS(diff) DESC
        """
    ).fetchall()

    return [
        {
            "name": r["name"],
+            "name_display": r["name_display"],
            "price_picnic": r["price_picnic"],
            "price_leclerc": r["price_leclerc"],
            "diff": r["diff"],
            "diff_pct": r["diff_pct"],
+            "match_type": r["match_type"],
        }
        for r in rows
    ]
@@ -279,6 +319,91 @@ def get_receipt_detail(conn: sqlite3.Connection, receipt_id: int) -> dict | None
    }


+def get_pending_matches(conn: sqlite3.Connection) -> list[dict]:
+    """Return pending product pairs with each store's average unit price.
+
+    Each average only uses price_history rows whose store AND normalized name
+    belong to that side of the pair; a name that exists in both stores would
+    otherwise leak into the other store's average.
+
+    Args:
+        conn: SQLite connection; rows must be addressable by column name.
+
+    Returns:
+        List of dicts {id, name_picnic, price_picnic, name_leclerc, price_leclerc,
+        score}, highest score first. A price is None when no row matches.
+    """
+    rows = conn.execute(
+        """
+        SELECT
+            pm.id,
+            pm.name_picnic,
+            pm.name_leclerc,
+            pm.score,
+            ROUND(AVG(CASE WHEN ph.store='picnic'  THEN ph.unit_price END), 2) AS price_picnic,
+            ROUND(AVG(CASE WHEN ph.store='leclerc' THEN ph.unit_price END), 2) AS price_leclerc
+        FROM product_matches pm
+        LEFT JOIN price_history ph
+               ON (ph.store = 'picnic'  AND ph.name_normalized = pm.name_picnic)
+               OR (ph.store = 'leclerc' AND ph.name_normalized = pm.name_leclerc)
+        WHERE pm.status = 'pending'
+        GROUP BY pm.id
+        ORDER BY pm.score DESC
+        """
+    ).fetchall()
+
+    # Every output key is also a column alias of the query above.
+    keys = ("id", "name_picnic", "name_leclerc", "score",
+            "price_picnic", "price_leclerc")
+    return [{key: r[key] for key in keys} for r in rows]
+
+
+def get_validated_matches(conn: sqlite3.Connection) -> list[dict]:
+    """List validated Picnic/Leclerc pairs with their average prices.
+
+    Pairs lacking an average price on either side are left out by the inner
+    joins. Results are ordered by absolute price gap, largest first.
+
+    Args:
+        conn: SQLite connection; rows must be addressable by column name.
+
+    Returns:
+        List of dicts {name_picnic, name_leclerc, price_picnic, price_leclerc,
+        diff, diff_pct} where diff = price_leclerc - price_picnic and diff_pct
+        is diff relative to the cheaper of the two prices, in percent.
+    """
+    cursor = conn.execute(
+        """
+        WITH avg_prices AS (
+            SELECT name_normalized, store, ROUND(AVG(unit_price), 2) AS avg_price
+            FROM price_history
+            WHERE name_normalized IS NOT NULL
+            GROUP BY name_normalized, store
+        )
+        SELECT
+            pm.id,
+            pm.name_picnic,
+            pm.name_leclerc,
+            ap_p.avg_price                              AS price_picnic,
+            ap_l.avg_price                              AS price_leclerc,
+            ROUND(ap_l.avg_price - ap_p.avg_price, 2)  AS diff,
+            ROUND(
+                (ap_l.avg_price - ap_p.avg_price)
+                / MIN(ap_p.avg_price, ap_l.avg_price) * 100
+            , 1)                                        AS diff_pct
+        FROM product_matches pm
+        JOIN avg_prices ap_p ON ap_p.name_normalized = pm.name_picnic  AND ap_p.store = 'picnic'
+        JOIN avg_prices ap_l ON ap_l.name_normalized = pm.name_leclerc AND ap_l.store = 'leclerc'
+        WHERE pm.status = 'validated'
+        ORDER BY ABS(ap_l.avg_price - ap_p.avg_price) DESC
+        """
+    )
+
+    columns = ("name_picnic", "name_leclerc", "price_picnic",
+               "price_leclerc", "diff", "diff_pct")
+    return [{column: record[column] for column in columns} for record in cursor]
+
+
 def get_product_list(conn: sqlite3.Connection) -> list[str]:
    """Liste tous les noms normalisés distincts (non NULL) pour le sélecteur.