From 1d8f139c7c1efb87356d0e76c9110c67208b0f54 Mon Sep 17 00:00:00 2001 From: laurent Date: Wed, 25 Feb 2026 18:35:46 +0100 Subject: [PATCH] =?UTF-8?q?feat:=20inclure=20l'unit=C3=A9/poids=20dans=20l?= =?UTF-8?q?a=20normalisation=20LLM?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fetch_unnormalized() remonte maintenant la colonne `unit` (ex: "250 g", "20 sachets"). Le normaliseur concatène name_raw + unit avant d'envoyer au LLM, qui peut ainsi placer le poids dans le champ format. Résultat : "Haribo dragibus" → "Dragibus | Haribo | 250g" au lieu de "Haribo dragibus" → "Dragibus | Haribo | -" Améliore aussi la qualité du fuzzy matching Picnic ↔ Leclerc. Co-Authored-By: Claude Sonnet 4.6 --- tickettracker/db/repository.py | 4 ++-- tickettracker/llm/normalizer.py | 10 ++++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/tickettracker/db/repository.py b/tickettracker/db/repository.py index 34ce784..9db0c18 100644 --- a/tickettracker/db/repository.py +++ b/tickettracker/db/repository.py @@ -140,7 +140,7 @@ def fetch_unnormalized( ) -> list[sqlite3.Row]: """Retourne les articles dont name_normalized est NULL. - Chaque Row expose les clés : id, name_raw, receipt_id. + Chaque Row expose les clés : id, name_raw, unit, receipt_id. Trié par id pour un traitement reproductible. Args: @@ -150,7 +150,7 @@ def fetch_unnormalized( Returns: Liste de sqlite3.Row. """ - sql = "SELECT id, name_raw, receipt_id FROM items WHERE name_normalized IS NULL ORDER BY id" + sql = "SELECT id, name_raw, unit, receipt_id FROM items WHERE name_normalized IS NULL ORDER BY id" if limit is not None: sql += f" LIMIT {int(limit)}" return conn.execute(sql).fetchall() diff --git a/tickettracker/llm/normalizer.py b/tickettracker/llm/normalizer.py index 41192bc..57a27e8 100644 --- a/tickettracker/llm/normalizer.py +++ b/tickettracker/llm/normalizer.py @@ -229,7 +229,13 @@ def normalize_all_in_db( for start in range(0, total, batch_size): batch = items[start: start + batch_size] - raw_names = [row["name_raw"] for row in batch] + # On inclut l'unité/poids (ex: "250 g", "20 sachets") dans le nom + # envoyé au LLM pour qu'il puisse le placer dans le champ format. + # Pour les articles sans unité (Leclerc OCR), unit est None ou "". + raw_names = [ + f"{row['name_raw']} {row['unit']}".strip() if row["unit"] else row["name_raw"] + for row in batch + ] # --- Tentative batch --- try: @@ -246,7 +252,7 @@ def normalize_all_in_db( # tente le fallback un par un if all(r is None for r in results): logger.debug("Fallback unitaire pour le batch %d–%d.", start, start + len(batch)) - results = [normalize_product_name(name) for name in raw_names] + results = [normalize_product_name(name) for name in raw_names] # raw_names contient déjà l'unité # --- Mise à jour ou affichage --- for item, normalized in zip(batch, results):