diff --git a/tickettracker/db/repository.py b/tickettracker/db/repository.py index 34ce784..9db0c18 100644 --- a/tickettracker/db/repository.py +++ b/tickettracker/db/repository.py @@ -140,7 +140,7 @@ def fetch_unnormalized( ) -> list[sqlite3.Row]: """Retourne les articles dont name_normalized est NULL. - Chaque Row expose les clés : id, name_raw, receipt_id. + Chaque Row expose les clés : id, name_raw, unit, receipt_id. Trié par id pour un traitement reproductible. Args: @@ -150,7 +150,7 @@ def fetch_unnormalized( Returns: Liste de sqlite3.Row. """ - sql = "SELECT id, name_raw, receipt_id FROM items WHERE name_normalized IS NULL ORDER BY id" + sql = "SELECT id, name_raw, unit, receipt_id FROM items WHERE name_normalized IS NULL ORDER BY id" if limit is not None: sql += f" LIMIT {int(limit)}" return conn.execute(sql).fetchall() diff --git a/tickettracker/llm/normalizer.py b/tickettracker/llm/normalizer.py index 41192bc..57a27e8 100644 --- a/tickettracker/llm/normalizer.py +++ b/tickettracker/llm/normalizer.py @@ -229,7 +229,13 @@ def normalize_all_in_db( for start in range(0, total, batch_size): batch = items[start: start + batch_size] - raw_names = [row["name_raw"] for row in batch] + # On inclut l'unité/poids (ex: "250 g", "20 sachets") dans le nom + # envoyé au LLM pour qu'il puisse le placer dans le champ format. + # Pour les articles sans unité (Leclerc OCR), unit est None ou "". + raw_names = [ + f"{row['name_raw']} {row['unit']}".strip() if row["unit"] else row["name_raw"] + for row in batch + ] # --- Tentative batch --- try: @@ -246,7 +252,7 @@ def normalize_all_in_db( # tente le fallback un par un if all(r is None for r in results): logger.debug("Fallback unitaire pour le batch %d–%d.", start, start + len(batch)) - results = [normalize_product_name(name) for name in raw_names] + results = [normalize_product_name(name) for name in raw_names] # raw_names contient déjà l'unité # --- Mise à jour ou affichage --- for item, normalized in zip(batch, results):