feat: support .eml Picnic + correction fuzzy matching

Support .eml :
- pipeline._eml_to_html() extrait le HTML des emails Picnic
- Déposer un .eml dans inbox/picnic/ fonctionne comme un .html
- Pas de nouvelle dépendance (module email stdlib)
- 5 tests ajoutés (test_eml.py)

Correction fuzzy matching :
- Le score est maintenant calculé sur le nom seul (avant " | ")
- Évite que les différences de marque/poids pénalisent le score
- Résultat : 8 paires trouvées vs 0 avant la correction

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-25 18:23:57 +01:00
parent be4d4a7076
commit 8af474c928
3 changed files with 146 additions and 2 deletions
@@ -0,0 +1,107 @@
 """
 Tests de l'extraction HTML depuis les fichiers .eml (pipeline._eml_to_html).
 Stratégie : on construit des .eml synthétiques en mémoire (tmp_path)
 sans dépendre d'un vrai mail Picnic.
 """
 import pytest
 from pathlib import Path
 from tickettracker.pipeline import _eml_to_html
 # ---------------------------------------------------------------------------
 # Helpers pour construire des .eml de test
 # ---------------------------------------------------------------------------
 def _make_eml(tmp_path: Path, html: str, add_text_part: bool = True) -> Path:
    """Write a synthetic multipart/alternative .eml and return its path.

    The message always ends with a text/html part holding ``html``. A
    text/plain part is placed before it unless ``add_text_part`` is false.
    The file is created as ``ticket.eml`` inside ``tmp_path``.
    """
    marker = "BOUNDARY123"
    part_start = f"--{marker}"

    # Message headers, followed by the blank line that ends the header block.
    header_lines = [
        "MIME-Version: 1.0",
        f'Content-Type: multipart/alternative; boundary="{marker}"',
        "From: picnic@picnic.app",
        "Subject: Votre commande Picnic",
        "",
    ]
    # Optional plain-text alternative, emitted before the HTML part.
    plain_lines = (
        [
            part_start,
            "Content-Type: text/plain; charset=utf-8",
            "",
            "Version texte de l'email.",
            "",
        ]
        if add_text_part
        else []
    )
    # HTML part plus the closing boundary.
    html_lines = [
        part_start,
        "Content-Type: text/html; charset=utf-8",
        "",
        html,
        "",
        f"--{marker}--",
    ]

    eml_path = tmp_path / "ticket.eml"
    eml_path.write_text(
        "\n".join(header_lines + plain_lines + html_lines), encoding="utf-8"
    )
    return eml_path
 def _make_eml_no_html(tmp_path: Path) -> Path:
    """Write a multipart .eml whose only part is text/plain; return its path.

    The file is created as ``no_html.eml`` inside ``tmp_path`` and is used
    to exercise the "no HTML part" error path.
    """
    marker = "BOUNDARY456"
    eml_lines = (
        "MIME-Version: 1.0",
        f'Content-Type: multipart/alternative; boundary="{marker}"',
        "",
        f"--{marker}",
        "Content-Type: text/plain; charset=utf-8",
        "",
        "Texte seul, pas de HTML.",
        "",
        f"--{marker}--",
    )
    target = tmp_path / "no_html.eml"
    target.write_text("\n".join(eml_lines), encoding="utf-8")
    return target
 # ---------------------------------------------------------------------------
 # Tests
 # ---------------------------------------------------------------------------
 def test_eml_to_html_retourne_le_contenu_html(tmp_path):
    """The HTML body of a multipart .eml is extracted by _eml_to_html."""
    eml_path = _make_eml(
        tmp_path, "<html><body><p>Commande Picnic</p></body></html>"
    )
    extracted = _eml_to_html(eml_path)
    assert "Commande Picnic" in extracted
 def test_eml_to_html_contient_les_balises(tmp_path):
    """The returned HTML keeps its markup (tags are not stripped).

    The assertion requires the tag together with its text: checking for
    the tag OR the text would still pass if the markup had been removed.
    """
    html = "<html><body><h1>Titre</h1></body></html>"
    eml = _make_eml(tmp_path, html)
    result = _eml_to_html(eml)
    assert "<h1>Titre</h1>" in result
 def test_eml_to_html_retourne_str(tmp_path):
    """_eml_to_html returns a str."""
    eml_path = _make_eml(tmp_path, "<html><body>test</body></html>")
    extracted = _eml_to_html(eml_path)
    assert isinstance(extracted, str)
 def test_eml_to_html_sans_partie_texte(tmp_path):
    """Extraction also works when the .eml has an HTML part only."""
    eml_path = _make_eml(
        tmp_path,
        "<html><body><p>HTML only</p></body></html>",
        add_text_part=False,
    )
    extracted = _eml_to_html(eml_path)
    assert "HTML only" in extracted
 def test_eml_to_html_leve_valueerror_si_pas_de_html(tmp_path):
    """A .eml without any HTML part makes _eml_to_html raise ValueError."""
    # Build the fixture outside the ``raises`` block so that only the call
    # under test can satisfy the expectation.
    text_only_eml = _make_eml_no_html(tmp_path)
    with pytest.raises(ValueError, match="Aucune partie HTML"):
        _eml_to_html(text_only_eml)
@@ -51,12 +51,17 @@ def find_fuzzy_matches(
    ]
    # Produit cartésien filtré par seuil
    # On compare uniquement le nom (avant le premier " | ") pour éviter que
    # les différences de marque/quantité ("| MDD | 1kg" vs "| - | -") ne
    # pénalisent artificiellement le score.
    matches = []
    for p in picnic_names:
        p_name = p.split(" | ")[0].strip()
        for lec in leclerc_names:
            if p == lec:
                continue  # exact match déjà géré par get_compare_prices
-            score = fuzz.token_sort_ratio(p, lec)
+            lec_name = lec.split(" | ")[0].strip()
            score = fuzz.token_sort_ratio(p_name, lec_name)
            if score >= threshold:
                matches.append({"name_picnic": p, "name_leclerc": lec, "score": score})
@@ -10,7 +10,9 @@ Usage :
    inserted = import_receipt("samples/picnic_sample.html", source="picnic")
 """
 import email
 import logging
 from email import policy
 from pathlib import Path
 from tickettracker.db import schema, repository
@@ -95,6 +97,9 @@ def _parse(file_path: Path, source: str):
    """
    if source == "picnic":
        from tickettracker.parsers import picnic
        if file_path.suffix.lower() == ".eml":
            html_content = _eml_to_html(file_path)
        else:
            html_content = file_path.read_text(encoding="utf-8", errors="replace")
        return picnic.parse(html_content)
@@ -104,3 +109,30 @@ def _parse(file_path: Path, source: str):
    # Jamais atteint grâce à la validation en amont, mais satisfait mypy
    raise ValueError(f"Source inconnue : '{source}'")
 def _eml_to_html(file_path: Path) -> str:
    """Extract the HTML body from a .eml file (Picnic confirmation email).

    The file is parsed with the stdlib ``email`` package and its MIME parts
    are walked in order. The first ``text/html`` part that is not marked
    ``Content-Disposition: attachment`` is returned. If every HTML part is
    an attachment, the first of those is returned instead, so a file that
    was accepted before this preference existed is still accepted.

    Args:
        file_path: Path to the .eml file.

    Returns:
        The HTML content as a string.

    Raises:
        ValueError: If the .eml contains no text/html part at all.
    """
    raw = file_path.read_bytes()
    msg = email.message_from_bytes(raw, policy=policy.default)

    attached_html = None
    for part in msg.walk():
        if part.get_content_type() != "text/html":
            continue
        if not part.is_attachment():
            # Inline HTML: this is the message body we are looking for.
            return part.get_content()
        if attached_html is None:
            # Remember the first attached HTML file as a last resort only.
            attached_html = part

    if attached_html is not None:
        return attached_html.get_content()

    raise ValueError(
        f"Aucune partie HTML trouvée dans le fichier .eml : {file_path.name}"
    )