feat: add PDF parsing support using pdfplumber for text extraction and receipt data parsing

Co-authored-by: aider (openai/unsloth/Qwen3-Coder-Next) <aider@aider.chat>
2026-02-06 17:54:51 +01:00
parent 2474520514
commit 7016c3b3ec
2 changed files with 94 additions and 7 deletions
@@ -1,9 +1,11 @@
 import sqlite3
 from datetime import datetime
 import os
+import re
 import discord
 from discord.ext import commands
 import io
+import pdfplumber

 DB_PATH = "grocery_receipts.db"

@@ -55,6 +57,81 @@ def init_db():
    conn.commit()
    conn.close()

+def extract_text_from_pdf(pdf_path):
+    """Extract the text of every page of a PDF file using pdfplumber.
+
+    Args:
+        pdf_path: Path of the PDF file to read.
+
+    Returns:
+        The text of all pages that produced text, joined by newlines.
+        Returns "" when no page produced text or when the file could not
+        be opened or parsed (the error is printed, not raised).
+    """
+    try:
+        with pdfplumber.open(pdf_path) as pdf:
+            # `or ""` guards against pages for which extract_text() yields
+            # nothing (e.g. a page without a text layer).
+            page_texts = [page.extract_text() or "" for page in pdf.pages]
+    except Exception as e:
+        # Deliberately best-effort: callers treat "" as "no text available".
+        print(f"Error extracting text from PDF: {e}")
+        return ""
+    # Join pages with a newline so the last line of one page is not fused
+    # with the first line of the next; empty pages are dropped so a PDF
+    # without any text still yields "" rather than bare newlines.
+    return "\n".join(page_text for page_text in page_texts if page_text)
+
+# Substrings (matched case-insensitively) that mark a line as a header or
+# footer rather than a purchased item.
+_SKIP_WORDS = ('total', 'subtotal', 'payment', 'change', 'receipt', 'store')
+# Substrings that suggest one of the first lines is the store name.
+_STORE_KEYWORDS = ('supermarket', 'store', 'grocery', 'market', 'shop', 'saint', 'sainte')
+_DATE_RE = re.compile(r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}')
+_NUMBER = r'\d+(?:\.\d+)?'
+# "<name> <qty> x <price>", "<name> <qty>*<price>" or "<name> <qty> <price>".
+# The separator between quantity and price is mandatory: without it a lone
+# price such as "2.50" is split by backtracking into quantity 2.5 / price 0.
+_QTY_PRICE_RE = re.compile(
+    rf'(.+?)\s+({_NUMBER})(?:\s*[x*]\s*|\s+)({_NUMBER})', re.IGNORECASE)
+# "<name> <price>": the whole line must be a name followed by one number.
+_PRICE_ONLY_RE = re.compile(rf'(.+?)\s+({_NUMBER})')
+
+
+def _parse_item_line(line):
+    """Return (product_name, quantity, price) for one stripped line, or None."""
+    match = _QTY_PRICE_RE.match(line)
+    if match:
+        return match.group(1).strip(), float(match.group(2)), float(match.group(3))
+    match = _PRICE_ONLY_RE.fullmatch(line)
+    if match:
+        # No quantity on the line: assume a single unit.
+        return match.group(1).strip(), 1.0, float(match.group(2))
+    return None
+
+
+def parse_receipt_text(text):
+    """
+    Parse receipt text to extract store name, date, and items.
+    This is a basic heuristic parser that can be improved with more
+    sophisticated logic.
+
+    Args:
+        text: Receipt text with one receipt line per text line.
+
+    Returns:
+        Tuple of (store_name, date, items_list):
+        - store_name: the first of the first five lines containing a store
+          keyword, stripped, or "Unknown Store" if none does.
+        - date: the first d/m/y-style match exactly as written in the text
+          (not normalised), or today's date as YYYY-MM-DD if none is found.
+        - items_list: list of (product_name, quantity, price) tuples with
+          float quantity and price; quantity is 1.0 when the line only
+          carries a price.
+    """
+    date_match = _DATE_RE.search(text)
+    date = date_match.group() if date_match else datetime.now().strftime('%Y-%m-%d')
+
+    lines = text.split('\n')
+    # Only the first few lines are considered for the store name.
+    store_name = next(
+        (line.strip() for line in lines[:5]
+         if any(keyword in line.lower() for keyword in _STORE_KEYWORDS)),
+        "Unknown Store",
+    )
+
+    items = []
+    for raw_line in lines:
+        line = raw_line.strip()
+        lowered = line.lower()
+        # Skip empty lines and likely headers/footers; this applies to both
+        # item forms so that e.g. "TOTAL 12.50" is never stored as an item.
+        if not line or any(skip_word in lowered for skip_word in _SKIP_WORDS):
+            continue
+        item = _parse_item_line(line)
+        if item is not None:
+            items.append(item)
+
+    return store_name, date, items
+
 def add_receipt(store_name, date, items):
    """
    Add a receipt to the database.
@@ -181,17 +258,26 @@ async def on_message(message):
                # Download the PDF
                pdf_bytes = await attachment.read()
                
-                # For now, we'll just acknowledge receipt and save the file
-                # In a full implementation, you'd extract text from the PDF using OCR
-                
                # Save the PDF to a receipts folder
                os.makedirs('receipts', exist_ok=True)
                file_path = os.path.join('receipts', attachment.filename)
                with open(file_path, 'wb') as f:
                    f.write(pdf_bytes)
                
-                # Send confirmation message
-                await message.channel.send(f"Receipt '{attachment.filename}' received and saved!")
+                # Extract text and parse the receipt
+                try:
+                    text = extract_text_from_pdf(file_path)
+                    if text:
+                        store_name, date, items = parse_receipt_text(text)
+                        if items:
+                            add_receipt(store_name, date, items)
+                            await message.channel.send(f"Receipt '{attachment.filename}' processed! Found {len(items)} items.")
+                        else:
+                            await message.channel.send(f"Receipt '{attachment.filename}' saved but couldn't parse items. Please check the format.")
+                    else:
+                        await message.channel.send(f"Could not extract text from '{attachment.filename}'. Is it a text-based PDF?")
+                except Exception as e:
+                    await message.channel.send(f"Error processing receipt: {str(e)}")

@bot.command(name='add_receipt')
 async def add_receipt_command(ctx, store_name: str, date: str, *, items: str):
@@ -1,6 +1,7 @@
 # Core dependencies for the grocery receipt tracker
 discord.py>=2.0.0
-# For PDF processing (optional, for future OCR implementation)
-# PyPDF2
+# PDF processing
+pdfplumber>=0.9.0
+# For image processing (optional, for scanned receipts)
 # pytesseract
 # pillow