feat: add PDF parsing support using pdfplumber for text extraction and receipt data parsing

Co-authored-by: aider (openai/unsloth/Qwen3-Coder-Next) <aider@aider.chat>
This commit is contained in:
Your Name
2026-02-06 17:54:51 +01:00
parent 2474520514
commit 7016c3b3ec
2 changed files with 94 additions and 7 deletions

96
app.py
View File

@@ -1,9 +1,11 @@
import sqlite3 import sqlite3
from datetime import datetime from datetime import datetime
import os import os
import re
import discord import discord
from discord.ext import commands from discord.ext import commands
import io import io
import pdfplumber
DB_PATH = "grocery_receipts.db" DB_PATH = "grocery_receipts.db"
@@ -55,6 +57,81 @@ def init_db():
conn.commit() conn.commit()
conn.close() conn.close()
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file using pdfplumber.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages that produced text, joined with newlines so
        that the last line of one page never fuses with the first line of
        the next. Returns an empty string when no page has extractable
        text or when the file cannot be opened/read.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # Pages must be read while the PDF is still open. A page may
            # yield no text at all, hence the `or ""` guard.
            page_texts = [page.extract_text() or "" for page in pdf.pages]
    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller
        # treat the PDF as having no text.
        print(f"Error extracting text from PDF: {e}")
        return ""
    # Drop empty pages so a PDF without any text layer still returns "",
    # which callers use to detect "nothing extractable".
    return "\n".join(chunk for chunk in page_texts if chunk)
# Date as printed on the receipt: ISO "YYYY-MM-DD" first (so the year is not
# truncated), otherwise a 1-2 digit / 1-2 digit / 2-4 digit date using "/" or "-".
_RECEIPT_DATE_RE = re.compile(
    r'(?<!\d)(?:\d{4}-\d{1,2}-\d{1,2}|\d{1,2}[/-]\d{1,2}[/-]\d{2,4})(?!\d)'
)

# A non-negative number with an optional fractional part, e.g. "2" or "2.99".
_NUMBER = r'\d+(?:\.\d+)?'

# "<name> <qty> x <price>", "<name> <qty>*<price>" or "<name> <qty> <price>".
_QTY_PRICE_RE = re.compile(
    rf'^(.+?)\s+({_NUMBER})(?:\s*[x*]\s*|\s+)({_NUMBER})$', re.IGNORECASE
)

# "<name> <price>" with no explicit quantity.
_PRICE_ONLY_RE = re.compile(rf'^(.+?)\s+({_NUMBER})$')

# Substrings that mark one of the first lines as the store name.
_STORE_KEYWORDS = ('supermarket', 'store', 'grocery', 'market', 'shop', 'saint', 'sainte')

# Substrings that mark a line as a header/footer rather than an item.
_SKIP_WORDS = ('total', 'subtotal', 'payment', 'change', 'receipt', 'store')


def _parse_item_line(line):
    """Return (product_name, quantity, price) for one stripped line, or None."""
    match = _QTY_PRICE_RE.match(line)
    if match:
        name, quantity, price = match.groups()
        return name.strip(), float(quantity), float(price)
    match = _PRICE_ONLY_RE.match(line)
    if match:
        name, price = match.groups()
        # No quantity printed on the line: assume a single unit.
        return name.strip(), 1.0, float(price)
    return None


def parse_receipt_text(text):
    """Parse receipt text to extract the store name, date, and items.

    This is a basic, line-oriented parser:

    * The date is the first date-looking token in the text, returned exactly
      as written (it is not normalized); when none is found, today's date in
      ``YYYY-MM-DD`` form is used.
    * The store name is the first of the first five lines containing a
      store-like keyword, or ``"Unknown Store"``.
    * An item is a line that ends with ``<quantity> [x|*] <price>`` or with a
      single ``<price>`` (quantity then defaults to 1.0). Lines containing
      header/footer words such as "total" or "payment" are ignored.

    Args:
        text: Raw text of the receipt, lines separated by newlines.

    Returns:
        Tuple of (store_name, date, items_list), where items_list is a list
        of (product_name, quantity, price) tuples with float quantity/price.
    """
    date_match = _RECEIPT_DATE_RE.search(text)
    date = date_match.group() if date_match else datetime.now().strftime('%Y-%m-%d')

    lines = text.split('\n')

    store_name = "Unknown Store"
    for line in lines[:5]:  # The store name is expected near the top.
        lowered = line.lower()
        if any(keyword in lowered for keyword in _STORE_KEYWORDS):
            store_name = line.strip()
            break

    items = []
    for raw_line in lines:
        line = raw_line.strip()
        if not line:
            continue
        lowered = line.lower()
        if any(skip_word in lowered for skip_word in _SKIP_WORDS):
            continue
        # Patterns are anchored to the whole line, so a lone price such as
        # "2.99" is never split into a bogus quantity and price.
        item = _parse_item_line(line)
        if item is not None:
            items.append(item)

    return store_name, date, items
def add_receipt(store_name, date, items): def add_receipt(store_name, date, items):
""" """
Add a receipt to the database. Add a receipt to the database.
@@ -181,17 +258,26 @@ async def on_message(message):
# Download the PDF # Download the PDF
pdf_bytes = await attachment.read() pdf_bytes = await attachment.read()
# For now, we'll just acknowledge receipt and save the file
# In a full implementation, you'd extract text from the PDF using OCR
# Save the PDF to a receipts folder # Save the PDF to a receipts folder
os.makedirs('receipts', exist_ok=True) os.makedirs('receipts', exist_ok=True)
file_path = os.path.join('receipts', attachment.filename) file_path = os.path.join('receipts', attachment.filename)
with open(file_path, 'wb') as f: with open(file_path, 'wb') as f:
f.write(pdf_bytes) f.write(pdf_bytes)
# Send confirmation message # Extract text and parse the receipt
await message.channel.send(f"Receipt '{attachment.filename}' received and saved!") try:
text = extract_text_from_pdf(file_path)
if text:
store_name, date, items = parse_receipt_text(text)
if items:
add_receipt(store_name, date, items)
await message.channel.send(f"Receipt '{attachment.filename}' processed! Found {len(items)} items.")
else:
await message.channel.send(f"Receipt '{attachment.filename}' saved but couldn't parse items. Please check the format.")
else:
await message.channel.send(f"Could not extract text from '{attachment.filename}'. Is it a text-based PDF?")
except Exception as e:
await message.channel.send(f"Error processing receipt: {str(e)}")
@bot.command(name='add_receipt') @bot.command(name='add_receipt')
async def add_receipt_command(ctx, store_name: str, date: str, *, items: str): async def add_receipt_command(ctx, store_name: str, date: str, *, items: str):

View File

@@ -1,6 +1,7 @@
# Core dependencies for the grocery receipt tracker # Core dependencies for the grocery receipt tracker
discord.py>=2.0.0 discord.py>=2.0.0
# For PDF processing (optional, for future OCR implementation) # PDF processing
# PyPDF2 pdfplumber>=0.9.0
# For image processing (optional, for scanned receipts)
# pytesseract # pytesseract
# pillow # pillow