import re
from datetime import datetime

# Numeric date such as 12/31/2023 or 31-12-23. The field order (day/month)
# is not interpreted; the matched text is returned as-is.
_DATE_RE = re.compile(r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}')

# Words that mark one of the first lines of a receipt as the store name.
_STORE_KEYWORDS = ('supermarket', 'store', 'grocery', 'market', 'shop', 'saint', 'sainte')

# Words that mark a line as a header/footer rather than a purchased item.
_SKIP_WORDS = ('total', 'subtotal', 'payment', 'change', 'receipt', 'store')

# A non-negative number with an optional decimal part.
_NUMBER = r'\d+(?:\.\d+)?'

# Item line shapes, tried from most to least specific. Every pattern is
# anchored at both ends and requires an explicit separator between quantity
# and price, so a single number such as "2.50" can never be split into a
# quantity ("2.5") and a price ("0").
_ITEM_PATTERNS = (
    # "<name> <qty> x <price>" or "<name> <qty>*<price>"
    re.compile(
        rf'^(?P<name>.+?)\s+(?P<qty>{_NUMBER})\s*[x*]\s*(?P<price>{_NUMBER})$',
        re.IGNORECASE,
    ),
    # "<name> <qty> <price>"
    re.compile(rf'^(?P<name>.+?)\s+(?P<qty>{_NUMBER})\s+(?P<price>{_NUMBER})$'),
    # "<name> <price>" (quantity defaults to 1)
    re.compile(rf'^(?P<name>.+?)\s+(?P<price>{_NUMBER})$'),
)


def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF using pdfplumber.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The page texts joined with newlines, or an empty string when the
        file cannot be read or contains no extractable text.
    """
    # Imported here so the parsing helpers in this module remain importable
    # when the PDF dependency is not installed.
    import pdfplumber

    try:
        with pdfplumber.open(pdf_path) as pdf:
            # Join pages with a newline: a page's text is not guaranteed to
            # end with one, and without a separator the last line of a page
            # would fuse with the first line of the next page.
            return "\n".join(page.extract_text() or "" for page in pdf.pages)
    except Exception as e:
        # Best effort: report the problem and let the caller treat the
        # receipt as unreadable.
        print(f"Error extracting text from PDF: {e}")
        return ""


def _parse_item_line(line):
    """Return (product_name, quantity, price) for an item line, else None."""
    for pattern in _ITEM_PATTERNS:
        match = pattern.match(line)
        if match is None:
            continue
        fields = match.groupdict()
        # The price-only pattern has no quantity group.
        quantity = float(fields['qty']) if 'qty' in fields else 1.0
        return fields['name'].strip(), quantity, float(fields['price'])
    return None


def parse_receipt_text(text):
    """
    Parse receipt text to extract store name, date, and items.

    This is a basic line-oriented parser. It recognises item lines of the
    form "<name> <qty> x <price>", "<name> <qty> <price>" and
    "<name> <price>" (quantity 1).

    Args:
        text: Plain text of the receipt, one receipt line per text line.

    Returns:
        Tuple of (store_name, date, items_list), where:
          - store_name is the first of the first five lines containing a
            store keyword, or "Unknown Store";
          - date is the first numeric date found, exactly as written on the
            receipt (not normalised), or today's date as YYYY-MM-DD when the
            receipt has none;
          - items_list is a list of (product_name, quantity, price) tuples
            with quantity and price as floats.
    """
    date_match = _DATE_RE.search(text)
    date = date_match.group() if date_match else datetime.now().strftime('%Y-%m-%d')

    lines = [line.strip() for line in text.splitlines()]

    # The store name is expected near the top of the receipt.
    store_name = "Unknown Store"
    store_index = None
    for index, line in enumerate(lines[:5]):
        lowered = line.lower()
        if any(keyword in lowered for keyword in _STORE_KEYWORDS):
            store_name = line
            store_index = index
            break

    items = []
    for index, line in enumerate(lines):
        # The store-name line may end in a number (e.g. a branch number) and
        # must not be mistaken for an item.
        if not line or index == store_index:
            continue
        lowered = line.lower()
        if any(skip_word in lowered for skip_word in _SKIP_WORDS):
            continue
        item = _parse_item_line(line)
        if item is not None:
            items.append(item)

    return store_name, date, items