feat: add PDF parsing support using pdfplumber for text extraction and receipt data parsing
Co-authored-by: aider (openai/unsloth/Qwen3-Coder-Next) <aider@aider.chat>
This commit is contained in:
96
app.py
96
app.py
@@ -1,9 +1,11 @@
|
|||||||
import sqlite3
|
import sqlite3
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import discord
|
import discord
|
||||||
from discord.ext import commands
|
from discord.ext import commands
|
||||||
import io
|
import io
|
||||||
|
import pdfplumber
|
||||||
|
|
||||||
DB_PATH = "grocery_receipts.db"
|
DB_PATH = "grocery_receipts.db"
|
||||||
|
|
||||||
@@ -55,6 +57,81 @@ def init_db():
|
|||||||
conn.commit()
|
conn.commit()
|
||||||
conn.close()
|
conn.close()
|
||||||
|
|
||||||
|
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file using pdfplumber.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages that produced text, joined with newlines so
        that the last line of one page never fuses with the first line of
        the next. Returns "" when no page yields text or when the file
        cannot be opened or read; in the failure case the error is printed
        rather than raised, so callers only need to check for an empty
        string.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # `or ""` guards against pages for which extract_text() yields
            # nothing (e.g. pages without a text layer).
            page_texts = [page.extract_text() or "" for page in pdf.pages]
    except Exception as e:
        # Best-effort boundary: any failure is reported and mapped to "".
        print(f"Error extracting text from PDF: {e}")
        return ""

    # Keep page boundaries as line boundaries; the receipt parser works
    # line by line and would otherwise see two lines glued together.
    return "\n".join(page_text for page_text in page_texts if page_text)
|
||||||
|
|
||||||
|
def parse_receipt_text(text):
    """
    Parse receipt text to extract store name, date, and items.

    This is a basic, heuristic line-by-line parser that can be improved with
    more sophisticated logic.

    Args:
        text: Plain text of a receipt, one printed receipt line per line.

    Returns:
        Tuple of (store_name, date, items_list):
        - store_name: the first of the top five lines that contains a store
          keyword, stripped, or "Unknown Store" when none does.
        - date: the first date-like substring (d/m/y or d-m-y style) exactly
          as it appears in the text, or today's date formatted YYYY-MM-DD
          when no date is found. The matched date is NOT normalised.
        - items_list: list of (product_name, quantity, price) tuples with
          float quantity and price. Lines of the form "name qty x price",
          "name qty * price" or "name qty price" supply the quantity; lines
          of the form "name price" get quantity 1.0.
    """
    store_keywords = ('supermarket', 'store', 'grocery', 'market', 'shop', 'saint', 'sainte')
    skip_words = ('total', 'subtotal', 'payment', 'change', 'receipt', 'store')

    # Extract date (looking for common date patterns)
    date_match = re.search(r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}', text)
    date = date_match.group() if date_match else datetime.now().strftime('%Y-%m-%d')

    # Try to extract store name (first few lines containing a store keyword)
    lines = text.splitlines()
    store_name = "Unknown Store"
    for line in lines[:5]:
        lowered = line.lower()
        if any(keyword in lowered for keyword in store_keywords):
            store_name = line.strip()
            break

    # A number is an integer or a decimal; the fractional part is atomic so
    # the regex engine cannot split "2.99" into "2.9" and "9".
    number = r'\d+(?:\.\d+)?'
    # "name qty x price" / "name qty * price" / "name qty price": a real
    # separator (x, * or whitespace) is required between the two numbers.
    quantity_price_line = re.compile(
        rf'(.+?)\s+({number})(?:\s*[x*]\s*|\s+)({number})', re.IGNORECASE
    )
    # "name price": quantity is assumed to be 1.
    price_only_line = re.compile(rf'(.+?)\s+({number})')

    items = []
    for raw_line in lines:
        line = raw_line.strip()
        if not line:
            continue
        # Skip lines that are likely headers/footers rather than products.
        lowered = line.lower()
        if any(skip_word in lowered for skip_word in skip_words):
            continue

        # fullmatch anchors both ends, so dates, times and phone numbers
        # embedded in a line are not mistaken for quantity/price pairs.
        match = quantity_price_line.fullmatch(line)
        if match:
            items.append(
                (match.group(1).strip(), float(match.group(2)), float(match.group(3)))
            )
            continue

        match = price_only_line.fullmatch(line)
        if match:
            # Assume quantity 1 if not specified
            items.append((match.group(1).strip(), 1.0, float(match.group(2))))

    return store_name, date, items
|
||||||
|
|
||||||
def add_receipt(store_name, date, items):
|
def add_receipt(store_name, date, items):
|
||||||
"""
|
"""
|
||||||
Add a receipt to the database.
|
Add a receipt to the database.
|
||||||
@@ -181,17 +258,26 @@ async def on_message(message):
|
|||||||
# Download the PDF
|
# Download the PDF
|
||||||
pdf_bytes = await attachment.read()
|
pdf_bytes = await attachment.read()
|
||||||
|
|
||||||
# For now, we'll just acknowledge receipt and save the file
|
|
||||||
# In a full implementation, you'd extract text from the PDF using OCR
|
|
||||||
|
|
||||||
# Save the PDF to a receipts folder
|
# Save the PDF to a receipts folder
|
||||||
os.makedirs('receipts', exist_ok=True)
|
os.makedirs('receipts', exist_ok=True)
|
||||||
file_path = os.path.join('receipts', attachment.filename)
|
file_path = os.path.join('receipts', attachment.filename)
|
||||||
with open(file_path, 'wb') as f:
|
with open(file_path, 'wb') as f:
|
||||||
f.write(pdf_bytes)
|
f.write(pdf_bytes)
|
||||||
|
|
||||||
# Send confirmation message
|
# Extract text and parse the receipt
|
||||||
await message.channel.send(f"Receipt '{attachment.filename}' received and saved!")
|
try:
|
||||||
|
text = extract_text_from_pdf(file_path)
|
||||||
|
if text:
|
||||||
|
store_name, date, items = parse_receipt_text(text)
|
||||||
|
if items:
|
||||||
|
add_receipt(store_name, date, items)
|
||||||
|
await message.channel.send(f"Receipt '{attachment.filename}' processed! Found {len(items)} items.")
|
||||||
|
else:
|
||||||
|
await message.channel.send(f"Receipt '{attachment.filename}' saved but couldn't parse items. Please check the format.")
|
||||||
|
else:
|
||||||
|
await message.channel.send(f"Could not extract text from '{attachment.filename}'. Is it a text-based PDF?")
|
||||||
|
except Exception as e:
|
||||||
|
await message.channel.send(f"Error processing receipt: {str(e)}")
|
||||||
|
|
||||||
@bot.command(name='add_receipt')
|
@bot.command(name='add_receipt')
|
||||||
async def add_receipt_command(ctx, store_name: str, date: str, *, items: str):
|
async def add_receipt_command(ctx, store_name: str, date: str, *, items: str):
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
# Core dependencies for the grocery receipt tracker
|
# Core dependencies for the grocery receipt tracker
|
||||||
discord.py>=2.0.0
|
discord.py>=2.0.0
|
||||||
# For PDF processing (optional, for future OCR implementation)
|
# PDF processing
|
||||||
# PyPDF2
|
pdfplumber>=0.9.0
|
||||||
|
# For image processing (optional, for scanned receipts)
|
||||||
# pytesseract
|
# pytesseract
|
||||||
# pillow
|
# pillow
|
||||||
|
|||||||
Reference in New Issue
Block a user