def _balanced_element(text, start, tag):
    """Return the ``<tag>`` element that opens at *start*, or None if unclosed.

    Nested elements with the same tag name are depth-counted so that the
    returned span ends at the matching close tag rather than the first one.
    """
    depth = 0
    token_re = re.compile(r'<(/?)%s\b[^>]*>' % re.escape(tag), re.IGNORECASE)
    for token in token_re.finditer(text, start):
        if token.group(1):
            depth -= 1
            if depth <= 0:
                return text[start:token.end()]
        elif not token.group(0).endswith('/>'):
            # Self-closing forms (<div/>) neither open nor close a level.
            depth += 1
    return None


def _find_main_region(text):
    """Return the first <main>, <article> or <div class="...content..."> element.

    Returns None when no such element exists or none of them is closed.
    """
    opener_re = re.compile(
        r'<(?:(main|article)\b[^>]*'
        r'|(div)\b[^>]*\sclass\s*=\s*(["\'])[^"\']*content[^"\']*\3[^>]*)>',
        re.IGNORECASE,
    )
    for opener in opener_re.finditer(text):
        tag = opener.group(1) or opener.group(2)
        region = _balanced_element(text, opener.start(), tag)
        if region is not None:
            return region
    return None


def _strip_markup(fragment):
    """Drop nav/footer/aside/header elements and all tags; return plain text."""
    # Local import: the public function's parameter is named ``html`` and
    # would shadow the module there.
    from html import unescape

    fragment = re.sub(
        r'<(nav|footer|aside|header)\b[^>]*>.*?</\1\s*>',
        ' ',
        fragment,
        flags=re.DOTALL | re.IGNORECASE,
    )
    # Every remaining tag becomes a space so adjacent words do not fuse.
    fragment = re.sub(r'<[^>]+>', ' ', fragment)
    # Decode entities only after tags are gone, so a decoded '<' is never
    # mistaken for markup.
    fragment = unescape(fragment)
    # \s also matches the non-breaking space produced by &nbsp;.
    return re.sub(r'\s+', ' ', fragment).strip()


# Extract clean text from HTML
def extract_text_from_html(html, *, min_chars=100, max_chars=2000):
    """Extract readable text from an HTML document.

    Comments, the DOCTYPE, and script/style/noscript elements are removed.
    If the page has a <main>, <article> or <div class="...content..."> element,
    its text is preferred; when that region yields fewer than *min_chars*
    characters the whole document is used instead. Navigation, footer, aside
    and header elements are dropped, remaining tags are stripped, HTML
    entities are decoded and whitespace is collapsed to single spaces.

    Args:
        html: HTML source as a string. ``None`` or an empty string is
            treated as inaccessible content.
        min_chars: Minimum length of cleaned text considered meaningful.
        max_chars: Maximum number of characters returned.

    Returns:
        Up to *max_chars* characters of cleaned text, or the placeholder
        ``"(Content not accessible)"`` when less than *min_chars* of text
        could be extracted.
    """
    placeholder = "(Content not accessible)"
    if not html:
        return placeholder

    # Remove DOCTYPE and comments (comments may contain '>' and would
    # otherwise survive the generic tag strip).
    text = re.sub(r'<!DOCTYPE[^>]*>', '', html, flags=re.IGNORECASE)
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)

    # Remove elements whose bodies are never readable text.
    for tag in ('script', 'style', 'noscript'):
        text = re.sub(
            r'<%s\b[^>]*>.*?</%s\s*>' % (tag, tag),
            ' ',
            text,
            flags=re.DOTALL | re.IGNORECASE,
        )

    # Prefer the main content area, but only if it holds enough text.
    region = _find_main_region(text)
    if region is not None:
        cleaned = _strip_markup(region)
        if len(cleaned) >= min_chars:
            return cleaned[:max_chars]

    cleaned = _strip_markup(text)
    if len(cleaned) < min_chars:
        return placeholder
    return cleaned[:max_chars]