diff --git a/bot.py b/bot.py
index 9ee5800..e71d05b 100644
--- a/bot.py
+++ b/bot.py
@@ -160,14 +160,42 @@ def fetch_url_content(url):
# Extract clean text from HTML
def extract_text_from_html(html):
    """Extract readable text from an HTML document.

    Strips DOCTYPE/comments, script/style/noscript elements, and common
    boilerplate (nav/footer/aside/header), tries to narrow down to the main
    content area, removes remaining tags, decodes a few common HTML
    entities, and collapses whitespace.

    Returns at most the first 2000 characters of clean text, or the
    placeholder string "(Content not accessible)" when fewer than 100
    characters of text remain (usually a blocked or script-rendered page).
    """
    # Remove DOCTYPE declarations and HTML comments
    text = re.sub(r'<!DOCTYPE[^>]*>', '', html, flags=re.IGNORECASE)
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)

    # Remove scripts, styles, noscript (tags AND their content)
    text = re.sub(r'<script[^>]*>.*?</script>', ' ', text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r'<style[^>]*>.*?</style>', ' ', text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r'<noscript[^>]*>.*?</noscript>', ' ', text, flags=re.DOTALL | re.IGNORECASE)

    # Try to extract the main content area first. Two separate searches:
    # a backreference to a group that also captured attributes (e.g.
    # <div class="...content...">) could never match the closing tag, so
    # the tag name must be matched on its own.
    main_match = re.search(r'<(main|article)\b[^>]*>.*?</\1>', text,
                           flags=re.DOTALL | re.IGNORECASE)
    if not main_match:
        main_match = re.search(r'<div[^>]*class="[^"]*content[^"]*"[^>]*>.*?</div>',
                               text, flags=re.DOTALL | re.IGNORECASE)
    if main_match:
        text = main_match.group(0)

    # Remove common nav/footer/sidebar patterns; \b keeps the [^>]* from
    # swallowing unrelated tag names that merely start with "nav" etc.
    text = re.sub(r'<(nav|footer|aside|header)\b[^>]*>.*?</\1>', ' ', text,
                  flags=re.DOTALL | re.IGNORECASE)

    # Remove remaining HTML tags but keep text; <br> and <p> boundaries
    # become newlines so paragraph breaks survive as whitespace.
    text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
    text = re.sub(r'</?p[^>]*>', '\n', text, flags=re.IGNORECASE)
    text = re.sub(r'</?[^>]+>', ' ', text)

    # Decode the most common HTML entities; &amp; must come last so that
    # e.g. "&amp;lt;" is not double-decoded into "<".
    text = text.replace('&nbsp;', ' ')
    text = text.replace('&lt;', '<')
    text = text.replace('&gt;', '>')
    text = text.replace('&amp;', '&')

    # Clean up whitespace
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()

    # Return first meaningful chunk
    if len(text) < 100:
        return "(Content not accessible)"
    return text[:2000]  # First 2000 chars of clean text