From 44a080cc1323565dd5c7561b0b7c4f03d42f264e Mon Sep 17 00:00:00 2001
From: Remora <remora@dilain.com>
Date: Mon, 9 Feb 2026 19:19:34 +0100
Subject: [PATCH] Improve: Better HTML text extraction (remove CSS, scripts,
 nav, footer)

---
 bot.py | 38 +++++++++++++++++++++++++++++++++-----
 1 file changed, 33 insertions(+), 5 deletions(-)
diff --git a/bot.py b/bot.py
index 9ee5800..e71d05b 100644
--- a/bot.py
+++ b/bot.py
@@ -160,14 +160,42 @@ def fetch_url_content(url):
 # Extract clean text from HTML
 def extract_text_from_html(html):
     """Extract readable text from HTML"""
-    # Remove scripts and styles
-    text = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.IGNORECASE)
-    text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.DOTALL | re.IGNORECASE)
-    # Remove HTML tags
-    text = re.sub(r'<[^>]+>', ' ', text)
+    # Nothing fetched (None or empty) is reported like an unreadable page
+    if not html:
+        return "(Content not accessible)"
+
+    # Remove DOCTYPE, comments
+    text = re.sub(r'<!DOCTYPE[^>]*>', '', html, flags=re.IGNORECASE)
+    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)
+
+    # Remove scripts, styles, noscript together with their bodies
+    text = re.sub(r'<(script|style|noscript)\b[^>]*>.*?</\1\s*>', ' ', text, flags=re.DOTALL | re.IGNORECASE)
+
+    # Try to extract main content areas first (capturing only the tag name lets </\1> match)
+    main_match = re.search(r'<(main|article)\b[^>]*>.*?</\1\s*>', text, flags=re.DOTALL | re.IGNORECASE)
+    if not main_match:  # content divs nest, so this one runs greedily to the last </div>
+        main_match = re.search(r'<div\b[^>]*class="[^"]*content[^"]*"[^>]*>.*</div\s*>', text, flags=re.DOTALL | re.IGNORECASE)
+    if main_match:
+        text = main_match.group(0)
+
+    # Remove common nav/footer/sidebar patterns (\b keeps look-alike custom tags intact)
+    text = re.sub(r'<(nav|footer|aside|header)\b[^>]*>.*?</\1\s*>', ' ', text, flags=re.DOTALL | re.IGNORECASE)
+
+    # Remove remaining HTML tags but keep text (every tag becomes a word break)
+    text = re.sub(r'</?[^>]+>', ' ', text)
+
+    # Decode all HTML entities after tag removal (by-name import: `html` param shadows the module)
+    from html import unescape
+    text = unescape(text)
+
     # Clean up whitespace
     text = re.sub(r'\s+', ' ', text)
     text = text.strip()
+
+    # Return first meaningful chunk; under 100 chars is treated as an unreadable page
+    if len(text) < 100:
+        return "(Content not accessible)"
+
     return text[:2000]  # First 2000 chars of clean text
 
 # Analyze content with Haiku via gateway