Improve: Better HTML text extraction (remove CSS, scripts, nav, footer)

2026-02-09 19:19:34 +01:00
parent d0ca96191c
commit 44a080cc13
1 changed file with 33 additions and 5 deletions
@@ -160,14 +160,42 @@ def fetch_url_content(url):
 # Extract clean text from HTML
 def extract_text_from_html(html):
    """Extract readable text from HTML"""
-    # Remove scripts and styles
-    text = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.IGNORECASE)
-    text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.DOTALL | re.IGNORECASE)
-    # Remove HTML tags
-    text = re.sub(r'<[^>]+>', ' ', text)
+    # Remove DOCTYPE, comments
+    text = re.sub(r'<!DOCTYPE[^>]*>', '', html, flags=re.IGNORECASE)
+    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)
+    
+    # Remove scripts, styles, noscript
+    text = re.sub(r'<script[^>]*>.*?</script>', ' ', text, flags=re.DOTALL | re.IGNORECASE)
+    text = re.sub(r'<style[^>]*>.*?</style>', ' ', text, flags=re.DOTALL | re.IGNORECASE)
+    text = re.sub(r'<noscript[^>]*>.*?</noscript>', ' ', text, flags=re.DOTALL | re.IGNORECASE)
+    
+    # Try to extract main content areas first
+    main_match = re.search(r'<(main|article|div[^>]*class="[^"]*content[^"]*"[^>]*)>.*?</\1>', text, flags=re.DOTALL | re.IGNORECASE)
+    if main_match:
+        text = main_match.group(0)
+    
+    # Remove common nav/footer/sidebar patterns
+    text = re.sub(r'<(nav|footer|aside|header)[^>]*>.*?</\1>', ' ', text, flags=re.DOTALL | re.IGNORECASE)
+    
+    # Remove remaining HTML tags but keep text
+    text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
+    text = re.sub(r'<p[^>]*>', '\n', text, flags=re.IGNORECASE)
+    text = re.sub(r'</?[^>]+>', ' ', text)
+    
+    # Decode HTML entities
+    text = text.replace('&nbsp;', ' ')
+    text = text.replace('&lt;', '<')
+    text = text.replace('&gt;', '>')
+    text = text.replace('&amp;', '&')
+    
    # Clean up whitespace
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
+    
+    # Return first meaningful chunk
+    if len(text) < 100:
+        return "(Content not accessible)"
+    
    return text[:2000]  # First 2000 chars of clean text

 # Analyze content with Haiku via gateway