Improve: Better HTML text extraction (remove CSS, scripts, nav, footer)

2026-02-09 19:19:34 +01:00
parent d0ca96191c
commit 44a080cc13
1 changed file with 33 additions and 5 deletions
@@ -160,14 +160,42 @@ def fetch_url_content(url):
 # Extract clean text from HTML
 def extract_text_from_html(html):
     """Extract readable plain text from an HTML document.

     This is a regex-based heuristic pipeline, not a full HTML parser:

     1. Drop the DOCTYPE, comments and <script>/<style>/<noscript> blocks.
     2. Narrow to the first <main>/<article> element or, failing that, the
        first <div> whose class attribute contains "content".
     3. Drop <nav>/<footer>/<aside>/<header> blocks, then every remaining tag.
     4. Decode HTML entities and collapse whitespace runs to single spaces.

     Args:
         html: HTML markup as a string. None or "" is treated as no content.

     Returns:
         At most the first 2000 characters of the cleaned text, or the
         placeholder "(Content not accessible)" when fewer than 100
         characters of text remain.
     """
     # Imported by name inside the function: the ``html`` parameter shadows
     # the ``html`` module, so ``html.unescape`` cannot be referenced here.
     from html import unescape

     if not html:
         return "(Content not accessible)"

     block_flags = re.DOTALL | re.IGNORECASE

     # Comments go first because they may contain '>' and would otherwise
     # confuse the generic tag stripping below.
     text = re.sub(r'<!DOCTYPE[^>]*>', '', html, flags=re.IGNORECASE)
     text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)

     # Non-content blocks are removed together with their bodies. ``\b`` stops
     # a tag name from matching a longer name that merely starts with it.
     text = re.sub(r'<(script|style|noscript)\b[^>]*>.*?</\1\s*>', ' ', text,
                   flags=block_flags)

     # Prefer the main content area when one can be identified.
     region = _extract_main_region(text)
     if region is not None:
         text = region

     # Remove page chrome that may still sit inside the selected region.
     text = re.sub(r'<(nav|footer|aside|header)\b[^>]*>.*?</\1\s*>', ' ', text,
                   flags=block_flags)

     # Every remaining tag becomes a space. <br> and <p> need no special
     # handling because all whitespace is collapsed to single spaces below.
     text = re.sub(r'</?[^>]+>', ' ', text)

     # Decode entities only after tags are stripped, so an escaped "&lt;b&gt;"
     # stays literal text. "&nbsp;" decodes to U+00A0, which ``\s`` matches.
     text = unescape(text)
     text = re.sub(r'\s+', ' ', text).strip()

     if len(text) < 100:
         return "(Content not accessible)"
     return text[:2000]  # First 2000 chars of clean text


 def _extract_main_region(markup):
     """Return the markup of the main-content container, or None if absent.

     The first <main> or <article> element wins (attributes are allowed).
     Otherwise the first <div> whose class attribute contains "content" is
     used; nested <div> tags are matched by depth so the region ends at that
     div's own closing tag rather than at the first inner </div>.
     """
     semantic = re.search(r'<(main|article)\b[^>]*>.*?</\1\s*>', markup,
                          flags=re.DOTALL | re.IGNORECASE)
     if semantic:
         return semantic.group(0)

     opener = re.search(
         r'<div\b[^>]*\sclass\s*=\s*["\'][^"\'>]*content[^>]*>',
         markup,
         flags=re.IGNORECASE,
     )
     if opener is None:
         return None

     depth = 1
     div_tag = re.compile(r'<(/?)div\b[^>]*>', re.IGNORECASE)
     for tag in div_tag.finditer(markup, opener.end()):
         depth += -1 if tag.group(1) else 1
         if depth == 0:
             return markup[opener.start():tag.end()]
     # Unbalanced markup: let the caller fall back to the whole document.
     return None
 # Analyze content with Haiku via gateway