Improve: Better HTML text extraction (remove CSS, scripts, nav, footer)

This commit is contained in:
Remora
2026-02-09 19:19:34 +01:00
parent d0ca96191c
commit 44a080cc13

38
bot.py
View File

@@ -160,14 +160,42 @@ def fetch_url_content(url):
# Extract clean text from HTML # Extract clean text from HTML
def _find_main_region(markup):
    """Return the markup of the page's primary content container, or None.

    Preference order:
      1. the first ``<main>`` or ``<article>`` element (attributes allowed);
      2. the first ``<div>`` whose double-quoted ``class`` attribute contains
         the substring ``content``, matched up to its *balanced* closing tag.
    """
    semantic = re.search(
        r'<(main|article)\b[^>]*>.*?</\1\s*>',
        markup,
        flags=re.DOTALL | re.IGNORECASE,
    )
    if semantic:
        return semantic.group(0)

    opener = re.search(
        r'<div\b[^>]*\bclass="[^"]*content[^"]*"[^>]*>',
        markup,
        flags=re.IGNORECASE,
    )
    if opener is None:
        return None

    # A regex cannot pair nested <div>s, and a non-greedy ".*?</div>" would
    # stop at the first inner </div>. Count open/close tags instead.
    div_tag = re.compile(r'<(/?)div\b[^>]*>', re.IGNORECASE)
    depth = 0
    for tag in div_tag.finditer(markup, opener.start()):
        if tag.group(1):
            depth -= 1
            if depth == 0:
                return markup[opener.start():tag.end()]
        else:
            depth += 1
    # Unbalanced markup: keep everything from the opening tag onwards.
    return markup[opener.start():]


def _markup_to_text(markup):
    """Strip page chrome and tags from *markup* and return normalized text."""
    from html import unescape  # local import: callers' `html` names are strings

    # Drop navigation/footer/sidebar/header elements together with their content.
    text = re.sub(
        r'<(nav|footer|aside|header)\b[^>]*>.*?</\1\s*>',
        ' ',
        markup,
        flags=re.DOTALL | re.IGNORECASE,
    )
    # Every remaining tag becomes a space so adjacent words do not fuse.
    text = re.sub(r'<[^>]+>', ' ', text)
    # Decode entities only after tag removal, so "&lt;b&gt;" survives as
    # literal text instead of being stripped as a tag.
    text = unescape(text)
    # Collapse all whitespace runs (including non-breaking spaces) to one space.
    return re.sub(r'\s+', ' ', text).strip()


def extract_text_from_html(html, *, max_chars=2000, min_chars=100):
    """Extract readable text from HTML.

    The document is cleaned of DOCTYPE, comments, scripts, styles and
    noscript blocks. Text is then taken from the main content container
    (``<main>``, ``<article>`` or a ``<div>`` with a ``content`` class) when
    one exists and holds enough text; otherwise from the whole document.
    Navigation, footer, aside and header elements are removed, HTML entities
    are decoded and whitespace is collapsed to single spaces.

    Args:
        html: The HTML document as a string. ``None`` or an empty string is
            treated as having no content.
        max_chars: Maximum number of characters to return.
        min_chars: Minimum length of extracted text; anything shorter is
            reported as not accessible.

    Returns:
        Up to ``max_chars`` characters of clean text, or the string
        ``"(Content not accessible)"`` when fewer than ``min_chars``
        characters could be extracted.
    """
    if not html:
        return "(Content not accessible)"

    # Remove DOCTYPE and comments.
    cleaned = re.sub(r'<!DOCTYPE[^>]*>', '', html, flags=re.IGNORECASE)
    cleaned = re.sub(r'<!--.*?-->', '', cleaned, flags=re.DOTALL)

    # Remove elements whose content is never readable text.
    for tag_name in ('script', 'style', 'noscript'):
        cleaned = re.sub(
            r'<' + tag_name + r'\b[^>]*>.*?</' + tag_name + r'\s*>',
            ' ',
            cleaned,
            flags=re.DOTALL | re.IGNORECASE,
        )

    # Prefer the main content area, but fall back to the whole page when the
    # matched region is too small to be the real content.
    region = _find_main_region(cleaned)
    text = _markup_to_text(region) if region is not None else ''
    if len(text) < min_chars:
        text = _markup_to_text(cleaned)

    if len(text) < min_chars:
        return "(Content not accessible)"
    return text[:max_chars]
# Analyze content with Haiku via gateway # Analyze content with Haiku via gateway