Improve: Better HTML text extraction (remove CSS, scripts, nav, footer)
This commit is contained in:
38
bot.py
38
bot.py
@@ -160,14 +160,42 @@ def fetch_url_content(url):
|
||||
# Extract clean text from HTML
|
||||
def _find_element_end(text, tag, start):
    """Return the index just past the closing tag that balances the opener at *start*.

    Scans opening and closing ``<tag>`` tokens from *start* while tracking
    nesting depth, so nested elements of the same name do not end the
    match early. Returns ``None`` when the element is never closed.
    """
    token = re.compile(r'<(/?)' + re.escape(tag) + r'(?=[\s/>])[^>]*>', re.IGNORECASE)
    depth = 0
    for match in token.finditer(text, start):
        if match.group(1):
            depth -= 1
            if depth == 0:
                return match.end()
        else:
            depth += 1
    return None


def extract_text_from_html(html):
    """Extract readable text from HTML.

    Strips the DOCTYPE, comments, and script/style/noscript elements,
    narrows to the first ``<main>``, ``<article>`` or ``<div>`` whose class
    contains ``content`` (when one is present and properly closed), drops
    nav/footer/aside/header elements, removes the remaining tags, decodes
    HTML entities and collapses whitespace.

    Args:
        html: Raw HTML markup. ``None`` or an empty string is treated as
            having no content.

    Returns:
        Up to the first 2000 characters of cleaned text, or the string
        ``"(Content not accessible)"`` when fewer than 100 characters of
        text remain.
    """
    # Imported locally because the parameter name ``html`` shadows the
    # standard-library module of the same name inside this function.
    from html import unescape

    if not html:
        return "(Content not accessible)"

    flags = re.DOTALL | re.IGNORECASE

    # Remove DOCTYPE, comments
    text = re.sub(r'<!DOCTYPE[^>]*>', '', html, flags=re.IGNORECASE)
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)

    # Remove scripts, styles, noscript (element together with its content)
    text = re.sub(
        r'<(script|style|noscript)(?=[\s/>])[^>]*>.*?</\1\s*>',
        ' ',
        text,
        flags=flags,
    )

    # Try to extract main content areas first. Only the tag name is captured,
    # so the closing tag can actually be located, and openers with attributes
    # (e.g. <main id="x">) are accepted.
    opener = re.search(
        r'<(main|article)(?=[\s/>])[^>]*>'
        r'|<(div)(?=\s)[^>]*?\sclass\s*=\s*["\'][^"\']*content[^"\']*["\'][^>]*>',
        text,
        flags=re.IGNORECASE,
    )
    if opener:
        tag = opener.group(1) or opener.group(2)
        end = _find_element_end(text, tag, opener.start())
        # An unclosed container is ignored and the whole document is used.
        if end is not None:
            text = text[opener.start():end]

    # Remove common nav/footer/sidebar patterns
    text = re.sub(
        r'<(nav|footer|aside|header)(?=[\s/>])[^>]*>.*?</\1\s*>',
        ' ',
        text,
        flags=flags,
    )

    # Remove remaining HTML tags but keep text. Every tag becomes a space;
    # separate newline handling for <br>/<p> would be erased by the
    # whitespace collapse below anyway.
    text = re.sub(r'<[^>]+>', ' ', text)

    # Decode HTML entities. Done after tag removal so that decoded "&lt;" and
    # "&gt;" are kept as literal text instead of being stripped as markup.
    text = unescape(text)

    # Clean up whitespace (\s also matches the no-break space from &nbsp;)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()

    # Return first meaningful chunk
    if len(text) < 100:
        return "(Content not accessible)"

    return text[:2000]  # First 2000 chars of clean text
|
||||
|
||||
# Analyze content with Haiku via gateway
|
||||
|
||||
Reference in New Issue
Block a user