Improve: better title extraction (og:title and h1 fallbacks), add og:description parsing, and disable the gateway analysis for now (no valid endpoint)

This commit is contained in:
Remora
2026-02-09 18:50:14 +01:00
parent 03999875b5
commit bec7f13d82

51
bot.py
View File

@@ -96,21 +96,50 @@ def fetch_url_content(url):
try: try:
response = requests.get( response = requests.get(
url, url,
timeout=5, timeout=8,
headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'}, headers={
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
'Accept': 'text/html,application/xhtml+xml'
},
allow_redirects=True allow_redirects=True
) )
response.raise_for_status() response.raise_for_status()
content = response.text[:3000] # First 3k chars content = response.text[:4000] # First 4k chars
# Extract title # Try multiple patterns for title
title_match = re.search(r'<title[^>]*>([^<]+)</title>', content, re.IGNORECASE) title = None
title = title_match.group(1).strip() if title_match else "No title found"
# Pattern 1: <title> tag
title_match = re.search(r'<title[^>]*>\s*([^<]+?)\s*</title>', content, re.IGNORECASE)
if title_match:
title = title_match.group(1).strip()
# Pattern 2: og:title meta tag (for GitHub, etc.)
if not title:
og_match = re.search(r'<meta\s+property="og:title"\s+content="([^"]+)"', content, re.IGNORECASE)
if og_match:
title = og_match.group(1).strip()
# Pattern 3: h1 tag (for GitHub README)
if not title:
h1_match = re.search(r'<h1[^>]*>([^<]+)</h1>', content, re.IGNORECASE)
if h1_match:
title = h1_match.group(1).strip()
# Fallback
if not title:
title = url.split('/')[-1] or "Untitled"
# Extract meta description # Extract meta description
desc_match = re.search(r'<meta\s+name="description"\s+content="([^"]+)"', content, re.IGNORECASE) desc_match = re.search(r'<meta\s+name="description"\s+content="([^"]+)"', content, re.IGNORECASE)
description = desc_match.group(1) if desc_match else "" description = desc_match.group(1) if desc_match else ""
# Extract og:description
if not description:
og_desc = re.search(r'<meta\s+property="og:description"\s+content="([^"]+)"', content, re.IGNORECASE)
if og_desc:
description = og_desc.group(1)
logger.debug(f" ✓ Fetched: {title}") logger.debug(f" ✓ Fetched: {title}")
return { return {
"title": title, "title": title,
@@ -249,16 +278,20 @@ class LinkAnalyzerBot(discord.Client):
fetch_result = fetch_url_content(url) fetch_result = fetch_url_content(url)
title = fetch_result["title"] title = fetch_result["title"]
# Analyze with gateway # Analyze with gateway (disabled for now - no valid endpoint)
analysis = None analysis = None
if fetch_result["status"] == "ok": # if fetch_result["status"] == "ok":
analysis = analyze_with_gateway(url, title, fetch_result.get("content", "")) # analysis = analyze_with_gateway(url, title, fetch_result.get("content", ""))
# Add to Tududi # Add to Tududi
tududi_ok = add_to_tududi(title, url, link_type, analysis or "") tududi_ok = add_to_tududi(title, url, link_type, analysis or "")
# Format response for Discord # Format response for Discord
response_text = f"📌 **{link_type}**: {title}" response_text = f"📌 **{link_type}**: {title}"
if fetch_result.get("description"):
# Add description if available
desc = fetch_result["description"][:150]
response_text += f"\n📝 {desc}"
if analysis: if analysis:
# Truncate to 200 chars for Discord # Truncate to 200 chars for Discord
summary = analysis[:200].split('\n')[0] summary = analysis[:200].split('\n')[0]