Improve: better title extraction (og:title and h1 fallbacks), description parsing (meta description, og:description); disable gateway analysis for now
This commit is contained in:
51
bot.py
51
bot.py
@@ -96,21 +96,50 @@ def fetch_url_content(url):
|
||||
try:
|
||||
response = requests.get(
|
||||
url,
|
||||
timeout=5,
|
||||
headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'},
|
||||
timeout=8,
|
||||
headers={
|
||||
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
|
||||
'Accept': 'text/html,application/xhtml+xml'
|
||||
},
|
||||
allow_redirects=True
|
||||
)
|
||||
response.raise_for_status()
|
||||
content = response.text[:3000] # First 3k chars
|
||||
content = response.text[:4000] # First 4k chars
|
||||
|
||||
# Extract title
|
||||
title_match = re.search(r'<title[^>]*>([^<]+)</title>', content, re.IGNORECASE)
|
||||
title = title_match.group(1).strip() if title_match else "No title found"
|
||||
# Try multiple patterns for title
|
||||
title = None
|
||||
|
||||
# Pattern 1: <title> tag
|
||||
title_match = re.search(r'<title[^>]*>\s*([^<]+?)\s*</title>', content, re.IGNORECASE)
|
||||
if title_match:
|
||||
title = title_match.group(1).strip()
|
||||
|
||||
# Pattern 2: og:title meta tag (for GitHub, etc.)
|
||||
if not title:
|
||||
og_match = re.search(r'<meta\s+property="og:title"\s+content="([^"]+)"', content, re.IGNORECASE)
|
||||
if og_match:
|
||||
title = og_match.group(1).strip()
|
||||
|
||||
# Pattern 3: h1 tag (for GitHub README)
|
||||
if not title:
|
||||
h1_match = re.search(r'<h1[^>]*>([^<]+)</h1>', content, re.IGNORECASE)
|
||||
if h1_match:
|
||||
title = h1_match.group(1).strip()
|
||||
|
||||
# Fallback
|
||||
if not title:
|
||||
title = url.split('/')[-1] or "Untitled"
|
||||
|
||||
# Extract meta description
|
||||
desc_match = re.search(r'<meta\s+name="description"\s+content="([^"]+)"', content, re.IGNORECASE)
|
||||
description = desc_match.group(1) if desc_match else ""
|
||||
|
||||
# Extract og:description
|
||||
if not description:
|
||||
og_desc = re.search(r'<meta\s+property="og:description"\s+content="([^"]+)"', content, re.IGNORECASE)
|
||||
if og_desc:
|
||||
description = og_desc.group(1)
|
||||
|
||||
logger.debug(f" ✓ Fetched: {title}")
|
||||
return {
|
||||
"title": title,
|
||||
@@ -249,16 +278,20 @@ class LinkAnalyzerBot(discord.Client):
|
||||
fetch_result = fetch_url_content(url)
|
||||
title = fetch_result["title"]
|
||||
|
||||
# Analyze with gateway
|
||||
# Analyze with gateway (disabled for now - no valid endpoint)
|
||||
analysis = None
|
||||
if fetch_result["status"] == "ok":
|
||||
analysis = analyze_with_gateway(url, title, fetch_result.get("content", ""))
|
||||
# if fetch_result["status"] == "ok":
|
||||
# analysis = analyze_with_gateway(url, title, fetch_result.get("content", ""))
|
||||
|
||||
# Add to Tududi
|
||||
tududi_ok = add_to_tududi(title, url, link_type, analysis or "")
|
||||
|
||||
# Format response for Discord
|
||||
response_text = f"📌 **{link_type}**: {title}"
|
||||
if fetch_result.get("description"):
|
||||
# Add description if available
|
||||
desc = fetch_result["description"][:150]
|
||||
response_text += f"\n📝 {desc}"
|
||||
if analysis:
|
||||
# Truncate to 200 chars for Discord
|
||||
summary = analysis[:200].split('\n')[0]
|
||||
|
||||
Reference in New Issue
Block a user