Improve: Better title extraction (og:title, h1), description parsing, disable gateway for now
This commit is contained in:
51
bot.py
51
bot.py
@@ -96,21 +96,50 @@ def fetch_url_content(url):
|
|||||||
try:
|
try:
|
||||||
response = requests.get(
|
response = requests.get(
|
||||||
url,
|
url,
|
||||||
timeout=5,
|
timeout=8,
|
||||||
headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'},
|
headers={
|
||||||
|
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
|
||||||
|
'Accept': 'text/html,application/xhtml+xml'
|
||||||
|
},
|
||||||
allow_redirects=True
|
allow_redirects=True
|
||||||
)
|
)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
content = response.text[:3000] # First 3k chars
|
content = response.text[:4000] # First 4k chars
|
||||||
|
|
||||||
# Extract title
|
# Try multiple patterns for title
|
||||||
title_match = re.search(r'<title[^>]*>([^<]+)</title>', content, re.IGNORECASE)
|
title = None
|
||||||
title = title_match.group(1).strip() if title_match else "No title found"
|
|
||||||
|
# Pattern 1: <title> tag
|
||||||
|
title_match = re.search(r'<title[^>]*>\s*([^<]+?)\s*</title>', content, re.IGNORECASE)
|
||||||
|
if title_match:
|
||||||
|
title = title_match.group(1).strip()
|
||||||
|
|
||||||
|
# Pattern 2: og:title meta tag (for GitHub, etc.)
|
||||||
|
if not title:
|
||||||
|
og_match = re.search(r'<meta\s+property="og:title"\s+content="([^"]+)"', content, re.IGNORECASE)
|
||||||
|
if og_match:
|
||||||
|
title = og_match.group(1).strip()
|
||||||
|
|
||||||
|
# Pattern 3: h1 tag (for GitHub README)
|
||||||
|
if not title:
|
||||||
|
h1_match = re.search(r'<h1[^>]*>([^<]+)</h1>', content, re.IGNORECASE)
|
||||||
|
if h1_match:
|
||||||
|
title = h1_match.group(1).strip()
|
||||||
|
|
||||||
|
# Fallback
|
||||||
|
if not title:
|
||||||
|
title = url.split('/')[-1] or "Untitled"
|
||||||
|
|
||||||
# Extract meta description
|
# Extract meta description
|
||||||
desc_match = re.search(r'<meta\s+name="description"\s+content="([^"]+)"', content, re.IGNORECASE)
|
desc_match = re.search(r'<meta\s+name="description"\s+content="([^"]+)"', content, re.IGNORECASE)
|
||||||
description = desc_match.group(1) if desc_match else ""
|
description = desc_match.group(1) if desc_match else ""
|
||||||
|
|
||||||
|
# Extract og:description
|
||||||
|
if not description:
|
||||||
|
og_desc = re.search(r'<meta\s+property="og:description"\s+content="([^"]+)"', content, re.IGNORECASE)
|
||||||
|
if og_desc:
|
||||||
|
description = og_desc.group(1)
|
||||||
|
|
||||||
logger.debug(f" ✓ Fetched: {title}")
|
logger.debug(f" ✓ Fetched: {title}")
|
||||||
return {
|
return {
|
||||||
"title": title,
|
"title": title,
|
||||||
@@ -249,16 +278,20 @@ class LinkAnalyzerBot(discord.Client):
|
|||||||
fetch_result = fetch_url_content(url)
|
fetch_result = fetch_url_content(url)
|
||||||
title = fetch_result["title"]
|
title = fetch_result["title"]
|
||||||
|
|
||||||
# Analyze with gateway
|
# Analyze with gateway (disabled for now - no valid endpoint)
|
||||||
analysis = None
|
analysis = None
|
||||||
if fetch_result["status"] == "ok":
|
# if fetch_result["status"] == "ok":
|
||||||
analysis = analyze_with_gateway(url, title, fetch_result.get("content", ""))
|
# analysis = analyze_with_gateway(url, title, fetch_result.get("content", ""))
|
||||||
|
|
||||||
# Add to Tududi
|
# Add to Tududi
|
||||||
tududi_ok = add_to_tududi(title, url, link_type, analysis or "")
|
tududi_ok = add_to_tududi(title, url, link_type, analysis or "")
|
||||||
|
|
||||||
# Format response for Discord
|
# Format response for Discord
|
||||||
response_text = f"📌 **{link_type}**: {title}"
|
response_text = f"📌 **{link_type}**: {title}"
|
||||||
|
if fetch_result.get("description"):
|
||||||
|
# Add description if available
|
||||||
|
desc = fetch_result["description"][:150]
|
||||||
|
response_text += f"\n📝 {desc}"
|
||||||
if analysis:
|
if analysis:
|
||||||
# Truncate to 200 chars for Discord
|
# Truncate to 200 chars for Discord
|
||||||
summary = analysis[:200].split('\n')[0]
|
summary = analysis[:200].split('\n')[0]
|
||||||
|
|||||||
Reference in New Issue
Block a user