From bec7f13d820e3f4796d6effd1486fdc518d1d810 Mon Sep 17 00:00:00 2001
From: Remora <remora@dilain.com>
Date: Mon, 9 Feb 2026 18:50:14 +0100
Subject: [PATCH] Improve: Better title extraction (og:title, h1), description
 parsing, disable gateway for now

---
 bot.py | 51 ++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 42 insertions(+), 9 deletions(-)
diff --git a/bot.py b/bot.py
index a375a32..15b0540 100644
--- a/bot.py
+++ b/bot.py
@@ -96,21 +96,50 @@ def fetch_url_content(url):
     try:
         response = requests.get(
             url,
-            timeout=5,
-            headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'},
+            timeout=8,
+            headers={
+                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
+                'Accept': 'text/html,application/xhtml+xml'
+            },
             allow_redirects=True
         )
         response.raise_for_status()
-        content = response.text[:3000]  # First 3k chars
+        content = response.text[:4000]  # First 4k chars
         
-        # Extract title
-        title_match = re.search(r'<title[^>]*>([^<]+)</title>', content, re.IGNORECASE)
-        title = title_match.group(1).strip() if title_match else "No title found"
+        # Try multiple patterns for title
+        title = None
+        
+        # Pattern 1: <title> tag
+        title_match = re.search(r'<title[^>]*>\s*([^<]+?)\s*</title>', content, re.IGNORECASE)
+        if title_match:
+            title = title_match.group(1).strip()
+        
+        # Pattern 2: og:title meta tag (for GitHub, etc.)
+        if not title:
+            og_match = re.search(r'<meta\s+property="og:title"\s+content="([^"]+)"', content, re.IGNORECASE)
+            if og_match:
+                title = og_match.group(1).strip()
+        
+        # Pattern 3: h1 tag (for GitHub README)
+        if not title:
+            h1_match = re.search(r'<h1[^>]*>([^<]+)</h1>', content, re.IGNORECASE)
+            if h1_match:
+                title = h1_match.group(1).strip()
+        
+        # Fallback
+        if not title:
+            title = url.split('/')[-1] or "Untitled"
         
         # Extract meta description
         desc_match = re.search(r'<meta\s+name="description"\s+content="([^"]+)"', content, re.IGNORECASE)
         description = desc_match.group(1) if desc_match else ""
         
+        # Fall back to og:description when no meta description was found
+        if not description:
+            og_desc = re.search(r'<meta\s+property="og:description"\s+content="([^"]+)"', content, re.IGNORECASE)
+            if og_desc:
+                description = og_desc.group(1)
+        
         logger.debug(f"    ✓ Fetched: {title}")
         return {
             "title": title,
@@ -249,16 +278,20 @@ class LinkAnalyzerBot(discord.Client):
                 fetch_result = fetch_url_content(url)
                 title = fetch_result["title"]
                 
-                # Analyze with gateway
+                # Analyze with gateway (disabled for now - no valid endpoint)
                 analysis = None
-                if fetch_result["status"] == "ok":
-                    analysis = analyze_with_gateway(url, title, fetch_result.get("content", ""))
+                # if fetch_result["status"] == "ok":
+                #     analysis = analyze_with_gateway(url, title, fetch_result.get("content", ""))
                 
                 # Add to Tududi
                 tududi_ok = add_to_tududi(title, url, link_type, analysis or "")
                 
                 # Format response for Discord
                 response_text = f"📌 **{link_type}**: {title}"
+                if fetch_result.get("description"):
+                    # Cap the description at 150 chars to keep the reply short
+                    desc = fetch_result["description"][:150]
+                    response_text += f"\n📝 {desc}"
                 if analysis:
                     # Truncate to 200 chars for Discord
                     summary = analysis[:200].split('\n')[0]