Improve: better title extraction (og:title and h1 fallbacks), description parsing (og:description fallback), disable gateway for now

2026-02-09 18:50:14 +01:00
parent 03999875b5
commit bec7f13d82
1 changed file with 42 additions and 9 deletions
@@ -96,21 +96,50 @@ def fetch_url_content(url):
    try:
        response = requests.get(
            url,
-            timeout=5,
-            headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'},
+            timeout=8,
+            headers={
+                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
+                'Accept': 'text/html,application/xhtml+xml'
+            },
            allow_redirects=True
        )
        response.raise_for_status()
-        content = response.text[:3000]  # First 3k chars
+        content = response.text[:4000]  # First 4k chars
        
-        # Extract title
-        title_match = re.search(r'<title[^>]*>([^<]+)</title>', content, re.IGNORECASE)
-        title = title_match.group(1).strip() if title_match else "No title found"
+        # Try multiple patterns for title
+        title = None
+        
+        # Pattern 1: <title> tag
+        title_match = re.search(r'<title[^>]*>\s*([^<]+?)\s*</title>', content, re.IGNORECASE)
+        if title_match:
+            title = title_match.group(1).strip()
+        
+        # Pattern 2: og:title meta tag (for GitHub, etc.)
+        if not title:
+            og_match = re.search(r'<meta\s+property="og:title"\s+content="([^"]+)"', content, re.IGNORECASE)
+            if og_match:
+                title = og_match.group(1).strip()
+        
+        # Pattern 3: h1 tag (for GitHub README)
+        if not title:
+            h1_match = re.search(r'<h1[^>]*>([^<]+)</h1>', content, re.IGNORECASE)
+            if h1_match:
+                title = h1_match.group(1).strip()
+        
+        # Fallback
+        if not title:
+            title = url.split('/')[-1] or "Untitled"
        
        # Extract meta description
        desc_match = re.search(r'<meta\s+name="description"\s+content="([^"]+)"', content, re.IGNORECASE)
        description = desc_match.group(1) if desc_match else ""
        
+        # Extract og:description
+        if not description:
+            og_desc = re.search(r'<meta\s+property="og:description"\s+content="([^"]+)"', content, re.IGNORECASE)
+            if og_desc:
+                description = og_desc.group(1)
+        
        logger.debug(f"    ✓ Fetched: {title}")
        return {
            "title": title,
@@ -249,16 +278,20 @@ class LinkAnalyzerBot(discord.Client):
                fetch_result = fetch_url_content(url)
                title = fetch_result["title"]
                
-                # Analyze with gateway
+                # Analyze with gateway (disabled for now - no valid endpoint)
                analysis = None
-                if fetch_result["status"] == "ok":
-                    analysis = analyze_with_gateway(url, title, fetch_result.get("content", ""))
+                # if fetch_result["status"] == "ok":
+                #     analysis = analyze_with_gateway(url, title, fetch_result.get("content", ""))
                
                # Add to Tududi
                tududi_ok = add_to_tududi(title, url, link_type, analysis or "")
                
                # Format response for Discord
                response_text = f"📌 **{link_type}**: {title}"
+                if fetch_result.get("description"):
+                    # Add description if available
+                    desc = fetch_result["description"][:150]
+                    response_text += f"\n📝 {desc}"
                if analysis:
                    # Truncate to 200 chars for Discord
                    summary = analysis[:200].split('\n')[0]