Improve: better title extraction (og:title, h1) and description parsing; disable gateway for now

2026-02-09 18:50:14 +01:00
parent 03999875b5
commit bec7f13d82
1 changed files with 42 additions and 9 deletions
@@ -96,21 +96,50 @@ def fetch_url_content(url):
    try:
        response = requests.get(
            url,
-            timeout=5,
+            timeout=8,
-            headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'},
+            headers={
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
                'Accept': 'text/html,application/xhtml+xml'
            },
            allow_redirects=True
        )
        response.raise_for_status()
-        content = response.text[:3000]  # First 3k chars
+        content = response.text[:4000]  # First 4k chars
-        # Extract title
+        # Try multiple patterns for title
-        title_match = re.search(r'<title[^>]*>([^<]+)</title>', content, re.IGNORECASE)
+        title = None
-        title = title_match.group(1).strip() if title_match else "No title found"
+        
        # Pattern 1: <title> tag
        title_match = re.search(r'<title[^>]*>\s*([^<]+?)\s*</title>', content, re.IGNORECASE)
        if title_match:
            title = title_match.group(1).strip()
        # Pattern 2: og:title meta tag (for GitHub, etc.)
        if not title:
            og_match = re.search(r'<meta\s+property="og:title"\s+content="([^"]+)"', content, re.IGNORECASE)
            if og_match:
                title = og_match.group(1).strip()
        # Pattern 3: h1 tag (for GitHub README)
        if not title:
            h1_match = re.search(r'<h1[^>]*>([^<]+)</h1>', content, re.IGNORECASE)
            if h1_match:
                title = h1_match.group(1).strip()
        # Fallback
        if not title:
            title = url.split('/')[-1] or "Untitled"
        # Extract meta description
        desc_match = re.search(r'<meta\s+name="description"\s+content="([^"]+)"', content, re.IGNORECASE)
        description = desc_match.group(1) if desc_match else ""
        # Extract og:description
        if not description:
            og_desc = re.search(r'<meta\s+property="og:description"\s+content="([^"]+)"', content, re.IGNORECASE)
            if og_desc:
                description = og_desc.group(1)
        logger.debug(f"    ✓ Fetched: {title}")
        return {
            "title": title,
@@ -249,16 +278,20 @@ class LinkAnalyzerBot(discord.Client):
                fetch_result = fetch_url_content(url)
                title = fetch_result["title"]
-                # Analyze with gateway
+                # Analyze with gateway (disabled for now - no valid endpoint)
                analysis = None
-                if fetch_result["status"] == "ok":
+                # if fetch_result["status"] == "ok":
-                    analysis = analyze_with_gateway(url, title, fetch_result.get("content", ""))
+                #     analysis = analyze_with_gateway(url, title, fetch_result.get("content", ""))
                # Add to Tududi
                tududi_ok = add_to_tududi(title, url, link_type, analysis or "")
                # Format response for Discord
                response_text = f"📌 **{link_type}**: {title}"
                if fetch_result.get("description"):
                    # Add description if available
                    desc = fetch_result["description"][:150]
                    response_text += f"\n📝 {desc}"
                if analysis:
                    # Truncate to 200 chars for Discord
                    summary = analysis[:200].split('\n')[0]