From bec7f13d820e3f4796d6effd1486fdc518d1d810 Mon Sep 17 00:00:00 2001 From: Remora Date: Mon, 9 Feb 2026 18:50:14 +0100 Subject: [PATCH] Improve: Better title extraction (og:title, h1), description parsing, disable gateway for now --- bot.py | 51 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 9 deletions(-) diff --git a/bot.py b/bot.py index a375a32..15b0540 100644 --- a/bot.py +++ b/bot.py @@ -96,21 +96,50 @@ def fetch_url_content(url): try: response = requests.get( url, - timeout=5, - headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'}, + timeout=8, + headers={ + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36', + 'Accept': 'text/html,application/xhtml+xml' + }, allow_redirects=True ) response.raise_for_status() - content = response.text[:3000] # First 3k chars + content = response.text[:4000] # First 4k chars - # Extract title - title_match = re.search(r']*>([^<]+)', content, re.IGNORECASE) - title = title_match.group(1).strip() if title_match else "No title found" + # Try multiple patterns for title + title = None + + # Pattern 1: tag + title_match = re.search(r'<title[^>]*>\s*([^<]+?)\s*', content, re.IGNORECASE) + if title_match: + title = title_match.group(1).strip() + + # Pattern 2: og:title meta tag (for GitHub, etc.) + if not title: + og_match = re.search(r']*>([^<]+)', content, re.IGNORECASE) + if h1_match: + title = h1_match.group(1).strip() + + # Fallback + if not title: + title = url.split('/')[-1] or "Untitled" # Extract meta description desc_match = re.search(r'