diff --git a/bot.py b/bot.py
index e71d05b..4aff5ca 100644
--- a/bot.py
+++ b/bot.py
@@ -157,6 +157,48 @@ def fetch_url_content(url):
         logger.error(f"   ❌ Error: {e}")
         return {"title": "Fetch failed", "status": "error", "error": str(e), "content": ""}
 
+# Get GitHub repo info from API
+def get_github_content(url):
+    """Fetch GitHub repo README via API"""
+    logger.debug("   Fetching GitHub repo info from API")
+
+    match = re.search(r'github\.com/([^/]+)/([^/]+)', url)
+    if not match:
+        return None
+
+    owner, repo = match.groups()
+
+    try:
+        # Try to get README
+        response = requests.get(
+            f"https://api.github.com/repos/{owner}/{repo}/readme",
+            headers={"Accept": "application/vnd.github.v3.raw"},
+            timeout=5
+        )
+
+        if response.status_code == 200:
+            content = response.text[:2000]
+            logger.debug(f"   Got README: {len(content)} chars")
+            return content
+
+        # Fallback: get repo info from API
+        response = requests.get(
+            f"https://api.github.com/repos/{owner}/{repo}",
+            timeout=5
+        )
+
+        if response.status_code == 200:
+            data = response.json()
+            desc = data.get("description", "")
+            if desc:
+                logger.debug(f"   Got repo description: {desc}")
+                return desc
+
+    except Exception as e:
+        logger.warning(f"   GitHub API error: {e}")
+
+    return None
+
 # Extract clean text from HTML
 def extract_text_from_html(html):
     """Extract readable text from HTML"""
@@ -180,6 +222,7 @@ def extract_text_from_html(html):
     # Remove remaining HTML tags but keep text
     text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
     text = re.sub(r'</p[^>]*>', '\n', text, flags=re.IGNORECASE)
+    text = re.sub(r'</div[^>]*>', '\n', text, flags=re.IGNORECASE)
     text = re.sub(r'<[^>]+>', ' ', text)
 
     # Decode HTML entities
@@ -194,7 +237,7 @@ def extract_text_from_html(html):
 
     # Return first meaningful chunk
     if len(text) < 100:
-        return "(Content not accessible)"
+        return None
 
     return text[:2000]  # First 2000 chars of clean text
 
@@ -204,9 +247,24 @@ def analyze_content(url, title, content, link_type):
     logger.debug(f"   🤖 Analyzing content: {url}")
 
     try:
-        # Extract clean text
-        clean_text = extract_text_from_html(content)
-        logger.debug(f"   Extracted {len(clean_text)} chars of clean text")
+        # Special handling for GitHub
+        clean_text = None
+        if link_type == "GitHub":
+            clean_text = get_github_content(url)
+
+        # Fallback: extract from HTML
+        if not clean_text:
+            clean_text = extract_text_from_html(content)
+
+        # Note: this branch is reached for ANY link type, not only GitHub,
+        # so the fallback summary must not claim the link is a GitHub project.
+        if not clean_text:
+            logger.warning("   Could not extract content")
+            return {
+                "summary": f"{link_type} link: {title}",
+                "tag": "project",
+                "relevance": "relevant"
+            }
+
+        logger.debug(f"   Extracted {len(clean_text)} chars of content")
 
         # Build analysis prompt
         prompt = f"""Analyze this webpage and create a brief summary for Laurent.