Feature: GitHub API for README extraction (better content for SPA sites)

This commit is contained in:
Remora
2026-02-09 19:23:09 +01:00
parent 44a080cc13
commit 45a2ee8e1d

64
bot.py
View File

@@ -157,6 +157,48 @@ def fetch_url_content(url):
logger.error(f" ❌ Error: {e}")
return {"title": "Fetch failed", "status": "error", "error": str(e), "content": ""}
# Get GitHub repo info from API
def get_github_content(url):
    """Fetch a GitHub repo's README (or its description as a fallback) via the API.

    Args:
        url: Any URL containing "github.com/<owner>/<repo>".

    Returns:
        Up to 2000 chars of raw README text, else the repo description,
        else None (not a repo URL, non-200 responses, or a request error).
    """
    logger.debug(" Fetching GitHub repo info from API")
    # [^/?#] keeps query strings / fragments out of the captured repo name
    match = re.search(r'github\.com/([^/]+)/([^/?#]+)', url)
    if not match:
        return None
    owner, repo = match.groups()
    # Clone-style URLs end in ".git"; the API wants the bare repo name
    if repo.endswith(".git"):
        repo = repo[:-4]
    try:
        # Try to get README; the raw media type returns plain text (no base64)
        response = requests.get(
            f"https://api.github.com/repos/{owner}/{repo}/readme",
            headers={"Accept": "application/vnd.github.v3.raw"},
            timeout=5
        )
        if response.status_code == 200:
            content = response.text[:2000]
            logger.debug(f" Got README: {len(content)} chars")
            return content
        # Fallback: get repo info from API
        response = requests.get(
            f"https://api.github.com/repos/{owner}/{repo}",
            timeout=5
        )
        if response.status_code == 200:
            data = response.json()
            # API returns "description": null for repos without one; `or ""`
            # normalizes that to an empty string before the truthiness check
            desc = data.get("description") or ""
            if desc:
                logger.debug(f" Got repo description: {desc}")
                return desc
    except Exception as e:
        # Best-effort: callers fall back to HTML extraction, so log and degrade
        logger.warning(f" GitHub API error: {e}")
    return None
# Extract clean text from HTML
def extract_text_from_html(html):
"""Extract readable text from HTML"""
@@ -180,6 +222,7 @@ def extract_text_from_html(html):
# Remove remaining HTML tags but keep text
text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
text = re.sub(r'<p[^>]*>', '\n', text, flags=re.IGNORECASE)
text = re.sub(r'<h[1-6][^>]*>', '\n', text, flags=re.IGNORECASE)
text = re.sub(r'</?[^>]+>', ' ', text)
# Decode HTML entities
@@ -194,7 +237,7 @@ def extract_text_from_html(html):
# Return first meaningful chunk
if len(text) < 100:
return "(Content not accessible)"
return None
return text[:2000] # First 2000 chars of clean text
@@ -204,9 +247,24 @@ def analyze_content(url, title, content, link_type):
logger.debug(f" 🤖 Analyzing content: {url}")
try:
# Extract clean text
# Special handling for GitHub
clean_text = None
if link_type == "GitHub":
clean_text = get_github_content(url)
# Fallback: extract from HTML
if not clean_text:
clean_text = extract_text_from_html(content)
logger.debug(f" Extracted {len(clean_text)} chars of clean text")
if not clean_text:
logger.warning(f" Could not extract content")
return {
"summary": f"GitHub project: {title}",
"tag": "project",
"relevance": "relevant"
}
logger.debug(f" Extracted {len(clean_text)} chars of content")
# Build analysis prompt
prompt = f"""Analyze this webpage and create a brief summary for Laurent.