Feature: GitHub API for README extraction (better content for SPA sites)
This commit is contained in:
66
bot.py
66
bot.py
@@ -157,6 +157,48 @@ def fetch_url_content(url):
|
|||||||
logger.error(f" ❌ Error: {e}")
|
logger.error(f" ❌ Error: {e}")
|
||||||
return {"title": "Fetch failed", "status": "error", "error": str(e), "content": ""}
|
return {"title": "Fetch failed", "status": "error", "error": str(e), "content": ""}
|
||||||
|
|
||||||
|
# Get GitHub repo info from API
def get_github_content(url):
    """Fetch README content (or repo description) for a GitHub repo via the API.

    Tries the README endpoint first (raw content), then falls back to the
    repository's description field. Returns up to 2000 chars of text, or
    None when the URL is not a repo URL or the API calls fail.
    """
    logger.debug("    Fetching GitHub repo info from API")

    # [^/?#] keeps query strings and fragments out of the captured names
    # (e.g. github.com/owner/repo?tab=readme previously broke the API URL).
    match = re.search(r'github\.com/([^/?#]+)/([^/?#]+)', url)
    if not match:
        return None

    owner, repo = match.groups()
    # Clone-style URLs end in ".git"; the API wants the bare repo name.
    if repo.endswith(".git"):
        repo = repo[:-4]

    try:
        # Try to get README (raw media type returns the file body directly)
        response = requests.get(
            f"https://api.github.com/repos/{owner}/{repo}/readme",
            headers={"Accept": "application/vnd.github.v3.raw"},
            timeout=5
        )

        if response.status_code == 200:
            content = response.text[:2000]
            logger.debug(f"    Got README: {len(content)} chars")
            return content

        # Fallback: get repo info from API
        response = requests.get(
            f"https://api.github.com/repos/{owner}/{repo}",
            timeout=5
        )

        if response.status_code == 200:
            data = response.json()
            # API returns null (not missing key) for empty descriptions,
            # so the truthiness check below covers both None and "".
            desc = data.get("description", "")
            if desc:
                logger.debug(f"    Got repo description: {desc}")
                return desc

    except Exception as e:
        # Best-effort enrichment: log and fall through so the caller can
        # use its HTML-extraction fallback instead of crashing.
        logger.warning(f"    GitHub API error: {e}")

    return None
|
||||||
|
|
||||||
# Extract clean text from HTML
|
# Extract clean text from HTML
|
||||||
def extract_text_from_html(html):
|
def extract_text_from_html(html):
|
||||||
"""Extract readable text from HTML"""
|
"""Extract readable text from HTML"""
|
||||||
@@ -180,6 +222,7 @@ def extract_text_from_html(html):
|
|||||||
# Remove remaining HTML tags but keep text
|
# Remove remaining HTML tags but keep text
|
||||||
text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
|
text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
|
||||||
text = re.sub(r'<p[^>]*>', '\n', text, flags=re.IGNORECASE)
|
text = re.sub(r'<p[^>]*>', '\n', text, flags=re.IGNORECASE)
|
||||||
|
text = re.sub(r'<h[1-6][^>]*>', '\n', text, flags=re.IGNORECASE)
|
||||||
text = re.sub(r'</?[^>]+>', ' ', text)
|
text = re.sub(r'</?[^>]+>', ' ', text)
|
||||||
|
|
||||||
# Decode HTML entities
|
# Decode HTML entities
|
||||||
@@ -194,7 +237,7 @@ def extract_text_from_html(html):
|
|||||||
|
|
||||||
# Return first meaningful chunk
|
# Return first meaningful chunk
|
||||||
if len(text) < 100:
|
if len(text) < 100:
|
||||||
return "(Content not accessible)"
|
return None
|
||||||
|
|
||||||
return text[:2000] # First 2000 chars of clean text
|
return text[:2000] # First 2000 chars of clean text
|
||||||
|
|
||||||
@@ -204,9 +247,24 @@ def analyze_content(url, title, content, link_type):
|
|||||||
logger.debug(f" 🤖 Analyzing content: {url}")
|
logger.debug(f" 🤖 Analyzing content: {url}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Extract clean text
|
# Special handling for GitHub
|
||||||
clean_text = extract_text_from_html(content)
|
clean_text = None
|
||||||
logger.debug(f" Extracted {len(clean_text)} chars of clean text")
|
if link_type == "GitHub":
|
||||||
|
clean_text = get_github_content(url)
|
||||||
|
|
||||||
|
# Fallback: extract from HTML
|
||||||
|
if not clean_text:
|
||||||
|
clean_text = extract_text_from_html(content)
|
||||||
|
|
||||||
|
if not clean_text:
|
||||||
|
logger.warning(f" Could not extract content")
|
||||||
|
return {
|
||||||
|
"summary": f"GitHub project: {title}",
|
||||||
|
"tag": "project",
|
||||||
|
"relevance": "relevant"
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.debug(f" Extracted {len(clean_text)} chars of content")
|
||||||
|
|
||||||
# Build analysis prompt
|
# Build analysis prompt
|
||||||
prompt = f"""Analyze this webpage and create a brief summary for Laurent.
|
prompt = f"""Analyze this webpage and create a brief summary for Laurent.
|
||||||
|
|||||||
Reference in New Issue
Block a user