Feature: GitHub API for README extraction (better content for SPA sites)

This commit is contained in:
Remora
2026-02-09 19:23:09 +01:00
parent 44a080cc13
commit 45a2ee8e1d

64
bot.py
View File

@@ -157,6 +157,48 @@ def fetch_url_content(url):
logger.error(f" ❌ Error: {e}")
return {"title": "Fetch failed", "status": "error", "error": str(e), "content": ""}
# Get GitHub repo info from API
def get_github_content(url):
    """Fetch a GitHub repo's README (or its description as a fallback) via the API.

    Args:
        url: Any URL containing "github.com/<owner>/<repo>".

    Returns:
        Up to 2000 chars of raw README text, else the repo description,
        else None (not a repo URL, non-200 responses, or a request error).
    """
    logger.debug(" Fetching GitHub repo info from API")
    # [^/?#] keeps query strings / fragments out of the captured repo name
    match = re.search(r'github\.com/([^/]+)/([^/?#]+)', url)
    if not match:
        return None
    owner, repo = match.groups()
    # Clone-style URLs end in ".git"; the API wants the bare repo name
    if repo.endswith(".git"):
        repo = repo[:-4]
    try:
        # Try to get README; the raw media type returns plain text (no base64)
        response = requests.get(
            f"https://api.github.com/repos/{owner}/{repo}/readme",
            headers={"Accept": "application/vnd.github.v3.raw"},
            timeout=5
        )
        if response.status_code == 200:
            content = response.text[:2000]
            logger.debug(f" Got README: {len(content)} chars")
            return content
        # Fallback: get repo info from API
        response = requests.get(
            f"https://api.github.com/repos/{owner}/{repo}",
            timeout=5
        )
        if response.status_code == 200:
            data = response.json()
            # API returns "description": null for repos without one; `or ""`
            # normalizes that to an empty string before the truthiness check
            desc = data.get("description") or ""
            if desc:
                logger.debug(f" Got repo description: {desc}")
                return desc
    except Exception as e:
        # Best-effort: callers fall back to HTML extraction, so log and degrade
        logger.warning(f" GitHub API error: {e}")
    return None
# Extract clean text from HTML
def extract_text_from_html(html):
"""Extract readable text from HTML"""
@@ -180,6 +222,7 @@ def extract_text_from_html(html):
# Remove remaining HTML tags but keep text
text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
text = re.sub(r'<p[^>]*>', '\n', text, flags=re.IGNORECASE)
text = re.sub(r'<h[1-6][^>]*>', '\n', text, flags=re.IGNORECASE)
text = re.sub(r'</?[^>]+>', ' ', text)
# Decode HTML entities
@@ -194,7 +237,7 @@ def extract_text_from_html(html):
# Return first meaningful chunk
if len(text) < 100:
return "(Content not accessible)"
return None
return text[:2000] # First 2000 chars of clean text
@@ -204,9 +247,24 @@ def analyze_content(url, title, content, link_type):
logger.debug(f" 🤖 Analyzing content: {url}")
try:
# Extract clean text
# Special handling for GitHub
clean_text = None
if link_type == "GitHub":
clean_text = get_github_content(url)
# Fallback: extract from HTML
if not clean_text:
clean_text = extract_text_from_html(content)
logger.debug(f" Extracted {len(clean_text)} chars of clean text")
if not clean_text:
logger.warning(f" Could not extract content")
return {
"summary": f"GitHub project: {title}",
"tag": "project",
"relevance": "relevant"
}
logger.debug(f" Extracted {len(clean_text)} chars of content")
# Build analysis prompt
prompt = f"""Analyze this webpage and create a brief summary for Laurent.