Feature: GitHub API for README extraction (better content for SPA sites)
This commit is contained in:
64
bot.py
64
bot.py
@@ -157,6 +157,48 @@ def fetch_url_content(url):
|
||||
logger.error(f" ❌ Error: {e}")
|
||||
return {"title": "Fetch failed", "status": "error", "error": str(e), "content": ""}
|
||||
|
||||
# Get GitHub repo info from API
|
||||
# Get GitHub repo info from API
def get_github_content(url):
    """Fetch a GitHub repo's README (or its description) via the GitHub API.

    Args:
        url: Any URL containing ``github.com/<owner>/<repo>``; query strings,
            fragments, extra path segments and a trailing ``.git`` are tolerated.

    Returns:
        Up to 2000 chars of raw README text; if no README is available,
        the repo's description string; ``None`` when the URL doesn't look
        like a GitHub repo or both API calls fail.
    """
    logger.debug(f" Fetching GitHub repo info from API")

    # [^/?#] (not just [^/]) keeps query strings and fragments out of the
    # captured owner/repo — e.g. "github.com/o/r?tab=readme" must yield "r".
    match = re.search(r'github\.com/([^/?#]+)/([^/?#]+)', url)
    if not match:
        return None

    owner, repo = match.groups()
    # Clone URLs often end in ".git"; the API path must use the bare name.
    if repo.endswith('.git'):
        repo = repo[:-4]

    try:
        # Try to get README — the raw media type returns plain text, not JSON
        response = requests.get(
            f"https://api.github.com/repos/{owner}/{repo}/readme",
            headers={"Accept": "application/vnd.github.v3.raw"},
            timeout=5
        )

        if response.status_code == 200:
            content = response.text[:2000]
            logger.debug(f" Got README: {len(content)} chars")
            return content

        # Fallback: get repo info from API (description field)
        response = requests.get(
            f"https://api.github.com/repos/{owner}/{repo}",
            timeout=5
        )

        if response.status_code == 200:
            data = response.json()
            # GitHub returns "description": null for blank descriptions,
            # so guard with a truthiness check before returning it.
            desc = data.get("description", "")
            if desc:
                logger.debug(f" Got repo description: {desc}")
                return desc

    except Exception as e:
        # Best-effort helper: callers fall back to HTML extraction on None,
        # so log and swallow rather than propagate.
        logger.warning(f" GitHub API error: {e}")

    return None
|
||||
|
||||
# Extract clean text from HTML
|
||||
def extract_text_from_html(html):
|
||||
"""Extract readable text from HTML"""
|
||||
@@ -180,6 +222,7 @@ def extract_text_from_html(html):
|
||||
# Remove remaining HTML tags but keep text
|
||||
text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
|
||||
text = re.sub(r'<p[^>]*>', '\n', text, flags=re.IGNORECASE)
|
||||
text = re.sub(r'<h[1-6][^>]*>', '\n', text, flags=re.IGNORECASE)
|
||||
text = re.sub(r'</?[^>]+>', ' ', text)
|
||||
|
||||
# Decode HTML entities
|
||||
@@ -194,7 +237,7 @@ def extract_text_from_html(html):
|
||||
|
||||
# Return first meaningful chunk
|
||||
if len(text) < 100:
|
||||
return "(Content not accessible)"
|
||||
return None
|
||||
|
||||
return text[:2000] # First 2000 chars of clean text
|
||||
|
||||
@@ -204,9 +247,24 @@ def analyze_content(url, title, content, link_type):
|
||||
logger.debug(f" 🤖 Analyzing content: {url}")
|
||||
|
||||
try:
|
||||
# Extract clean text
|
||||
# Special handling for GitHub
|
||||
clean_text = None
|
||||
if link_type == "GitHub":
|
||||
clean_text = get_github_content(url)
|
||||
|
||||
# Fallback: extract from HTML
|
||||
if not clean_text:
|
||||
clean_text = extract_text_from_html(content)
|
||||
logger.debug(f" Extracted {len(clean_text)} chars of clean text")
|
||||
|
||||
if not clean_text:
|
||||
logger.warning(f" Could not extract content")
|
||||
return {
|
||||
"summary": f"GitHub project: {title}",
|
||||
"tag": "project",
|
||||
"relevance": "relevant"
|
||||
}
|
||||
|
||||
logger.debug(f" Extracted {len(clean_text)} chars of content")
|
||||
|
||||
# Build analysis prompt
|
||||
prompt = f"""Analyze this webpage and create a brief summary for Laurent.
|
||||
|
||||
Reference in New Issue
Block a user