Feature: GitHub API for README extraction (better content for SPA sites)
This commit is contained in:
66
bot.py
66
bot.py
@@ -157,6 +157,48 @@ def fetch_url_content(url):
|
|||||||
logger.error(f" ❌ Error: {e}")
|
logger.error(f" ❌ Error: {e}")
|
||||||
return {"title": "Fetch failed", "status": "error", "error": str(e), "content": ""}
|
return {"title": "Fetch failed", "status": "error", "error": str(e), "content": ""}
|
||||||
|
|
||||||
|
# Get GitHub repo info from API
def get_github_content(url):
    """Fetch README content (or repo description) for a GitHub repo via the API.

    Tries the README endpoint first (raw content), then falls back to the
    repository's description field. Returns up to 2000 chars of text, or
    None when the URL is not a repo URL or the API calls fail.
    """
    logger.debug("    Fetching GitHub repo info from API")

    # [^/?#] keeps query strings and fragments out of the captured names
    # (e.g. github.com/owner/repo?tab=readme previously broke the API URL).
    match = re.search(r'github\.com/([^/?#]+)/([^/?#]+)', url)
    if not match:
        return None

    owner, repo = match.groups()
    # Clone-style URLs end in ".git"; the API wants the bare repo name.
    if repo.endswith(".git"):
        repo = repo[:-4]

    try:
        # Try to get README (raw media type returns the file body directly)
        response = requests.get(
            f"https://api.github.com/repos/{owner}/{repo}/readme",
            headers={"Accept": "application/vnd.github.v3.raw"},
            timeout=5
        )

        if response.status_code == 200:
            content = response.text[:2000]
            logger.debug(f"    Got README: {len(content)} chars")
            return content

        # Fallback: get repo info from API
        response = requests.get(
            f"https://api.github.com/repos/{owner}/{repo}",
            timeout=5
        )

        if response.status_code == 200:
            data = response.json()
            # API returns null (not missing key) for empty descriptions,
            # so the truthiness check below covers both None and "".
            desc = data.get("description", "")
            if desc:
                logger.debug(f"    Got repo description: {desc}")
                return desc

    except Exception as e:
        # Best-effort enrichment: log and fall through so the caller can
        # use its HTML-extraction fallback instead of crashing.
        logger.warning(f"    GitHub API error: {e}")

    return None
|
||||||
|
|
||||||
# Extract clean text from HTML
|
# Extract clean text from HTML
|
||||||
def extract_text_from_html(html):
|
def extract_text_from_html(html):
|
||||||
"""Extract readable text from HTML"""
|
"""Extract readable text from HTML"""
|
||||||
@@ -180,6 +222,7 @@ def extract_text_from_html(html):
|
|||||||
# Remove remaining HTML tags but keep text
|
# Remove remaining HTML tags but keep text
|
||||||
text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
|
text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
|
||||||
text = re.sub(r'<p[^>]*>', '\n', text, flags=re.IGNORECASE)
|
text = re.sub(r'<p[^>]*>', '\n', text, flags=re.IGNORECASE)
|
||||||
|
text = re.sub(r'<h[1-6][^>]*>', '\n', text, flags=re.IGNORECASE)
|
||||||
text = re.sub(r'</?[^>]+>', ' ', text)
|
text = re.sub(r'</?[^>]+>', ' ', text)
|
||||||
|
|
||||||
# Decode HTML entities
|
# Decode HTML entities
|
||||||
@@ -194,7 +237,7 @@ def extract_text_from_html(html):
|
|||||||
|
|
||||||
# Return first meaningful chunk
|
# Return first meaningful chunk
|
||||||
if len(text) < 100:
|
if len(text) < 100:
|
||||||
return "(Content not accessible)"
|
return None
|
||||||
|
|
||||||
return text[:2000] # First 2000 chars of clean text
|
return text[:2000] # First 2000 chars of clean text
|
||||||
|
|
||||||
@@ -204,9 +247,24 @@ def analyze_content(url, title, content, link_type):
|
|||||||
logger.debug(f" 🤖 Analyzing content: {url}")
|
logger.debug(f" 🤖 Analyzing content: {url}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Extract clean text
|
# Special handling for GitHub
|
||||||
clean_text = extract_text_from_html(content)
|
clean_text = None
|
||||||
logger.debug(f" Extracted {len(clean_text)} chars of clean text")
|
if link_type == "GitHub":
|
||||||
|
clean_text = get_github_content(url)
|
||||||
|
|
||||||
|
# Fallback: extract from HTML
|
||||||
|
if not clean_text:
|
||||||
|
clean_text = extract_text_from_html(content)
|
||||||
|
|
||||||
|
if not clean_text:
|
||||||
|
logger.warning(f" Could not extract content")
|
||||||
|
return {
|
||||||
|
"summary": f"GitHub project: {title}",
|
||||||
|
"tag": "project",
|
||||||
|
"relevance": "relevant"
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.debug(f" Extracted {len(clean_text)} chars of content")
|
||||||
|
|
||||||
# Build analysis prompt
|
# Build analysis prompt
|
||||||
prompt = f"""Analyze this webpage and create a brief summary for Laurent.
|
prompt = f"""Analyze this webpage and create a brief summary for Laurent.
|
||||||
|
|||||||
Reference in New Issue
Block a user