Fix: Clean GitHub README markdown properly (remove images and code blocks, convert links to plain text)

This commit is contained in:
Remora
2026-02-09 19:29:07 +01:00
parent 45a2ee8e1d
commit 76f564ae83

41
bot.py
View File

@@ -162,14 +162,15 @@ def get_github_content(url):
"""Fetch GitHub repo README via API"""
logger.debug(f" Fetching GitHub repo info from API")
match = re.search(r'github\.com/([^/]+)/([^/]+)', url)
match = re.search(r'github\.com/([^/]+)/([^/]+?)(/|$)', url)
if not match:
return None
owner, repo = match.groups()
owner, repo = match.groups()[:2]
repo = repo.rstrip('/')
try:
# Try to get README
# Try to get README as raw markdown
response = requests.get(
f"https://api.github.com/repos/{owner}/{repo}/readme",
headers={"Accept": "application/vnd.github.v3.raw"},
@@ -177,11 +178,20 @@ def get_github_content(url):
)
if response.status_code == 200:
content = response.text[:2000]
logger.debug(f" Got README: {len(content)} chars")
return content
content = response.text
# Clean markdown: remove images, code blocks, links
content = re.sub(r'!\[.*?\]\(.*?\)', '', content) # Remove images
content = re.sub(r'```.*?```', '', content, flags=re.DOTALL) # Remove code blocks
content = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', content) # Convert markdown links to text
content = re.sub(r'#{1,6}\s+', '', content) # Remove headers
content = re.sub(r'[*_-]{3,}', '', content) # Remove horizontal rules
content = re.sub(r'\s+', ' ', content).strip() # Clean whitespace
if len(content) > 50:
logger.debug(f" Got README: {len(content)} chars after cleaning")
return content[:2000]
# Fallback: get repo info from API
# Fallback: get repo info JSON
response = requests.get(
f"https://api.github.com/repos/{owner}/{repo}",
timeout=5
@@ -189,10 +199,19 @@ def get_github_content(url):
if response.status_code == 200:
data = response.json()
desc = data.get("description", "")
if desc:
logger.debug(f" Got repo description: {desc}")
return desc
# Collect useful info
parts = []
if data.get("description"):
parts.append(data["description"])
if data.get("topics"):
parts.append(f"Topics: {', '.join(data['topics'][:3])}")
if data.get("language"):
parts.append(f"Language: {data['language']}")
content = " ".join(parts)
if content:
logger.debug(f" Got repo info: {len(content)} chars")
return content
except Exception as e:
logger.warning(f" GitHub API error: {e}")