Fix: Clean GitHub README markdown properly (strip images, code blocks, headers, and horizontal rules; convert links to plain text)

This commit is contained in:
Remora
2026-02-09 19:29:07 +01:00
parent 45a2ee8e1d
commit 76f564ae83

41
bot.py
View File

@@ -162,14 +162,15 @@ def get_github_content(url):
"""Fetch GitHub repo README via API""" """Fetch GitHub repo README via API"""
logger.debug(f" Fetching GitHub repo info from API") logger.debug(f" Fetching GitHub repo info from API")
match = re.search(r'github\.com/([^/]+)/([^/]+)', url) match = re.search(r'github\.com/([^/]+)/([^/]+?)(/|$)', url)
if not match: if not match:
return None return None
owner, repo = match.groups() owner, repo = match.groups()[:2]
repo = repo.rstrip('/')
try: try:
# Try to get README # Try to get README as raw markdown
response = requests.get( response = requests.get(
f"https://api.github.com/repos/{owner}/{repo}/readme", f"https://api.github.com/repos/{owner}/{repo}/readme",
headers={"Accept": "application/vnd.github.v3.raw"}, headers={"Accept": "application/vnd.github.v3.raw"},
@@ -177,11 +178,20 @@ def get_github_content(url):
) )
if response.status_code == 200: if response.status_code == 200:
content = response.text[:2000] content = response.text
logger.debug(f" Got README: {len(content)} chars") # Clean markdown: remove images, code blocks, links
return content content = re.sub(r'!\[.*?\]\(.*?\)', '', content) # Remove images
content = re.sub(r'```.*?```', '', content, flags=re.DOTALL) # Remove code blocks
content = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', content) # Convert markdown links to text
content = re.sub(r'#{1,6}\s+', '', content) # Remove headers
content = re.sub(r'[*_-]{3,}', '', content) # Remove horizontal rules
content = re.sub(r'\s+', ' ', content).strip() # Clean whitespace
if len(content) > 50:
logger.debug(f" Got README: {len(content)} chars after cleaning")
return content[:2000]
# Fallback: get repo info from API # Fallback: get repo info JSON
response = requests.get( response = requests.get(
f"https://api.github.com/repos/{owner}/{repo}", f"https://api.github.com/repos/{owner}/{repo}",
timeout=5 timeout=5
@@ -189,10 +199,19 @@ def get_github_content(url):
if response.status_code == 200: if response.status_code == 200:
data = response.json() data = response.json()
desc = data.get("description", "") # Collect useful info
if desc: parts = []
logger.debug(f" Got repo description: {desc}") if data.get("description"):
return desc parts.append(data["description"])
if data.get("topics"):
parts.append(f"Topics: {', '.join(data['topics'][:3])}")
if data.get("language"):
parts.append(f"Language: {data['language']}")
content = " ".join(parts)
if content:
logger.debug(f" Got repo info: {len(content)} chars")
return content
except Exception as e: except Exception as e:
logger.warning(f" GitHub API error: {e}") logger.warning(f" GitHub API error: {e}")