From 76f564ae83f9c47ebcf0e75b0b41d17039d62fbd Mon Sep 17 00:00:00 2001 From: Remora Date: Mon, 9 Feb 2026 19:29:07 +0100 Subject: [PATCH] Fix: Clean GitHub README markdown properly (remove images, code, links) --- bot.py | 41 ++++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/bot.py b/bot.py index 4aff5ca..46ca311 100644 --- a/bot.py +++ b/bot.py @@ -162,14 +162,15 @@ def get_github_content(url): """Fetch GitHub repo README via API""" logger.debug(f" Fetching GitHub repo info from API") - match = re.search(r'github\.com/([^/]+)/([^/]+)', url) + match = re.search(r'github\.com/([^/]+)/([^/]+?)(/|$)', url) if not match: return None - owner, repo = match.groups() + owner, repo = match.groups()[:2] + repo = repo.rstrip('/') try: - # Try to get README + # Try to get README as raw markdown response = requests.get( f"https://api.github.com/repos/{owner}/{repo}/readme", headers={"Accept": "application/vnd.github.v3.raw"}, @@ -177,11 +178,20 @@ def get_github_content(url): ) if response.status_code == 200: - content = response.text[:2000] - logger.debug(f" Got README: {len(content)} chars") - return content + content = response.text + # Clean markdown: remove images, code blocks, links + content = re.sub(r'!\[.*?\]\(.*?\)', '', content) # Remove images + content = re.sub(r'```.*?```', '', content, flags=re.DOTALL) # Remove code blocks + content = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', content) # Convert markdown links to text + content = re.sub(r'#{1,6}\s+', '', content) # Remove headers + content = re.sub(r'[*_-]{3,}', '', content) # Remove horizontal rules + content = re.sub(r'\s+', ' ', content).strip() # Clean whitespace + + if len(content) > 50: + logger.debug(f" Got README: {len(content)} chars after cleaning") + return content[:2000] - # Fallback: get repo info from API + # Fallback: get repo info JSON response = requests.get( f"https://api.github.com/repos/{owner}/{repo}", timeout=5 @@ -189,10 +199,19 @@ def get_github_content(url): if response.status_code == 200: data = response.json() - desc = data.get("description", "") - if desc: - logger.debug(f" Got repo description: {desc}") - return desc + # Collect useful info + parts = [] + if data.get("description"): + parts.append(data["description"]) + if data.get("topics"): + parts.append(f"Topics: {', '.join(data['topics'][:3])}") + if data.get("language"): + parts.append(f"Language: {data['language']}") + + content = " ".join(parts) + if content: + logger.debug(f" Got repo info: {len(content)} chars") + return content except Exception as e: logger.warning(f" GitHub API error: {e}")