Fix: Clean GitHub README markdown properly (remove images, code, links)
This commit is contained in:
41
bot.py
41
bot.py
@@ -162,14 +162,15 @@ def get_github_content(url):
|
||||
"""Fetch GitHub repo README via API"""
|
||||
logger.debug(f" Fetching GitHub repo info from API")
|
||||
|
||||
match = re.search(r'github\.com/([^/]+)/([^/]+)', url)
|
||||
match = re.search(r'github\.com/([^/]+)/([^/]+?)(/|$)', url)
|
||||
if not match:
|
||||
return None
|
||||
|
||||
owner, repo = match.groups()
|
||||
owner, repo = match.groups()[:2]
|
||||
repo = repo.rstrip('/')
|
||||
|
||||
try:
|
||||
# Try to get README
|
||||
# Try to get README as raw markdown
|
||||
response = requests.get(
|
||||
f"https://api.github.com/repos/{owner}/{repo}/readme",
|
||||
headers={"Accept": "application/vnd.github.v3.raw"},
|
||||
@@ -177,11 +178,20 @@ def get_github_content(url):
|
||||
)
|
||||
|
||||
if response.status_code == 200:
|
||||
content = response.text[:2000]
|
||||
logger.debug(f" Got README: {len(content)} chars")
|
||||
return content
|
||||
content = response.text
|
||||
# Clean markdown: drop images and code blocks, unwrap links to their text, strip header markers and horizontal rules
|
||||
content = re.sub(r'!\[.*?\]\(.*?\)', '', content) # Remove images
|
||||
content = re.sub(r'```.*?```', '', content, flags=re.DOTALL) # Remove code blocks
|
||||
content = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', content) # Convert markdown links to text
|
||||
content = re.sub(r'#{1,6}\s+', '', content) # Remove headers
|
||||
content = re.sub(r'[*_-]{3,}', '', content) # Remove horizontal rules
|
||||
content = re.sub(r'\s+', ' ', content).strip() # Clean whitespace
|
||||
|
||||
# Fallback: get repo info from API
|
||||
if len(content) > 50:
|
||||
logger.debug(f" Got README: {len(content)} chars after cleaning")
|
||||
return content[:2000]
|
||||
|
||||
# Fallback: get repo info JSON
|
||||
response = requests.get(
|
||||
f"https://api.github.com/repos/{owner}/{repo}",
|
||||
timeout=5
|
||||
@@ -189,10 +199,19 @@ def get_github_content(url):
|
||||
|
||||
if response.status_code == 200:
|
||||
data = response.json()
|
||||
desc = data.get("description", "")
|
||||
if desc:
|
||||
logger.debug(f" Got repo description: {desc}")
|
||||
return desc
|
||||
# Collect useful info
|
||||
parts = []
|
||||
if data.get("description"):
|
||||
parts.append(data["description"])
|
||||
if data.get("topics"):
|
||||
parts.append(f"Topics: {', '.join(data['topics'][:3])}")
|
||||
if data.get("language"):
|
||||
parts.append(f"Language: {data['language']}")
|
||||
|
||||
content = " ".join(parts)
|
||||
if content:
|
||||
logger.debug(f" Got repo info: {len(content)} chars")
|
||||
return content
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f" GitHub API error: {e}")
|
||||
|
||||
Reference in New Issue
Block a user