Fix: Clean GitHub README markdown properly (remove images, code, links)
This commit is contained in:
41
bot.py
41
bot.py
@@ -162,14 +162,15 @@ def get_github_content(url):
|
|||||||
"""Fetch GitHub repo README via API"""
|
"""Fetch GitHub repo README via API"""
|
||||||
logger.debug(f" Fetching GitHub repo info from API")
|
logger.debug(f" Fetching GitHub repo info from API")
|
||||||
|
|
||||||
match = re.search(r'github\.com/([^/]+)/([^/]+)', url)
|
match = re.search(r'github\.com/([^/]+)/([^/]+?)(/|$)', url)
|
||||||
if not match:
|
if not match:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
owner, repo = match.groups()
|
owner, repo = match.groups()[:2]
|
||||||
|
repo = repo.rstrip('/')
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Try to get README
|
# Try to get README as raw markdown
|
||||||
response = requests.get(
|
response = requests.get(
|
||||||
f"https://api.github.com/repos/{owner}/{repo}/readme",
|
f"https://api.github.com/repos/{owner}/{repo}/readme",
|
||||||
headers={"Accept": "application/vnd.github.v3.raw"},
|
headers={"Accept": "application/vnd.github.v3.raw"},
|
||||||
@@ -177,11 +178,20 @@ def get_github_content(url):
|
|||||||
)
|
)
|
||||||
|
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
content = response.text[:2000]
|
content = response.text
|
||||||
logger.debug(f" Got README: {len(content)} chars")
|
# Clean markdown: remove images, code blocks, links
|
||||||
return content
|
content = re.sub(r'!\[.*?\]\(.*?\)', '', content) # Remove images
|
||||||
|
content = re.sub(r'```.*?```', '', content, flags=re.DOTALL) # Remove code blocks
|
||||||
|
content = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', content) # Convert markdown links to text
|
||||||
|
content = re.sub(r'#{1,6}\s+', '', content) # Remove headers
|
||||||
|
content = re.sub(r'[*_-]{3,}', '', content) # Remove horizontal rules
|
||||||
|
content = re.sub(r'\s+', ' ', content).strip() # Clean whitespace
|
||||||
|
|
||||||
# Fallback: get repo info from API
|
if len(content) > 50:
|
||||||
|
logger.debug(f" Got README: {len(content)} chars after cleaning")
|
||||||
|
return content[:2000]
|
||||||
|
|
||||||
|
# Fallback: get repo info JSON
|
||||||
response = requests.get(
|
response = requests.get(
|
||||||
f"https://api.github.com/repos/{owner}/{repo}",
|
f"https://api.github.com/repos/{owner}/{repo}",
|
||||||
timeout=5
|
timeout=5
|
||||||
@@ -189,10 +199,19 @@ def get_github_content(url):
|
|||||||
|
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
data = response.json()
|
data = response.json()
|
||||||
desc = data.get("description", "")
|
# Collect useful info
|
||||||
if desc:
|
parts = []
|
||||||
logger.debug(f" Got repo description: {desc}")
|
if data.get("description"):
|
||||||
return desc
|
parts.append(data["description"])
|
||||||
|
if data.get("topics"):
|
||||||
|
parts.append(f"Topics: {', '.join(data['topics'][:3])}")
|
||||||
|
if data.get("language"):
|
||||||
|
parts.append(f"Language: {data['language']}")
|
||||||
|
|
||||||
|
content = " ".join(parts)
|
||||||
|
if content:
|
||||||
|
logger.debug(f" Got repo info: {len(content)} chars")
|
||||||
|
return content
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f" GitHub API error: {e}")
|
logger.warning(f" GitHub API error: {e}")
|
||||||
|
|||||||
Reference in New Issue
Block a user