Simplify: Send raw content directly to Haiku (let it handle parsing)

This commit is contained in:
Remora
2026-02-09 19:32:25 +01:00
parent 76f564ae83
commit e4984d607d

313
bot.py
View File

@@ -1,7 +1,7 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
Discord bot for #remora channel - analyzes links in real-time with web_fetch + AI Discord bot for #remora channel - analyzes links in real-time with Haiku
Posts summaries, adds to Tududi inbox, maintains JSON history + logs Fetches content, sends to gateway for AI analysis, adds to Tududi inbox
""" """
import discord import discord
@@ -88,9 +88,9 @@ def detect_link_type(url):
else: else:
return "Article" return "Article"
# Fetch URL content using requests # Fetch URL content
def fetch_url_content(url): def fetch_url_content(url):
"""Fetch URL and return title + excerpt""" """Fetch URL and return content"""
logger.debug(f" 📥 Fetching: {url}") logger.debug(f" 📥 Fetching: {url}")
try: try:
@@ -104,229 +104,78 @@ def fetch_url_content(url):
allow_redirects=True allow_redirects=True
) )
response.raise_for_status() response.raise_for_status()
content = response.text[:4000] # First 4k chars content = response.text[:5000] # First 5k chars
# Try multiple patterns for title # Try to find title
title = None title = None
# Pattern 1: <title> tag
title_match = re.search(r'<title[^>]*>\s*([^<]+?)\s*</title>', content, re.IGNORECASE) title_match = re.search(r'<title[^>]*>\s*([^<]+?)\s*</title>', content, re.IGNORECASE)
if title_match: if title_match:
title = title_match.group(1).strip() title = title_match.group(1).strip()
# Pattern 2: og:title meta tag (for GitHub, etc.)
if not title: if not title:
og_match = re.search(r'<meta\s+property="og:title"\s+content="([^"]+)"', content, re.IGNORECASE) og_match = re.search(r'<meta\s+property="og:title"\s+content="([^"]+)"', content, re.IGNORECASE)
if og_match: if og_match:
title = og_match.group(1).strip() title = og_match.group(1).strip()
# Pattern 3: h1 tag (for GitHub README)
if not title:
h1_match = re.search(r'<h1[^>]*>([^<]+)</h1>', content, re.IGNORECASE)
if h1_match:
title = h1_match.group(1).strip()
# Fallback
if not title: if not title:
title = url.split('/')[-1] or "Untitled" title = url.split('/')[-1] or "Untitled"
# Extract meta description
desc_match = re.search(r'<meta\s+name="description"\s+content="([^"]+)"', content, re.IGNORECASE)
description = desc_match.group(1) if desc_match else ""
# Extract og:description
if not description:
og_desc = re.search(r'<meta\s+property="og:description"\s+content="([^"]+)"', content, re.IGNORECASE)
if og_desc:
description = og_desc.group(1)
logger.debug(f" ✓ Fetched: {title}") logger.debug(f" ✓ Fetched: {title}")
return { return {
"title": title, "title": title,
"description": description,
"content": content, "content": content,
"status": "ok" "status": "ok"
} }
except requests.Timeout: except requests.Timeout:
logger.warning(f" ⏱️ Timeout: {url}") logger.warning(f" ⏱️ Timeout: {url}")
return {"title": "Request timeout", "status": "timeout", "content": ""} return {"title": "Request timeout", "status": "timeout", "content": ""}
except requests.HTTPError as e:
logger.warning(f" ❌ HTTP {e.response.status_code}: {url}")
return {"title": f"HTTP {e.response.status_code}", "status": "http_error", "content": ""}
except Exception as e: except Exception as e:
logger.error(f" ❌ Error: {e}") logger.error(f" ❌ Error: {e}")
return {"title": "Fetch failed", "status": "error", "error": str(e), "content": ""} return {"title": "Fetch failed", "status": "error", "content": ""}
# Get GitHub repo info from API # Analyze with Haiku via gateway
def get_github_content(url):
    """Fetch a short plain-text description of a GitHub repository.

    The README is tried first: it is requested as raw markdown and stripped of
    images, fenced code blocks, link targets, heading markers and horizontal
    rules. If that yields nothing usable, the repository metadata
    (description, up to three topics, language) is used instead.

    Args:
        url: A URL containing ``github.com/<owner>/<repo>``. Anything after the
            repository name (sub-path, ``?query``, ``#fragment``) and a
            trailing ``.git`` are ignored.

    Returns:
        Up to 2000 characters of cleaned README text, or the joined repository
        metadata, or ``None`` when the URL has no owner/repo part, nothing
        useful came back, or a request raised.
    """
    logger.debug(f" Fetching GitHub repo info from API")

    # Stop the owner/repo segments at '/', '?' or '#' so share links such as
    # ".../repo?tab=readme-ov-file" or ".../repo#readme" resolve to "repo".
    match = re.search(r'github\.com/([^/?#]+)/([^/?#]+)', url)
    if not match:
        return None

    owner, repo = match.groups()
    if repo.endswith('.git'):
        # Clone URLs carry a ".git" suffix the REST API does not accept.
        repo = repo[:-4]
    if not repo:
        return None

    try:
        # Try to get README as raw markdown
        response = requests.get(
            f"https://api.github.com/repos/{owner}/{repo}/readme",
            headers={"Accept": "application/vnd.github.v3.raw"},
            timeout=5
        )

        if response.status_code == 200:
            content = response.text
            # Clean markdown: remove images, code blocks, links
            content = re.sub(r'!\[.*?\]\(.*?\)', '', content)  # Remove images
            content = re.sub(r'```.*?```', '', content, flags=re.DOTALL)  # Remove code blocks
            content = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', content)  # Convert markdown links to text
            # Only strip '#' runs that start a line, so prose like "C# and F#"
            # is left intact.
            content = re.sub(r'^[ \t]*#{1,6}[ \t]+', '', content, flags=re.MULTILINE)
            content = re.sub(r'[*_-]{3,}', '', content)  # Remove horizontal rules
            content = re.sub(r'\s+', ' ', content).strip()  # Clean whitespace

            # A README this short carries no real information; fall through
            # to the repository metadata instead.
            if len(content) > 50:
                logger.debug(f" Got README: {len(content)} chars after cleaning")
                return content[:2000]

        # Fallback: get repo info JSON
        response = requests.get(
            f"https://api.github.com/repos/{owner}/{repo}",
            timeout=5
        )

        if response.status_code == 200:
            data = response.json()

            # Collect useful info
            parts = []
            if data.get("description"):
                parts.append(data["description"])
            if data.get("topics"):
                parts.append(f"Topics: {', '.join(data['topics'][:3])}")
            if data.get("language"):
                parts.append(f"Language: {data['language']}")

            content = " ".join(parts)
            if content:
                logger.debug(f" Got repo info: {len(content)} chars")
                return content
    except Exception as e:
        # Best effort: any network/JSON problem just means "no GitHub content".
        logger.warning(f" GitHub API error: {e}")

    return None
# Extract clean text from HTML
def extract_text_from_html(html):
    """Extract readable text from an HTML document using regex heuristics.

    Steps: drop the DOCTYPE, comments and script/style/noscript elements;
    narrow to the first ``<main>``/``<article>`` element (or, failing that,
    a ``<div>`` whose class contains "content"); drop nav/footer/aside/header
    elements; turn the remaining tags into whitespace; decode HTML entities;
    collapse whitespace.

    Args:
        html: Raw HTML markup as a string.

    Returns:
        Up to 2000 characters of cleaned text, or ``None`` when fewer than
        100 characters of text remain.
    """
    # Imported locally because the parameter name shadows the ``html`` module.
    from html import unescape

    flags = re.DOTALL | re.IGNORECASE

    # Remove DOCTYPE, comments
    text = re.sub(r'<!DOCTYPE[^>]*>', '', html, flags=re.IGNORECASE)
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)

    # Remove scripts, styles, noscript
    text = re.sub(r'<(script|style|noscript)\b[^>]*>.*?</\1\s*>', ' ', text, flags=flags)

    # Try to extract main content areas first. Only the tag NAME is captured
    # for the closing-tag backreference, so elements with attributes
    # (e.g. <main class="x">) are recognised.
    main_match = re.search(r'<(main|article)\b[^>]*>.*?</\1\s*>', text, flags=flags)
    if main_match is None:
        # Nested <div>s cannot be balanced with a regex; run to the LAST
        # closing </div> so the content block is never cut short.
        main_match = re.search(
            r'<div\b[^>]*\bclass="[^"]*content[^"]*"[^>]*>.*</div\s*>',
            text,
            flags=flags,
        )
    if main_match:
        text = main_match.group(0)

    # Remove common nav/footer/sidebar patterns
    text = re.sub(r'<(nav|footer|aside|header)\b[^>]*>.*?</\1>', ' ', text, flags=flags)

    # Remove remaining HTML tags but keep text
    text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
    text = re.sub(r'<p[^>]*>', '\n', text, flags=re.IGNORECASE)
    text = re.sub(r'<h[1-6][^>]*>', '\n', text, flags=re.IGNORECASE)
    text = re.sub(r'</?[^>]+>', ' ', text)

    # Decode every HTML entity (named and numeric), not just a handful.
    # Done after tag stripping so decoded '<' / '>' are kept as literal text.
    text = unescape(text)

    # Clean up whitespace (\s also covers the non-breaking space that
    # &nbsp; decodes to)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()

    # Return first meaningful chunk
    if len(text) < 100:
        return None

    return text[:2000]  # First 2000 chars of clean text
# Analyze content with Haiku via gateway
def analyze_content(url, title, content, link_type): def analyze_content(url, title, content, link_type):
"""Analyze content with AI to create intelligent summary""" """Send raw content to Haiku for analysis"""
logger.debug(f" 🤖 Analyzing content: {url}") logger.debug(f" 🤖 Analyzing with Haiku: {url}")
try: try:
# Special handling for GitHub # Simple prompt - send raw content
clean_text = None prompt = f"""Analyze this webpage content quickly.
if link_type == "GitHub":
clean_text = get_github_content(url)
# Fallback: extract from HTML
if not clean_text:
clean_text = extract_text_from_html(content)
if not clean_text:
logger.warning(f" Could not extract content")
return {
"summary": f"GitHub project: {title}",
"tag": "project",
"relevance": "relevant"
}
logger.debug(f" Extracted {len(clean_text)} chars of content")
# Build analysis prompt
prompt = f"""Analyze this webpage and create a brief summary for Laurent.
**Title**: {title}
**URL**: {url} **URL**: {url}
**Link Type**: {link_type} **Title**: {title}
**Content** (first 1500 chars): **RAW PAGE CONTENT** (first 3000 chars):
{clean_text[:1500]} {content[:3000]}
--- ---
Create a 2-3 sentence summary that answers: Write a 2-3 sentence summary explaining:
1. What is this page about? 1. What is this about?
2. Why would Laurent find it useful? 2. Why would this be useful?
Keep it practical and concise. Do NOT include the URL or title in the summary. Be concise. Skip marketing. No URLs or titles in summary."""
"""
# Call gateway with a simple POST logger.debug(f" Sending to Haiku...")
logger.debug(f" Sending to gateway for analysis...")
response = requests.post( response = requests.post(
"http://127.0.0.1:18789/sessions/turn", f"{GATEWAY_URL}/sessions/turn",
json={ json={
"message": prompt, "message": prompt,
"session": "main" "session": "main"
}, },
timeout=15, timeout=20,
headers={"Authorization": f"Bearer {GATEWAY_TOKEN}"} if GATEWAY_TOKEN else {} headers={"Authorization": f"Bearer {GATEWAY_TOKEN}"} if GATEWAY_TOKEN else {}
) )
if response.status_code == 200: if response.status_code == 200:
result = response.json() result = response.json()
# Extract the summary from response summary = result.get("message", "")
summary = result.get("message", "") or result.get("content", "") if isinstance(summary, list) and summary:
if isinstance(summary, list): summary = summary[0].get("text", "") if isinstance(summary[0], dict) else summary[0]
summary = summary[0].get("text", "") if summary else "" summary = str(summary).strip()[:300]
summary = summary.strip()[:300]
logger.info(f" ✓ Got summary from gateway: {summary[:60]}") logger.info(f" ✓ Got analysis: {summary[:50]}")
# Determine tag from link type
tag = "to-read" tag = "to-read"
if link_type == "GitHub": if link_type == "GitHub":
tag = "project" tag = "project"
@@ -341,61 +190,19 @@ Keep it practical and concise. Do NOT include the URL or title in the summary.
return { return {
"summary": summary, "summary": summary,
"tag": tag, "tag": tag
"relevance": "relevant"
} }
else: else:
logger.warning(f" Gateway error {response.status_code}, falling back to heuristic") logger.warning(f" Gateway error: {response.status_code}")
# Fallback: use simple heuristic return None
return {
"summary": extract_simple_summary(clean_text, title, link_type),
"tag": get_tag_from_type(link_type),
"relevance": "relevant"
}
except requests.Timeout:
logger.warning(f" Gateway timeout, using fallback")
return {
"summary": extract_simple_summary(content, title, link_type),
"tag": get_tag_from_type(link_type),
"relevance": "relevant"
}
except Exception as e: except Exception as e:
logger.error(f" Analysis error: {e}") logger.error(f" Analysis error: {e}")
import traceback return None
logger.error(traceback.format_exc())
return {
"summary": title,
"tag": "interesting",
"relevance": "relevant"
}
def extract_simple_summary(text, title, link_type):
    """Fallback summary: return the first reasonably sized sentence of *text*.

    Args:
        text: Text to summarise; may be empty or ``None``.
        title: Returned when no suitable sentence is found.
        link_type: Unused; kept so existing callers keep working.

    Returns:
        The first fragment (split on ``.``, ``!`` or ``?``) whose stripped
        length is between 21 and 299 characters, cut to 200 characters, or
        *title* when there is no such fragment.
    """
    # re.split raises TypeError on None; treat missing text as "no sentence".
    if not text:
        return title

    for fragment in re.split(r'[.!?]', text):
        fragment = fragment.strip()
        if 20 < len(fragment) < 300:
            return fragment[:200]

    return title
def get_tag_from_type(link_type):
    """Return the tag for a detected link type.

    Link types without a dedicated tag are filed under "to-read".
    """
    # Grouped by tag, then flattened into a type -> tag lookup.
    types_by_tag = {
        "project": ("GitHub",),
        "video": ("YouTube",),
        "discussion": ("Reddit", "Twitter/X"),
        "article": ("Medium", "Dev.to"),
        "learning": ("arXiv",),
    }
    tag_for_type = {
        kind: tag
        for tag, kinds in types_by_tag.items()
        for kind in kinds
    }
    try:
        return tag_for_type[link_type]
    except KeyError:
        return "to-read"
# Send to Tududi inbox # Send to Tududi inbox
def add_to_tududi(title, url, link_type, summary="", tag=""): def add_to_tududi(title, url, link_type, summary="", tag=""):
"""Add to Tududi inbox with intelligent summary""" """Add to Tududi inbox"""
logger.debug(f" 📌 Adding to Tududi: {title}") logger.debug(f" 📌 Adding to Tududi: {title}")
try: try:
@@ -403,14 +210,11 @@ def add_to_tududi(title, url, link_type, summary="", tag=""):
logger.warning(" TUDUDI_API_KEY not set") logger.warning(" TUDUDI_API_KEY not set")
return False return False
# Format the inbox content
content = f"📌 **{link_type}**: {title}\n🔗 {url}" content = f"📌 **{link_type}**: {title}\n🔗 {url}"
if summary: if summary:
content += f"\n\n💡 **Summary**:\n{summary}" content += f"\n\n💡 {summary}"
if tag: if tag:
content += f"\n\n🏷️ **Tag**: {tag}" content += f"\n\n🏷️ {tag}"
response = requests.post( response = requests.post(
f"{TUDUDI_API_URL}/inbox", f"{TUDUDI_API_URL}/inbox",
@@ -422,8 +226,8 @@ def add_to_tududi(title, url, link_type, summary="", tag=""):
timeout=5 timeout=5
) )
if response.status_code in [200, 201]: # 200 or 201 are both OK if response.status_code in [200, 201]:
logger.info(f" ✓ Added to Tududi inbox with tag: {tag}") logger.info(f" ✓ Added to Tududi")
return True return True
else: else:
logger.warning(f" Tududi error: {response.status_code}") logger.warning(f" Tududi error: {response.status_code}")
@@ -474,38 +278,31 @@ class LinkAnalyzerBot(discord.Client):
fetch_result = fetch_url_content(url) fetch_result = fetch_url_content(url)
title = fetch_result["title"] title = fetch_result["title"]
# Analyze content if fetch was successful # Analyze with Haiku
analysis_data = None analysis_data = None
logger.debug(f" 📊 Fetch status: {fetch_result['status']}")
if fetch_result["status"] == "ok": if fetch_result["status"] == "ok":
logger.debug(f" 🔍 Starting analysis...") logger.debug(f" Analyzing...")
analysis_data = analyze_content(url, title, fetch_result.get("content", ""), link_type) analysis_data = analyze_content(url, title, fetch_result["content"], link_type)
logger.debug(f" Analysis result: {analysis_data}")
else:
logger.debug(f" ⚠️ Fetch failed, skipping analysis")
# Prepare summary for Tududi # Prepare summary
summary_text = "" summary_text = ""
tag = "interesting" tag = "interesting"
if analysis_data: if analysis_data:
summary_text = analysis_data.get("summary", "") summary_text = analysis_data.get("summary", "")
tag = analysis_data.get("tag", "interesting") tag = analysis_data.get("tag", "interesting")
logger.debug(f" ✓ Got summary: {summary_text[:80]}") logger.debug(f" Summary: {summary_text[:60]}")
else:
logger.warning(f" ❌ No analysis data returned")
# Add to Tududi with summary # Add to Tududi
tududi_ok = add_to_tududi(title, url, link_type, summary_text, tag) add_to_tududi(title, url, link_type, summary_text, tag)
# Format response for Discord # Format response
response_text = f"📌 **{link_type}**: {title}" response_text = f"📌 **{link_type}**: {title}"
if summary_text: if summary_text:
response_text += f"\n\n💡 {summary_text}" response_text += f"\n\n💡 {summary_text}"
if tag: if tag:
response_text += f"\n\n🏷️ Tag: `{tag}`" response_text += f"\n\n🏷️ `{tag}`"
logger.debug(f"Posting response: {response_text}") logger.debug(f"Posting response...")
# Post in channel # Post in channel
await message.reply(response_text, mention_author=False) await message.reply(response_text, mention_author=False)
@@ -518,21 +315,25 @@ class LinkAnalyzerBot(discord.Client):
"author": str(message.author), "author": str(message.author),
"message_id": message.id, "message_id": message.id,
"date": datetime.now().isoformat(), "date": datetime.now().isoformat(),
"analysis": analysis_data, "summary": summary_text,
"tududi": tududi_ok, "tag": tag
"fetch_status": fetch_result["status"]
}) })
logger.info(f"✓ Processed: {url}") logger.info(f"✓ Processed: {url}")
except Exception as e: except Exception as e:
logger.error(f"❌ Error processing {url}: {e}") logger.error(f"❌ Error: {e}")
await message.reply(f"❌ Error analyzing link: {e}", mention_author=False) import traceback
logger.error(traceback.format_exc())
try:
await message.reply(f"❌ Error: {str(e)[:100]}", mention_author=False)
except:
pass
# Update processed IDs # Update tracker
tracker["processed_message_ids"].append(message.id) tracker["processed_message_ids"].append(message.id)
save_tracker(tracker) save_tracker(tracker)
logger.info(f"Updated tracker, total links: {len(tracker['links'])}") logger.info(f"Updated tracker: {len(tracker['links'])} links total")
# Main # Main
if __name__ == "__main__": if __name__ == "__main__":