Simplify: Send raw content directly to Haiku (let it handle parsing)

This commit is contained in:
Remora
2026-02-09 19:32:25 +01:00
parent 76f564ae83
commit e4984d607d

313
bot.py
View File

@@ -1,7 +1,7 @@
#!/usr/bin/env python3
"""
Discord bot for #remora channel - analyzes links in real-time with web_fetch + AI
Posts summaries, adds to Tududi inbox, maintains JSON history + logs
Discord bot for #remora channel - analyzes links in real-time with Haiku
Fetches content, sends to gateway for AI analysis, adds to Tududi inbox
"""
import discord
@@ -88,9 +88,9 @@ def detect_link_type(url):
else:
return "Article"
# Fetch URL content using requests
# Fetch URL content
def fetch_url_content(url):
"""Fetch URL and return title + excerpt"""
"""Fetch URL and return content"""
logger.debug(f" 📥 Fetching: {url}")
try:
@@ -104,229 +104,78 @@ def fetch_url_content(url):
allow_redirects=True
)
response.raise_for_status()
content = response.text[:4000] # First 4k chars
content = response.text[:5000] # First 5k chars
# Try multiple patterns for title
# Try to find title
title = None
# Pattern 1: <title> tag
title_match = re.search(r'<title[^>]*>\s*([^<]+?)\s*</title>', content, re.IGNORECASE)
if title_match:
title = title_match.group(1).strip()
# Pattern 2: og:title meta tag (for GitHub, etc.)
if not title:
og_match = re.search(r'<meta\s+property="og:title"\s+content="([^"]+)"', content, re.IGNORECASE)
if og_match:
title = og_match.group(1).strip()
# Pattern 3: h1 tag (for GitHub README)
if not title:
h1_match = re.search(r'<h1[^>]*>([^<]+)</h1>', content, re.IGNORECASE)
if h1_match:
title = h1_match.group(1).strip()
# Fallback
if not title:
title = url.split('/')[-1] or "Untitled"
# Extract meta description
desc_match = re.search(r'<meta\s+name="description"\s+content="([^"]+)"', content, re.IGNORECASE)
description = desc_match.group(1) if desc_match else ""
# Extract og:description
if not description:
og_desc = re.search(r'<meta\s+property="og:description"\s+content="([^"]+)"', content, re.IGNORECASE)
if og_desc:
description = og_desc.group(1)
logger.debug(f" ✓ Fetched: {title}")
return {
"title": title,
"description": description,
"content": content,
"status": "ok"
}
except requests.Timeout:
logger.warning(f" ⏱️ Timeout: {url}")
return {"title": "Request timeout", "status": "timeout", "content": ""}
except requests.HTTPError as e:
logger.warning(f" ❌ HTTP {e.response.status_code}: {url}")
return {"title": f"HTTP {e.response.status_code}", "status": "http_error", "content": ""}
except Exception as e:
logger.error(f" ❌ Error: {e}")
return {"title": "Fetch failed", "status": "error", "error": str(e), "content": ""}
return {"title": "Fetch failed", "status": "error", "content": ""}
# Get GitHub repo info from API
def get_github_content(url):
    """Fetch GitHub repo README via API.

    Returns the cleaned README text (at most 2000 chars), or a short info
    string built from the repo's description/topics/language when no usable
    README is found, or None when the URL is not a repo link or the API
    calls fail.
    """
    logger.debug(f"   Fetching GitHub repo info from API")
    # Pull "owner/repo" out of the URL; anything that doesn't match is not a
    # repository link, so there is nothing to fetch.
    match = re.search(r'github\.com/([^/]+)/([^/]+?)(/|$)', url)
    if not match:
        return None
    owner, repo = match.groups()[:2]
    repo = repo.rstrip('/')
    try:
        # Try to get README as raw markdown
        response = requests.get(
            f"https://api.github.com/repos/{owner}/{repo}/readme",
            headers={"Accept": "application/vnd.github.v3.raw"},
            timeout=5
        )
        if response.status_code == 200:
            content = response.text
            # Clean markdown: remove images, code blocks, links
            content = re.sub(r'!\[.*?\]\(.*?\)', '', content)  # Remove images
            content = re.sub(r'```.*?```', '', content, flags=re.DOTALL)  # Remove code blocks
            content = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', content)  # Convert markdown links to text
            content = re.sub(r'#{1,6}\s+', '', content)  # Remove headers
            content = re.sub(r'[*_-]{3,}', '', content)  # Remove horizontal rules
            content = re.sub(r'\s+', ' ', content).strip()  # Clean whitespace
            # Only use the README when enough text survived the cleaning;
            # otherwise fall through to the repo-metadata fallback below.
            if len(content) > 50:
                logger.debug(f"   Got README: {len(content)} chars after cleaning")
                return content[:2000]
        # Fallback: get repo info JSON
        response = requests.get(
            f"https://api.github.com/repos/{owner}/{repo}",
            timeout=5
        )
        if response.status_code == 200:
            data = response.json()
            # Collect useful info
            parts = []
            if data.get("description"):
                parts.append(data["description"])
            if data.get("topics"):
                parts.append(f"Topics: {', '.join(data['topics'][:3])}")
            if data.get("language"):
                parts.append(f"Language: {data['language']}")
            content = " ".join(parts)
            if content:
                logger.debug(f"   Got repo info: {len(content)} chars")
                return content
    except Exception as e:
        # Best-effort: a failed API call just means the caller falls back to
        # generic HTML extraction, so log and return None instead of raising.
        logger.warning(f"   GitHub API error: {e}")
    return None
# Extract clean text from HTML
def extract_text_from_html(html):
    """Extract readable text from HTML.

    Strips scripts/styles/boilerplate, prefers the main content region when
    one can be located, and returns up to 2000 chars of plain text — or None
    when fewer than 100 chars survive (usually a JS-rendered page).
    """
    # Remove DOCTYPE, comments
    text = re.sub(r'<!DOCTYPE[^>]*>', '', html, flags=re.IGNORECASE)
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)
    # Remove scripts, styles, noscript
    text = re.sub(r'<script[^>]*>.*?</script>', ' ', text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r'<style[^>]*>.*?</style>', ' ', text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r'<noscript[^>]*>.*?</noscript>', ' ', text, flags=re.DOTALL | re.IGNORECASE)
    # Try to extract main content areas first.
    # BUG FIX: the old single pattern captured the div's attributes inside
    # group 1, so the back-reference built an impossible closing tag such as
    # </div class="content"> and the div branch could never match. Capture
    # only the tag name for main/article, and match content-class divs with
    # a dedicated pattern whose closing tag is a literal </div>.
    main_match = re.search(r'<(main|article)\b[^>]*>.*?</\1>', text,
                           flags=re.DOTALL | re.IGNORECASE)
    if not main_match:
        main_match = re.search(r'<div\b[^>]*class="[^"]*content[^"]*"[^>]*>.*?</div>', text,
                               flags=re.DOTALL | re.IGNORECASE)
    if main_match:
        text = main_match.group(0)
    # Remove common nav/footer/sidebar patterns
    text = re.sub(r'<(nav|footer|aside|header)[^>]*>.*?</\1>', ' ', text, flags=re.DOTALL | re.IGNORECASE)
    # Turn block-level openers into newlines, then drop every remaining tag
    text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
    text = re.sub(r'<p[^>]*>', '\n', text, flags=re.IGNORECASE)
    text = re.sub(r'<h[1-6][^>]*>', '\n', text, flags=re.IGNORECASE)
    text = re.sub(r'</?[^>]+>', ' ', text)
    # Decode the handful of entities we care about (&amp; last so a literal
    # "&amp;lt;" decodes to "&lt;" and not "<")
    text = text.replace('&nbsp;', ' ')
    text = text.replace('&lt;', '<')
    text = text.replace('&gt;', '>')
    text = text.replace('&amp;', '&')
    # Clean up whitespace
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    # Too little text usually means extraction failed
    if len(text) < 100:
        return None
    return text[:2000]  # First 2000 chars of clean text
# Analyze content with Haiku via gateway
# Analyze with Haiku via gateway
def analyze_content(url, title, content, link_type):
"""Analyze content with AI to create intelligent summary"""
logger.debug(f" 🤖 Analyzing content: {url}")
"""Send raw content to Haiku for analysis"""
logger.debug(f" 🤖 Analyzing with Haiku: {url}")
try:
# Special handling for GitHub
clean_text = None
if link_type == "GitHub":
clean_text = get_github_content(url)
# Simple prompt - send raw content
prompt = f"""Analyze this webpage content quickly.
# Fallback: extract from HTML
if not clean_text:
clean_text = extract_text_from_html(content)
if not clean_text:
logger.warning(f" Could not extract content")
return {
"summary": f"GitHub project: {title}",
"tag": "project",
"relevance": "relevant"
}
logger.debug(f" Extracted {len(clean_text)} chars of content")
# Build analysis prompt
prompt = f"""Analyze this webpage and create a brief summary for Laurent.
**Title**: {title}
**URL**: {url}
**Link Type**: {link_type}
**Title**: {title}
**Content** (first 1500 chars):
{clean_text[:1500]}
**RAW PAGE CONTENT** (first 3000 chars):
{content[:3000]}
---
Create a 2-3 sentence summary that answers:
1. What is this page about?
2. Why would Laurent find it useful?
Write a 2-3 sentence summary explaining:
1. What is this about?
2. Why would this be useful?
Keep it practical and concise. Do NOT include the URL or title in the summary.
"""
Be concise. Skip marketing. No URLs or titles in summary."""
# Call gateway with a simple POST
logger.debug(f" Sending to gateway for analysis...")
logger.debug(f" Sending to Haiku...")
response = requests.post(
"http://127.0.0.1:18789/sessions/turn",
f"{GATEWAY_URL}/sessions/turn",
json={
"message": prompt,
"session": "main"
},
timeout=15,
timeout=20,
headers={"Authorization": f"Bearer {GATEWAY_TOKEN}"} if GATEWAY_TOKEN else {}
)
if response.status_code == 200:
result = response.json()
# Extract the summary from response
summary = result.get("message", "") or result.get("content", "")
if isinstance(summary, list):
summary = summary[0].get("text", "") if summary else ""
summary = summary.strip()[:300]
summary = result.get("message", "")
if isinstance(summary, list) and summary:
summary = summary[0].get("text", "") if isinstance(summary[0], dict) else summary[0]
summary = str(summary).strip()[:300]
logger.info(f" ✓ Got summary from gateway: {summary[:60]}")
logger.info(f" ✓ Got analysis: {summary[:50]}")
# Determine tag from link type
tag = "to-read"
if link_type == "GitHub":
tag = "project"
@@ -341,61 +190,19 @@ Keep it practical and concise. Do NOT include the URL or title in the summary.
return {
"summary": summary,
"tag": tag,
"relevance": "relevant"
"tag": tag
}
else:
logger.warning(f" Gateway error {response.status_code}, falling back to heuristic")
# Fallback: use simple heuristic
return {
"summary": extract_simple_summary(clean_text, title, link_type),
"tag": get_tag_from_type(link_type),
"relevance": "relevant"
}
logger.warning(f" Gateway error: {response.status_code}")
return None
except requests.Timeout:
logger.warning(f" Gateway timeout, using fallback")
return {
"summary": extract_simple_summary(content, title, link_type),
"tag": get_tag_from_type(link_type),
"relevance": "relevant"
}
except Exception as e:
logger.error(f" Analysis error: {e}")
import traceback
logger.error(traceback.format_exc())
return {
"summary": title,
"tag": "interesting",
"relevance": "relevant"
}
def extract_simple_summary(text, title, link_type):
    """Fallback: extract a simple summary from text"""
    # Walk sentence-sized chunks and return the first one whose length is
    # plausible for a summary; fall back to the title when none qualifies.
    chunks = (chunk.strip() for chunk in re.split(r'[.!?]', text))
    for chunk in chunks:
        if 20 < len(chunk) < 300:
            return chunk[:200]
    return title
def get_tag_from_type(link_type):
    """Get tag based on link type"""
    # Grouped by tag rather than a flat mapping; unknown types default
    # to the generic "to-read" tag.
    for tag, types in (
        ("project", ("GitHub",)),
        ("video", ("YouTube",)),
        ("discussion", ("Reddit", "Twitter/X")),
        ("article", ("Medium", "Dev.to")),
        ("learning", ("arXiv",)),
    ):
        if link_type in types:
            return tag
    return "to-read"
return None
# Send to Tududi inbox
def add_to_tududi(title, url, link_type, summary="", tag=""):
"""Add to Tududi inbox with intelligent summary"""
"""Add to Tududi inbox"""
logger.debug(f" 📌 Adding to Tududi: {title}")
try:
@@ -403,14 +210,11 @@ def add_to_tududi(title, url, link_type, summary="", tag=""):
logger.warning(" TUDUDI_API_KEY not set")
return False
# Format the inbox content
content = f"📌 **{link_type}**: {title}\n🔗 {url}"
if summary:
content += f"\n\n💡 **Summary**:\n{summary}"
content += f"\n\n💡 {summary}"
if tag:
content += f"\n\n🏷️ **Tag**: {tag}"
content += f"\n\n🏷️ {tag}"
response = requests.post(
f"{TUDUDI_API_URL}/inbox",
@@ -422,8 +226,8 @@ def add_to_tududi(title, url, link_type, summary="", tag=""):
timeout=5
)
if response.status_code in [200, 201]: # 200 or 201 are both OK
logger.info(f" ✓ Added to Tududi inbox with tag: {tag}")
if response.status_code in [200, 201]:
logger.info(f" ✓ Added to Tududi")
return True
else:
logger.warning(f" Tududi error: {response.status_code}")
@@ -474,38 +278,31 @@ class LinkAnalyzerBot(discord.Client):
fetch_result = fetch_url_content(url)
title = fetch_result["title"]
# Analyze content if fetch was successful
# Analyze with Haiku
analysis_data = None
logger.debug(f" 📊 Fetch status: {fetch_result['status']}")
if fetch_result["status"] == "ok":
logger.debug(f" 🔍 Starting analysis...")
analysis_data = analyze_content(url, title, fetch_result.get("content", ""), link_type)
logger.debug(f" Analysis result: {analysis_data}")
else:
logger.debug(f" ⚠️ Fetch failed, skipping analysis")
logger.debug(f" Analyzing...")
analysis_data = analyze_content(url, title, fetch_result["content"], link_type)
# Prepare summary for Tududi
# Prepare summary
summary_text = ""
tag = "interesting"
if analysis_data:
summary_text = analysis_data.get("summary", "")
tag = analysis_data.get("tag", "interesting")
logger.debug(f" ✓ Got summary: {summary_text[:80]}")
else:
logger.warning(f" ❌ No analysis data returned")
logger.debug(f" Summary: {summary_text[:60]}")
# Add to Tududi with summary
tududi_ok = add_to_tududi(title, url, link_type, summary_text, tag)
# Add to Tududi
add_to_tududi(title, url, link_type, summary_text, tag)
# Format response for Discord
# Format response
response_text = f"📌 **{link_type}**: {title}"
if summary_text:
response_text += f"\n\n💡 {summary_text}"
if tag:
response_text += f"\n\n🏷️ Tag: `{tag}`"
response_text += f"\n\n🏷️ `{tag}`"
logger.debug(f"Posting response: {response_text}")
logger.debug(f"Posting response...")
# Post in channel
await message.reply(response_text, mention_author=False)
@@ -518,21 +315,25 @@ class LinkAnalyzerBot(discord.Client):
"author": str(message.author),
"message_id": message.id,
"date": datetime.now().isoformat(),
"analysis": analysis_data,
"tududi": tududi_ok,
"fetch_status": fetch_result["status"]
"summary": summary_text,
"tag": tag
})
logger.info(f"✓ Processed: {url}")
except Exception as e:
logger.error(f"❌ Error processing {url}: {e}")
await message.reply(f"❌ Error analyzing link: {e}", mention_author=False)
logger.error(f"❌ Error: {e}")
import traceback
logger.error(traceback.format_exc())
try:
await message.reply(f"❌ Error: {str(e)[:100]}", mention_author=False)
except:
pass
# Update processed IDs
# Update tracker
tracker["processed_message_ids"].append(message.id)
save_tracker(tracker)
logger.info(f"Updated tracker, total links: {len(tracker['links'])}")
logger.info(f"Updated tracker: {len(tracker['links'])} links total")
# Main
if __name__ == "__main__":