Simplify: Send raw content directly to Haiku (let it handle parsing)
This commit is contained in:
313
bot.py
313
bot.py
@@ -1,7 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Discord bot for #remora channel - analyzes links in real-time with web_fetch + AI
|
||||
Posts summaries, adds to Tududi inbox, maintains JSON history + logs
|
||||
Discord bot for #remora channel - analyzes links in real-time with Haiku
|
||||
Fetches content, sends to gateway for AI analysis, adds to Tududi inbox
|
||||
"""
|
||||
|
||||
import discord
|
||||
@@ -88,9 +88,9 @@ def detect_link_type(url):
|
||||
else:
|
||||
return "Article"
|
||||
|
||||
# Fetch URL content using requests
|
||||
# Fetch URL content
|
||||
def fetch_url_content(url):
|
||||
"""Fetch URL and return title + excerpt"""
|
||||
"""Fetch URL and return content"""
|
||||
logger.debug(f" 📥 Fetching: {url}")
|
||||
|
||||
try:
|
||||
@@ -104,229 +104,78 @@ def fetch_url_content(url):
|
||||
allow_redirects=True
|
||||
)
|
||||
response.raise_for_status()
|
||||
content = response.text[:4000] # First 4k chars
|
||||
content = response.text[:5000] # First 5k chars
|
||||
|
||||
# Try multiple patterns for title
|
||||
# Try to find title
|
||||
title = None
|
||||
|
||||
# Pattern 1: <title> tag
|
||||
title_match = re.search(r'<title[^>]*>\s*([^<]+?)\s*</title>', content, re.IGNORECASE)
|
||||
if title_match:
|
||||
title = title_match.group(1).strip()
|
||||
|
||||
# Pattern 2: og:title meta tag (for GitHub, etc.)
|
||||
if not title:
|
||||
og_match = re.search(r'<meta\s+property="og:title"\s+content="([^"]+)"', content, re.IGNORECASE)
|
||||
if og_match:
|
||||
title = og_match.group(1).strip()
|
||||
|
||||
# Pattern 3: h1 tag (for GitHub README)
|
||||
if not title:
|
||||
h1_match = re.search(r'<h1[^>]*>([^<]+)</h1>', content, re.IGNORECASE)
|
||||
if h1_match:
|
||||
title = h1_match.group(1).strip()
|
||||
|
||||
# Fallback
|
||||
if not title:
|
||||
title = url.split('/')[-1] or "Untitled"
|
||||
|
||||
# Extract meta description
|
||||
desc_match = re.search(r'<meta\s+name="description"\s+content="([^"]+)"', content, re.IGNORECASE)
|
||||
description = desc_match.group(1) if desc_match else ""
|
||||
|
||||
# Extract og:description
|
||||
if not description:
|
||||
og_desc = re.search(r'<meta\s+property="og:description"\s+content="([^"]+)"', content, re.IGNORECASE)
|
||||
if og_desc:
|
||||
description = og_desc.group(1)
|
||||
|
||||
logger.debug(f" ✓ Fetched: {title}")
|
||||
return {
|
||||
"title": title,
|
||||
"description": description,
|
||||
"content": content,
|
||||
"status": "ok"
|
||||
}
|
||||
except requests.Timeout:
|
||||
logger.warning(f" ⏱️ Timeout: {url}")
|
||||
return {"title": "Request timeout", "status": "timeout", "content": ""}
|
||||
except requests.HTTPError as e:
|
||||
logger.warning(f" ❌ HTTP {e.response.status_code}: {url}")
|
||||
return {"title": f"HTTP {e.response.status_code}", "status": "http_error", "content": ""}
|
||||
except Exception as e:
|
||||
logger.error(f" ❌ Error: {e}")
|
||||
return {"title": "Fetch failed", "status": "error", "error": str(e), "content": ""}
|
||||
return {"title": "Fetch failed", "status": "error", "content": ""}
|
||||
|
||||
# Get GitHub repo info from API
|
||||
def get_github_content(url):
    """Fetch a plain-text description of a GitHub repository via the REST API.

    The README is tried first: it is requested as raw markdown and stripped
    of images, fenced code blocks, link targets, header markers and
    horizontal rules. If the README is unavailable or too short (50 chars or
    fewer after cleaning), the repository metadata endpoint is queried and
    its description, up to three topics and primary language are joined
    into one string.

    Args:
        url: A URL containing ``github.com/<owner>/<repo>``. Query strings,
            fragments, extra path segments and a trailing ``.git`` are
            ignored.

    Returns:
        Up to 2000 characters of cleaned README text, or the combined
        metadata string, or ``None`` when the URL does not name a
        repository, neither endpoint yields usable text, or a request fails.
    """
    if not url:
        return None

    # Stop each path segment at '/', '?', '#' or whitespace so that
    # ".../repo?tab=readme" or ".../repo#usage" never leaks into the API path.
    match = re.search(r'github\.com/([^/\s?#]+)/([^/\s?#]+)', url)
    if not match:
        return None

    owner, repo = match.groups()
    # Clone-style URLs end in ".git", which is not part of the repo name.
    if repo.endswith('.git'):
        repo = repo[:-4]
    if not repo:
        return None

    logger.debug(" Fetching GitHub repo info from API")

    try:
        # Ask for the README as raw markdown rather than the JSON envelope.
        response = requests.get(
            f"https://api.github.com/repos/{owner}/{repo}/readme",
            headers={"Accept": "application/vnd.github.v3.raw"},
            timeout=5
        )

        if response.status_code == 200:
            content = response.text
            # Reduce markdown to plain prose.
            content = re.sub(r'!\[.*?\]\(.*?\)', '', content)  # Remove images
            content = re.sub(r'```.*?```', '', content, flags=re.DOTALL)  # Remove code blocks
            content = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', content)  # Keep link text only
            content = re.sub(r'#{1,6}\s+', '', content)  # Remove header markers
            content = re.sub(r'[*_-]{3,}', '', content)  # Remove horizontal rules
            content = re.sub(r'\s+', ' ', content).strip()  # Collapse whitespace

            # A near-empty README is not worth summarizing; fall through to
            # the repository metadata instead.
            if len(content) > 50:
                logger.debug(f" Got README: {len(content)} chars after cleaning")
                return content[:2000]

        # Fallback: repository metadata JSON.
        response = requests.get(
            f"https://api.github.com/repos/{owner}/{repo}",
            timeout=5
        )

        if response.status_code == 200:
            data = response.json()
            parts = []
            if data.get("description"):
                parts.append(data["description"])
            if data.get("topics"):
                parts.append(f"Topics: {', '.join(data['topics'][:3])}")
            if data.get("language"):
                parts.append(f"Language: {data['language']}")

            content = " ".join(parts)
            if content:
                logger.debug(f" Got repo info: {len(content)} chars")
                return content

    except Exception as e:
        # Best-effort lookup: any network or decoding failure is logged and
        # reported to the caller as "no content".
        logger.warning(f" GitHub API error: {e}")

    return None
|
||||
|
||||
# Extract clean text from HTML
|
||||
def extract_text_from_html(html):
    """Extract readable plain text from an HTML document using regexes.

    Steps, in order: drop the DOCTYPE and comments; drop ``<script>``,
    ``<style>`` and ``<noscript>`` elements; narrow to the first ``<main>``
    or ``<article>`` element if one exists; drop ``<nav>``, ``<footer>``,
    ``<aside>`` and ``<header>`` elements; strip all remaining tags; decode
    HTML entities; collapse whitespace.

    Args:
        html: The HTML source as a string (may be empty or ``None``).

    Returns:
        The first 2000 characters of the cleaned text, or ``None`` when
        fewer than 100 characters of text remain.
    """
    # Local import: the ``html`` parameter shadows the stdlib module name.
    from html import unescape

    if not html:
        return None

    element_flags = re.DOTALL | re.IGNORECASE

    # Remove DOCTYPE and comments.
    text = re.sub(r'<!DOCTYPE[^>]*>', '', html, flags=re.IGNORECASE)
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)

    # Remove elements whose content is never user-visible prose.
    text = re.sub(r'<script[^>]*>.*?</script>', ' ', text, flags=element_flags)
    text = re.sub(r'<style[^>]*>.*?</style>', ' ', text, flags=element_flags)
    text = re.sub(r'<noscript[^>]*>.*?</noscript>', ' ', text, flags=element_flags)

    # Prefer the main content area. Attributes are allowed on the opening
    # tag, and only the tag name is captured so the closing-tag
    # backreference can actually match.
    main_match = re.search(r'<(main|article)\b[^>]*>.*?</\1>', text, flags=element_flags)
    if main_match:
        text = main_match.group(0)

    # Remove page chrome that may sit inside or around the content.
    text = re.sub(r'<(nav|footer|aside|header)\b[^>]*>.*?</\1>', ' ', text, flags=element_flags)

    # Strip every remaining tag. Block-level breaks need no special casing
    # because all whitespace is collapsed to single spaces below.
    text = re.sub(r'</?[^>]+>', ' ', text)

    # Decode all named and numeric entities (&amp;, &lt;, &nbsp;, &#39;, ...).
    # This runs after tag stripping so that escaped markup such as
    # "&lt;b&gt;" survives as literal text.
    text = unescape(text)

    # Collapse whitespace; \s also matches the no-break space from &nbsp;.
    text = re.sub(r'\s+', ' ', text).strip()

    # Too little text means the page had no meaningful extractable body.
    if len(text) < 100:
        return None

    return text[:2000]  # First 2000 chars of clean text
|
||||
|
||||
# Analyze content with Haiku via gateway
|
||||
# Analyze with Haiku via gateway
|
||||
def analyze_content(url, title, content, link_type):
|
||||
"""Analyze content with AI to create intelligent summary"""
|
||||
logger.debug(f" 🤖 Analyzing content: {url}")
|
||||
"""Send raw content to Haiku for analysis"""
|
||||
logger.debug(f" 🤖 Analyzing with Haiku: {url}")
|
||||
|
||||
try:
|
||||
# Special handling for GitHub
|
||||
clean_text = None
|
||||
if link_type == "GitHub":
|
||||
clean_text = get_github_content(url)
|
||||
|
||||
# Fallback: extract from HTML
|
||||
if not clean_text:
|
||||
clean_text = extract_text_from_html(content)
|
||||
|
||||
if not clean_text:
|
||||
logger.warning(f" Could not extract content")
|
||||
return {
|
||||
"summary": f"GitHub project: {title}",
|
||||
"tag": "project",
|
||||
"relevance": "relevant"
|
||||
}
|
||||
|
||||
logger.debug(f" Extracted {len(clean_text)} chars of content")
|
||||
|
||||
# Build analysis prompt
|
||||
prompt = f"""Analyze this webpage and create a brief summary for Laurent.
|
||||
# Simple prompt - send raw content
|
||||
prompt = f"""Analyze this webpage content quickly.
|
||||
|
||||
**Title**: {title}
|
||||
**URL**: {url}
|
||||
**Link Type**: {link_type}
|
||||
**Title**: {title}
|
||||
|
||||
**Content** (first 1500 chars):
|
||||
{clean_text[:1500]}
|
||||
**RAW PAGE CONTENT** (first 3000 chars):
|
||||
{content[:3000]}
|
||||
|
||||
---
|
||||
|
||||
Create a 2-3 sentence summary that answers:
|
||||
1. What is this page about?
|
||||
2. Why would Laurent find it useful?
|
||||
Write a 2-3 sentence summary explaining:
|
||||
1. What is this about?
|
||||
2. Why would this be useful?
|
||||
|
||||
Keep it practical and concise. Do NOT include the URL or title in the summary.
|
||||
"""
|
||||
Be concise. Skip marketing. No URLs or titles in summary."""
|
||||
|
||||
# Call gateway with a simple POST
|
||||
logger.debug(f" Sending to gateway for analysis...")
|
||||
logger.debug(f" Sending to Haiku...")
|
||||
response = requests.post(
|
||||
"http://127.0.0.1:18789/sessions/turn",
|
||||
f"{GATEWAY_URL}/sessions/turn",
|
||||
json={
|
||||
"message": prompt,
|
||||
"session": "main"
|
||||
},
|
||||
timeout=15,
|
||||
timeout=20,
|
||||
headers={"Authorization": f"Bearer {GATEWAY_TOKEN}"} if GATEWAY_TOKEN else {}
|
||||
)
|
||||
|
||||
if response.status_code == 200:
|
||||
result = response.json()
|
||||
# Extract the summary from response
|
||||
summary = result.get("message", "") or result.get("content", "")
|
||||
if isinstance(summary, list):
|
||||
summary = summary[0].get("text", "") if summary else ""
|
||||
summary = summary.strip()[:300]
|
||||
summary = result.get("message", "")
|
||||
if isinstance(summary, list) and summary:
|
||||
summary = summary[0].get("text", "") if isinstance(summary[0], dict) else summary[0]
|
||||
summary = str(summary).strip()[:300]
|
||||
|
||||
logger.info(f" ✓ Got summary from gateway: {summary[:60]}")
|
||||
logger.info(f" ✓ Got analysis: {summary[:50]}")
|
||||
|
||||
# Determine tag from link type
|
||||
tag = "to-read"
|
||||
if link_type == "GitHub":
|
||||
tag = "project"
|
||||
@@ -341,61 +190,19 @@ Keep it practical and concise. Do NOT include the URL or title in the summary.
|
||||
|
||||
return {
|
||||
"summary": summary,
|
||||
"tag": tag,
|
||||
"relevance": "relevant"
|
||||
"tag": tag
|
||||
}
|
||||
else:
|
||||
logger.warning(f" Gateway error {response.status_code}, falling back to heuristic")
|
||||
# Fallback: use simple heuristic
|
||||
return {
|
||||
"summary": extract_simple_summary(clean_text, title, link_type),
|
||||
"tag": get_tag_from_type(link_type),
|
||||
"relevance": "relevant"
|
||||
}
|
||||
logger.warning(f" Gateway error: {response.status_code}")
|
||||
return None
|
||||
|
||||
except requests.Timeout:
|
||||
logger.warning(f" Gateway timeout, using fallback")
|
||||
return {
|
||||
"summary": extract_simple_summary(content, title, link_type),
|
||||
"tag": get_tag_from_type(link_type),
|
||||
"relevance": "relevant"
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error(f" Analysis error: {e}")
|
||||
import traceback
|
||||
logger.error(traceback.format_exc())
|
||||
return {
|
||||
"summary": title,
|
||||
"tag": "interesting",
|
||||
"relevance": "relevant"
|
||||
}
|
||||
|
||||
def extract_simple_summary(text, title, link_type):
    """Return a heuristic one-sentence summary when AI analysis is unavailable.

    The text is split on sentence-ending punctuation (``.``, ``!``, ``?``)
    and the first fragment whose stripped length is strictly between 20 and
    300 characters is returned, truncated to 200 characters.

    Args:
        text: Page text to summarize; may be empty or ``None``.
        title: Fallback returned when no suitable fragment is found.
        link_type: Unused; kept so the signature stays unchanged.

    Returns:
        The selected fragment (at most 200 characters), or ``title``.
    """
    # Without this guard re.split raises TypeError on None.
    if not text:
        return title

    for fragment in re.split(r'[.!?]', text):
        fragment = fragment.strip()
        if 20 < len(fragment) < 300:
            return fragment[:200]
    return title
|
||||
|
||||
def get_tag_from_type(link_type):
    """Map a detected link type to its inbox tag.

    Args:
        link_type: A link-type label such as ``"GitHub"`` or ``"YouTube"``.

    Returns:
        The tag registered for ``link_type``, or ``"to-read"`` when the
        type has no dedicated tag.
    """
    tag_by_type = dict([
        ("GitHub", "project"),
        ("YouTube", "video"),
        ("Reddit", "discussion"),
        ("Medium", "article"),
        ("Dev.to", "article"),
        ("arXiv", "learning"),
        ("Twitter/X", "discussion"),
    ])
    try:
        return tag_by_type[link_type]
    except KeyError:
        # Any type without a dedicated tag lands in the reading queue.
        return "to-read"
|
||||
return None
|
||||
|
||||
# Send to Tududi inbox
|
||||
def add_to_tududi(title, url, link_type, summary="", tag=""):
|
||||
"""Add to Tududi inbox with intelligent summary"""
|
||||
"""Add to Tududi inbox"""
|
||||
logger.debug(f" 📌 Adding to Tududi: {title}")
|
||||
|
||||
try:
|
||||
@@ -403,14 +210,11 @@ def add_to_tududi(title, url, link_type, summary="", tag=""):
|
||||
logger.warning(" TUDUDI_API_KEY not set")
|
||||
return False
|
||||
|
||||
# Format the inbox content
|
||||
content = f"📌 **{link_type}**: {title}\n🔗 {url}"
|
||||
|
||||
if summary:
|
||||
content += f"\n\n💡 **Summary**:\n{summary}"
|
||||
|
||||
content += f"\n\n💡 {summary}"
|
||||
if tag:
|
||||
content += f"\n\n🏷️ **Tag**: {tag}"
|
||||
content += f"\n\n🏷️ {tag}"
|
||||
|
||||
response = requests.post(
|
||||
f"{TUDUDI_API_URL}/inbox",
|
||||
@@ -422,8 +226,8 @@ def add_to_tududi(title, url, link_type, summary="", tag=""):
|
||||
timeout=5
|
||||
)
|
||||
|
||||
if response.status_code in [200, 201]: # 200 or 201 are both OK
|
||||
logger.info(f" ✓ Added to Tududi inbox with tag: {tag}")
|
||||
if response.status_code in [200, 201]:
|
||||
logger.info(f" ✓ Added to Tududi")
|
||||
return True
|
||||
else:
|
||||
logger.warning(f" Tududi error: {response.status_code}")
|
||||
@@ -474,38 +278,31 @@ class LinkAnalyzerBot(discord.Client):
|
||||
fetch_result = fetch_url_content(url)
|
||||
title = fetch_result["title"]
|
||||
|
||||
# Analyze content if fetch was successful
|
||||
# Analyze with Haiku
|
||||
analysis_data = None
|
||||
logger.debug(f" 📊 Fetch status: {fetch_result['status']}")
|
||||
|
||||
if fetch_result["status"] == "ok":
|
||||
logger.debug(f" 🔍 Starting analysis...")
|
||||
analysis_data = analyze_content(url, title, fetch_result.get("content", ""), link_type)
|
||||
logger.debug(f" Analysis result: {analysis_data}")
|
||||
else:
|
||||
logger.debug(f" ⚠️ Fetch failed, skipping analysis")
|
||||
logger.debug(f" Analyzing...")
|
||||
analysis_data = analyze_content(url, title, fetch_result["content"], link_type)
|
||||
|
||||
# Prepare summary for Tududi
|
||||
# Prepare summary
|
||||
summary_text = ""
|
||||
tag = "interesting"
|
||||
if analysis_data:
|
||||
summary_text = analysis_data.get("summary", "")
|
||||
tag = analysis_data.get("tag", "interesting")
|
||||
logger.debug(f" ✓ Got summary: {summary_text[:80]}")
|
||||
else:
|
||||
logger.warning(f" ❌ No analysis data returned")
|
||||
logger.debug(f" Summary: {summary_text[:60]}")
|
||||
|
||||
# Add to Tududi with summary
|
||||
tududi_ok = add_to_tududi(title, url, link_type, summary_text, tag)
|
||||
# Add to Tududi
|
||||
add_to_tududi(title, url, link_type, summary_text, tag)
|
||||
|
||||
# Format response for Discord
|
||||
# Format response
|
||||
response_text = f"📌 **{link_type}**: {title}"
|
||||
if summary_text:
|
||||
response_text += f"\n\n💡 {summary_text}"
|
||||
if tag:
|
||||
response_text += f"\n\n🏷️ Tag: `{tag}`"
|
||||
response_text += f"\n\n🏷️ `{tag}`"
|
||||
|
||||
logger.debug(f"Posting response: {response_text}")
|
||||
logger.debug(f"Posting response...")
|
||||
|
||||
# Post in channel
|
||||
await message.reply(response_text, mention_author=False)
|
||||
@@ -518,21 +315,25 @@ class LinkAnalyzerBot(discord.Client):
|
||||
"author": str(message.author),
|
||||
"message_id": message.id,
|
||||
"date": datetime.now().isoformat(),
|
||||
"analysis": analysis_data,
|
||||
"tududi": tududi_ok,
|
||||
"fetch_status": fetch_result["status"]
|
||||
"summary": summary_text,
|
||||
"tag": tag
|
||||
})
|
||||
|
||||
logger.info(f"✓ Processed: {url}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Error processing {url}: {e}")
|
||||
await message.reply(f"❌ Error analyzing link: {e}", mention_author=False)
|
||||
logger.error(f"❌ Error: {e}")
|
||||
import traceback
|
||||
logger.error(traceback.format_exc())
|
||||
try:
|
||||
await message.reply(f"❌ Error: {str(e)[:100]}", mention_author=False)
|
||||
except:
|
||||
pass
|
||||
|
||||
# Update processed IDs
|
||||
# Update tracker
|
||||
tracker["processed_message_ids"].append(message.id)
|
||||
save_tracker(tracker)
|
||||
logger.info(f"Updated tracker, total links: {len(tracker['links'])}")
|
||||
logger.info(f"Updated tracker: {len(tracker['links'])} links total")
|
||||
|
||||
# Main
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user