#!/usr/bin/env python3
"""
Discord bot for #remora channel - analyzes links in real-time with web_fetch + AI
Posts summaries, adds to Tududi inbox, maintains JSON history + logs
"""
import asyncio
import json
import logging
import os
import re
from datetime import datetime
from pathlib import Path
from urllib.parse import urlparse

import discord
import requests
from dotenv import load_dotenv
# Load environment variables (tokens, API keys) from a local .env file
load_dotenv()

# Setup logging: write to bot.log (next to this script) AND to the console
log_file = Path(__file__).parent / "bot.log"
logging.basicConfig(
    level=logging.DEBUG,
    format='[%(asctime)s] [%(levelname)-8s] %(message)s',
    handlers=[
        logging.FileHandler(log_file),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Config
CHANNEL_ID = 1467557082583535729  # Discord channel id of #remora
TRACKER_FILE = Path(__file__).parent / "tracker.json"  # processed messages + link history
TUDUDI_API_URL = os.getenv("TUDUDI_API_URL", "https://todo.dilain.com/api/v1")
TUDUDI_API_KEY = os.getenv("TUDUDI_API_KEY")  # required for inbox posting; warn-and-skip if unset
GATEWAY_URL = os.getenv("OPENCLAW_GATEWAY", "http://127.0.0.1:18789")  # local AI gateway
GATEWAY_TOKEN = os.getenv("OPENCLAW_GATEWAY_TOKEN", "")  # optional bearer token for the gateway

# Startup banner so restarts are easy to spot in the log
logger.info("=" * 60)
logger.info("Bot startup")
logger.info(f" Channel ID: {CHANNEL_ID}")
logger.info(f" Tududi API: {TUDUDI_API_URL}")
logger.info(f" Gateway: {GATEWAY_URL}")
logger.info("=" * 60)
# Load or init tracker
def load_tracker():
    """Return the persisted tracker dict, or a fresh empty structure.

    The tracker keeps processed Discord message ids (dedup across restarts)
    and the accumulated link history.
    """
    if not TRACKER_FILE.exists():
        return {
            "channel_id": CHANNEL_ID,
            "processed_message_ids": [],
            "links": [],
        }
    return json.loads(TRACKER_FILE.read_text())
def save_tracker(data):
    """Persist the tracker dict to TRACKER_FILE as pretty-printed JSON."""
    TRACKER_FILE.write_text(json.dumps(data, indent=2))
# Detect links in text
def extract_urls(text):
    """Return every http(s) URL found in *text*, in order of appearance."""
    return re.findall(r'https?://[^\s<>"{}|\\^`\[\]]+', text)
# Detect link type
def detect_link_type(url):
    """Classify *url* by its host into a human-readable link type.

    Unknown hosts fall back to "Article".
    """
    host = urlparse(url).netloc.lower()
    # First match wins; order mirrors the original decision chain.
    rules = (
        (("github.com",), "GitHub"),
        (("reddit.com",), "Reddit"),
        (("youtube.com", "youtu.be"), "YouTube"),
        (("tiktok.com",), "TikTok"),
        (("twitter.com", "x.com"), "Twitter/X"),
        (("medium.com",), "Medium"),
        (("dev.to",), "Dev.to"),
        (("arxiv.org",), "arXiv"),
    )
    for needles, label in rules:
        if any(needle in host for needle in needles):
            return label
    return "Article"
# Fetch URL content using requests
# NOTE(review): this region of the source was garbled (HTML tag literals were
# stripped out of the regexes, the `def extract_text_from_html` line was lost,
# `h1_match` was referenced without being assigned, and `fetch_url_content`
# had no return). Reconstructed below to match how on_message consumes it:
# a dict with "status", "title" and "content" keys.
def fetch_url_content(url):
    """Fetch *url* and return {"status", "title", "content"}.

    status is "ok" on success, "error" on any fetch failure.  title is taken
    from <title>, then og:title, then the first <h1>, then the last URL path
    segment.  content is the full response body ("" on error).
    """
    logger.debug(f" Fetching: {url}")
    fallback_title = url.split('/')[-1] or "Untitled"
    try:
        response = requests.get(
            url,
            timeout=8,
            headers={
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
                'Accept': 'text/html,application/xhtml+xml'
            },
            allow_redirects=True
        )
        response.raise_for_status()
        # First 4k chars is enough to find title/meta tags cheaply
        head = response.text[:4000]
        # Pattern 1: <title> tag
        title = None
        title_match = re.search(r'<title[^>]*>\s*([^<]+?)\s*</title>', head, re.IGNORECASE)
        if title_match:
            title = title_match.group(1).strip()
        # Pattern 2: og:title meta tag (GitHub and most social sites set it)
        if not title:
            og_match = re.search(
                r'<meta[^>]+property=["\']og:title["\'][^>]+content=["\']([^"\']+)["\']',
                head, re.IGNORECASE)
            if og_match:
                title = og_match.group(1).strip()
        # Pattern 3: first <h1>
        if not title:
            h1_match = re.search(r'<h1[^>]*>([^<]+)</h1>', head, re.IGNORECASE)
            if h1_match:
                title = h1_match.group(1).strip()
        # Fallback: last URL path segment
        if not title:
            title = fallback_title
        return {"status": "ok", "title": title, "content": response.text}
    except Exception as e:
        logger.warning(f" Fetch failed for {url}: {e}")
        return {"status": "error", "title": fallback_title, "content": ""}


def extract_text_from_html(html):
    """Strip markup from *html* and return up to 2000 chars of visible text.

    Returns None when fewer than 100 chars of meaningful text survive, so
    callers can tell "nothing useful here" apart from a short summary.
    """
    # Remove comments, then non-content blocks (scripts, styles, noscript)
    text = re.sub(r'<!--.*?-->', '', html, flags=re.DOTALL)
    text = re.sub(r'<script[^>]*>.*?</script>', ' ', text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r'<style[^>]*>.*?</style>', ' ', text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r'<noscript[^>]*>.*?</noscript>', ' ', text, flags=re.DOTALL | re.IGNORECASE)
    # Prefer a <main>/<article> region when the page declares one
    main_match = re.search(r'<(main|article)[^>]*>.*?</\1>', text, flags=re.DOTALL | re.IGNORECASE)
    if main_match:
        text = main_match.group(0)
    # Drop page chrome: nav / footer / sidebar / header
    text = re.sub(r'<(nav|footer|aside|header)[^>]*>.*?</\1>', ' ', text,
                  flags=re.DOTALL | re.IGNORECASE)
    # Turn structural breaks into newlines, then drop every remaining tag
    text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
    text = re.sub(r'</p>', '\n', text, flags=re.IGNORECASE)
    text = re.sub(r'</?[^>]+>', ' ', text)
    # Decode the most common HTML entities (order matters: &amp; last)
    text = text.replace('&nbsp;', ' ')
    text = text.replace('&lt;', '<')
    text = text.replace('&gt;', '>')
    text = text.replace('&amp;', '&')
    # Collapse whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # Return first meaningful chunk only
    if len(text) < 100:
        return None
    return text[:2000]
# Analyze content with Haiku via gateway
def analyze_content(url, title, content, link_type):
    """Summarize a fetched page via the local AI gateway.

    Returns a dict with keys "summary", "tag", "relevance".  Falls back to a
    heuristic one-sentence summary on gateway errors/timeouts, and to the bare
    title on any unexpected failure -- this function never raises.
    """
    logger.debug(f" š¤ Analyzing content: {url}")
    try:
        # Special handling for GitHub
        # NOTE(review): get_github_content is not defined anywhere in this
        # file; if it is not provided elsewhere, the NameError it raises is
        # swallowed by the generic except below and we fall back to the bare
        # title. Confirm it exists or remove this branch.
        clean_text = None
        if link_type == "GitHub":
            clean_text = get_github_content(url)
        # Fallback: extract readable text from the raw HTML
        if not clean_text:
            clean_text = extract_text_from_html(content)
        if not clean_text:
            logger.warning(f" Could not extract content")
            # Fix: this fallback previously labeled EVERY link type as
            # "GitHub project"; describe the actual type and reuse the
            # shared tag mapping instead of hard-coding "project".
            return {
                "summary": f"{link_type}: {title}",
                "tag": get_tag_from_type(link_type),
                "relevance": "relevant"
            }
        logger.debug(f" Extracted {len(clean_text)} chars of content")
        # Build analysis prompt (content capped at 1500 chars)
        prompt = f"""Analyze this webpage and create a brief summary for Laurent.
**Title**: {title}
**URL**: {url}
**Link Type**: {link_type}
**Content** (first 1500 chars):
{clean_text[:1500]}
---
Create a 2-3 sentence summary that answers:
1. What is this page about?
2. Why would Laurent find it useful?
Keep it practical and concise. Do NOT include the URL or title in the summary.
"""
        # Call gateway with a simple POST.
        # Fix: use the configured GATEWAY_URL instead of a hard-coded address
        # that silently ignored the OPENCLAW_GATEWAY env var.
        logger.debug(f" Sending to gateway for analysis...")
        response = requests.post(
            f"{GATEWAY_URL}/sessions/turn",
            json={
                "message": prompt,
                "session": "main"
            },
            timeout=15,
            headers={"Authorization": f"Bearer {GATEWAY_TOKEN}"} if GATEWAY_TOKEN else {}
        )
        if response.status_code == 200:
            result = response.json()
            # The gateway may answer with a plain string or a list of
            # content parts; normalize to a single string.
            summary = result.get("message", "") or result.get("content", "")
            if isinstance(summary, list):
                summary = summary[0].get("text", "") if summary else ""
            summary = summary.strip()[:300]
            logger.info(f" ā Got summary from gateway: {summary[:60]}")
            # Fix: reuse get_tag_from_type instead of a duplicated inline
            # chain that had drifted (it mapped Twitter/X to "to-read").
            return {
                "summary": summary,
                "tag": get_tag_from_type(link_type),
                "relevance": "relevant"
            }
        logger.warning(f" Gateway error {response.status_code}, falling back to heuristic")
        # Fallback: simple first-sentence heuristic on the extracted text
        return {
            "summary": extract_simple_summary(clean_text, title, link_type),
            "tag": get_tag_from_type(link_type),
            "relevance": "relevant"
        }
    except requests.Timeout:
        logger.warning(f" Gateway timeout, using fallback")
        return {
            "summary": extract_simple_summary(content, title, link_type),
            "tag": get_tag_from_type(link_type),
            "relevance": "relevant"
        }
    except Exception as e:
        # Last-resort fallback: never let analysis break message handling
        logger.error(f" Analysis error: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return {
            "summary": title,
            "tag": "interesting",
            "relevance": "relevant"
        }
def extract_simple_summary(text, title, link_type):
    """Fallback summary: first sentence of reasonable length, else the title."""
    for candidate in re.split(r'[.!?]', text):
        candidate = candidate.strip()
        # Skip fragments that are too short to mean anything or too long
        # to be a single readable sentence.
        if 20 < len(candidate) < 300:
            return candidate[:200]
    return title
def get_tag_from_type(link_type):
    """Map a link-type label to its Tududi tag; unknown types get "to-read"."""
    if link_type == "GitHub":
        return "project"
    if link_type == "YouTube":
        return "video"
    if link_type in ("Reddit", "Twitter/X"):
        return "discussion"
    if link_type in ("Medium", "Dev.to"):
        return "article"
    if link_type == "arXiv":
        return "learning"
    return "to-read"
# Send to Tududi inbox
def add_to_tududi(title, url, link_type, summary="", tag=""):
    """Post a formatted link entry to the Tududi inbox.

    Returns True on HTTP 200/201, False on any error or when the API key
    is missing (logged, never raised).
    """
    logger.debug(f" š Adding to Tududi: {title}")
    if not TUDUDI_API_KEY:
        logger.warning(" TUDUDI_API_KEY not set")
        return False
    # Assemble the inbox entry: header line, then optional summary and tag
    parts = [f"š **{link_type}**: {title}\nš {url}"]
    if summary:
        parts.append(f"\n\nš” **Summary**:\n{summary}")
    if tag:
        parts.append(f"\n\nš·ļø **Tag**: {tag}")
    try:
        resp = requests.post(
            f"{TUDUDI_API_URL}/inbox",
            headers={
                "Authorization": f"Bearer {TUDUDI_API_KEY}",
                "Content-Type": "application/json"
            },
            json={"content": "".join(parts)},
            timeout=5
        )
    except Exception as e:
        logger.error(f" Tududi error: {e}")
        return False
    if resp.status_code in (200, 201):  # 200 or 201 are both OK
        logger.info(f" ā Added to Tududi inbox with tag: {tag}")
        return True
    logger.warning(f" Tududi error: {resp.status_code}")
    return False
# Discord bot
# Discord gateway intents: message_content is a privileged intent and must
# also be enabled in the Discord developer portal for the bot to read text.
intents = discord.Intents.default()
intents.message_content = True
class LinkAnalyzerBot(discord.Client):
    """Watches #remora, summarizes posted links, files them to Tududi,
    replies in-channel, and records everything in the JSON tracker."""

    async def on_ready(self):
        logger.info(f"ā Bot logged in as {self.user}")
        logger.info(f"š Watching channel #remora ({CHANNEL_ID})")

    async def on_message(self, message):
        # Ignore bot's own messages
        if message.author == self.user:
            return
        # Only process #remora channel
        if message.channel.id != CHANNEL_ID:
            return
        # Check for URLs
        urls = extract_urls(message.content)
        if not urls:
            logger.debug(f"No URLs in message from {message.author}")
            return
        # Skip if already processed (tracker persists across restarts)
        tracker = load_tracker()
        if message.id in tracker["processed_message_ids"]:
            logger.debug(f"Skipping already-processed message {message.id}")
            return
        logger.info(f"š New link(s) from {message.author}: {message.content}")
        # Process each URL
        for url in urls:
            try:
                logger.info(f"Processing: {url}")
                link_type = detect_link_type(url)
                # Fix: fetch/analyze/post use blocking `requests` calls with
                # up-to-15s timeouts; run them in worker threads so the
                # Discord event loop (and heartbeat) is not stalled.
                fetch_result = await asyncio.to_thread(fetch_url_content, url)
                title = fetch_result["title"]
                # Analyze content only if the fetch succeeded
                analysis_data = None
                logger.debug(f" š Fetch status: {fetch_result['status']}")
                if fetch_result["status"] == "ok":
                    logger.debug(f" š Starting analysis...")
                    analysis_data = await asyncio.to_thread(
                        analyze_content, url, title,
                        fetch_result.get("content", ""), link_type)
                    logger.debug(f" Analysis result: {analysis_data}")
                else:
                    logger.debug(f" ā ļø Fetch failed, skipping analysis")
                # Prepare summary and tag for Tududi / the Discord reply
                summary_text = ""
                tag = "interesting"
                if analysis_data:
                    summary_text = analysis_data.get("summary", "")
                    tag = analysis_data.get("tag", "interesting")
                    logger.debug(f" ā Got summary: {summary_text[:80]}")
                else:
                    logger.warning(f" ā No analysis data returned")
                # Add to Tududi with summary (also blocking -> thread)
                tududi_ok = await asyncio.to_thread(
                    add_to_tududi, title, url, link_type, summary_text, tag)
                # Format and post the in-channel reply
                response_text = f"š **{link_type}**: {title}"
                if summary_text:
                    response_text += f"\n\nš” {summary_text}"
                if tag:
                    response_text += f"\n\nš·ļø Tag: `{tag}`"
                logger.debug(f"Posting response: {response_text}")
                await message.reply(response_text, mention_author=False)
                # Record the processed link in the history
                tracker["links"].append({
                    "url": url,
                    "title": title,
                    "type": link_type,
                    "author": str(message.author),
                    "message_id": message.id,
                    "date": datetime.now().isoformat(),
                    "analysis": analysis_data,
                    "tududi": tududi_ok,
                    "fetch_status": fetch_result["status"]
                })
                logger.info(f"ā Processed: {url}")
            except Exception as e:
                logger.error(f"ā Error processing {url}: {e}")
                await message.reply(f"ā Error analyzing link: {e}", mention_author=False)
        # Mark the message processed and persist, even if individual URLs
        # failed, so a crashing link is not retried on every restart.
        tracker["processed_message_ids"].append(message.id)
        save_tracker(tracker)
        logger.info(f"Updated tracker, total links: {len(tracker['links'])}")
# Main
if __name__ == "__main__":
token = os.getenv("DISCORD_BOT_TOKEN")
if not token:
logger.error("ā DISCORD_BOT_TOKEN not set!")
exit(1)
logger.info("Starting bot...")
bot = LinkAnalyzerBot(intents=intents)
bot.run(token)