#!/usr/bin/env python3
"""
Discord bot for #remora channel - analyzes links in real-time with web_fetch + AI
Posts summaries, adds to Tududi inbox, maintains JSON history + logs
"""
import asyncio
import json
import logging
import os
import re
from datetime import datetime
from pathlib import Path
from urllib.parse import urlparse

import discord
import requests
from dotenv import load_dotenv
# Load environment variables (tokens, API keys) from a local .env file
load_dotenv()

# Setup logging: write to bot.log (next to this script) AND to the console
log_file = Path(__file__).parent / "bot.log"
logging.basicConfig(
    level=logging.DEBUG,
    format='[%(asctime)s] [%(levelname)-8s] %(message)s',
    handlers=[
        logging.FileHandler(log_file),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Config
CHANNEL_ID = 1467557082583535729  # Discord channel id of #remora
TRACKER_FILE = Path(__file__).parent / "tracker.json"  # processed messages + link history
TUDUDI_API_URL = os.getenv("TUDUDI_API_URL", "https://todo.dilain.com/api/v1")
TUDUDI_API_KEY = os.getenv("TUDUDI_API_KEY")  # required for inbox posting; warn-and-skip if unset
GATEWAY_URL = os.getenv("OPENCLAW_GATEWAY", "http://127.0.0.1:18789")  # local AI gateway
GATEWAY_TOKEN = os.getenv("OPENCLAW_GATEWAY_TOKEN", "")  # optional bearer token for the gateway

# Startup banner so restarts are easy to spot in the log
logger.info("=" * 60)
logger.info("Bot startup")
logger.info(f" Channel ID: {CHANNEL_ID}")
logger.info(f" Tududi API: {TUDUDI_API_URL}")
logger.info(f" Gateway: {GATEWAY_URL}")
logger.info("=" * 60)
# Load or init tracker
def load_tracker():
    """Return the persisted tracker dict, or a fresh empty structure.

    The tracker keeps processed Discord message ids (dedup across restarts)
    and the accumulated link history.
    """
    if not TRACKER_FILE.exists():
        return {
            "channel_id": CHANNEL_ID,
            "processed_message_ids": [],
            "links": [],
        }
    return json.loads(TRACKER_FILE.read_text())
def save_tracker(data):
    """Persist the tracker dict to TRACKER_FILE as pretty-printed JSON."""
    TRACKER_FILE.write_text(json.dumps(data, indent=2))
# Detect links in text
def extract_urls(text):
    """Return every http(s) URL found in *text*, in order of appearance."""
    return re.findall(r'https?://[^\s<>"{}|\\^`\[\]]+', text)
# Detect link type
def detect_link_type(url):
    """Classify *url* by its host into a human-readable link type.

    Unknown hosts fall back to "Article".
    """
    host = urlparse(url).netloc.lower()
    # First match wins; order mirrors the original decision chain.
    rules = (
        (("github.com",), "GitHub"),
        (("reddit.com",), "Reddit"),
        (("youtube.com", "youtu.be"), "YouTube"),
        (("tiktok.com",), "TikTok"),
        (("twitter.com", "x.com"), "Twitter/X"),
        (("medium.com",), "Medium"),
        (("dev.to",), "Dev.to"),
        (("arxiv.org",), "arXiv"),
    )
    for needles, label in rules:
        if any(needle in host for needle in needles):
            return label
    return "Article"
# Fetch URL content using requests
# NOTE(review): this region of the source was garbled (HTML tag literals were
# stripped out of the regexes, the `def extract_text_from_html` line was lost,
# `h1_match` was referenced without being assigned, and `fetch_url_content`
# had no return). Reconstructed below to match how on_message consumes it:
# a dict with "status", "title" and "content" keys.
def fetch_url_content(url):
    """Fetch *url* and return {"status", "title", "content"}.

    status is "ok" on success, "error" on any fetch failure.  title is taken
    from <title>, then og:title, then the first <h1>, then the last URL path
    segment.  content is the full response body ("" on error).
    """
    logger.debug(f" Fetching: {url}")
    fallback_title = url.split('/')[-1] or "Untitled"
    try:
        response = requests.get(
            url,
            timeout=8,
            headers={
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
                'Accept': 'text/html,application/xhtml+xml'
            },
            allow_redirects=True
        )
        response.raise_for_status()
        # First 4k chars is enough to find title/meta tags cheaply
        head = response.text[:4000]
        # Pattern 1: <title> tag
        title = None
        title_match = re.search(r'<title[^>]*>\s*([^<]+?)\s*</title>', head, re.IGNORECASE)
        if title_match:
            title = title_match.group(1).strip()
        # Pattern 2: og:title meta tag (GitHub and most social sites set it)
        if not title:
            og_match = re.search(
                r'<meta[^>]+property=["\']og:title["\'][^>]+content=["\']([^"\']+)["\']',
                head, re.IGNORECASE)
            if og_match:
                title = og_match.group(1).strip()
        # Pattern 3: first <h1>
        if not title:
            h1_match = re.search(r'<h1[^>]*>([^<]+)</h1>', head, re.IGNORECASE)
            if h1_match:
                title = h1_match.group(1).strip()
        # Fallback: last URL path segment
        if not title:
            title = fallback_title
        return {"status": "ok", "title": title, "content": response.text}
    except Exception as e:
        logger.warning(f" Fetch failed for {url}: {e}")
        return {"status": "error", "title": fallback_title, "content": ""}


def extract_text_from_html(html):
    """Strip markup from *html* and return up to 2000 chars of visible text.

    Returns None when fewer than 100 chars of meaningful text survive, so
    callers can tell "nothing useful here" apart from a short summary.
    """
    # Remove comments, then non-content blocks (scripts, styles, noscript)
    text = re.sub(r'<!--.*?-->', '', html, flags=re.DOTALL)
    text = re.sub(r'<script[^>]*>.*?</script>', ' ', text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r'<style[^>]*>.*?</style>', ' ', text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r'<noscript[^>]*>.*?</noscript>', ' ', text, flags=re.DOTALL | re.IGNORECASE)
    # Prefer a <main>/<article> region when the page declares one
    main_match = re.search(r'<(main|article)[^>]*>.*?</\1>', text, flags=re.DOTALL | re.IGNORECASE)
    if main_match:
        text = main_match.group(0)
    # Drop page chrome: nav / footer / sidebar / header
    text = re.sub(r'<(nav|footer|aside|header)[^>]*>.*?</\1>', ' ', text,
                  flags=re.DOTALL | re.IGNORECASE)
    # Turn structural breaks into newlines, then drop every remaining tag
    text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
    text = re.sub(r'</p>', '\n', text, flags=re.IGNORECASE)
    text = re.sub(r'</?[^>]+>', ' ', text)
    # Decode the most common HTML entities (order matters: &amp; last)
    text = text.replace('&nbsp;', ' ')
    text = text.replace('&lt;', '<')
    text = text.replace('&gt;', '>')
    text = text.replace('&amp;', '&')
    # Collapse whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # Return first meaningful chunk only
    if len(text) < 100:
        return None
    return text[:2000]
# Analyze content with Haiku via gateway
def analyze_content(url, title, content, link_type):
    """Summarize a fetched page via the local AI gateway.

    Returns a dict with keys "summary", "tag", "relevance".  Falls back to a
    heuristic one-sentence summary on gateway errors/timeouts, and to the bare
    title on any unexpected failure -- this function never raises.
    """
    logger.debug(f" š¤ Analyzing content: {url}")
    try:
        # Special handling for GitHub
        # NOTE(review): get_github_content is not defined anywhere in this
        # file; if it is not provided elsewhere, the NameError it raises is
        # swallowed by the generic except below and we fall back to the bare
        # title. Confirm it exists or remove this branch.
        clean_text = None
        if link_type == "GitHub":
            clean_text = get_github_content(url)
        # Fallback: extract readable text from the raw HTML
        if not clean_text:
            clean_text = extract_text_from_html(content)
        if not clean_text:
            logger.warning(f" Could not extract content")
            # Fix: this fallback previously labeled EVERY link type as
            # "GitHub project"; describe the actual type and reuse the
            # shared tag mapping instead of hard-coding "project".
            return {
                "summary": f"{link_type}: {title}",
                "tag": get_tag_from_type(link_type),
                "relevance": "relevant"
            }
        logger.debug(f" Extracted {len(clean_text)} chars of content")
        # Build analysis prompt (content capped at 1500 chars)
        prompt = f"""Analyze this webpage and create a brief summary for Laurent.
**Title**: {title}
**URL**: {url}
**Link Type**: {link_type}
**Content** (first 1500 chars):
{clean_text[:1500]}
---
Create a 2-3 sentence summary that answers:
1. What is this page about?
2. Why would Laurent find it useful?
Keep it practical and concise. Do NOT include the URL or title in the summary.
"""
        # Call gateway with a simple POST.
        # Fix: use the configured GATEWAY_URL instead of a hard-coded address
        # that silently ignored the OPENCLAW_GATEWAY env var.
        logger.debug(f" Sending to gateway for analysis...")
        response = requests.post(
            f"{GATEWAY_URL}/sessions/turn",
            json={
                "message": prompt,
                "session": "main"
            },
            timeout=15,
            headers={"Authorization": f"Bearer {GATEWAY_TOKEN}"} if GATEWAY_TOKEN else {}
        )
        if response.status_code == 200:
            result = response.json()
            # The gateway may answer with a plain string or a list of
            # content parts; normalize to a single string.
            summary = result.get("message", "") or result.get("content", "")
            if isinstance(summary, list):
                summary = summary[0].get("text", "") if summary else ""
            summary = summary.strip()[:300]
            logger.info(f" ā Got summary from gateway: {summary[:60]}")
            # Fix: reuse get_tag_from_type instead of a duplicated inline
            # chain that had drifted (it mapped Twitter/X to "to-read").
            return {
                "summary": summary,
                "tag": get_tag_from_type(link_type),
                "relevance": "relevant"
            }
        logger.warning(f" Gateway error {response.status_code}, falling back to heuristic")
        # Fallback: simple first-sentence heuristic on the extracted text
        return {
            "summary": extract_simple_summary(clean_text, title, link_type),
            "tag": get_tag_from_type(link_type),
            "relevance": "relevant"
        }
    except requests.Timeout:
        logger.warning(f" Gateway timeout, using fallback")
        return {
            "summary": extract_simple_summary(content, title, link_type),
            "tag": get_tag_from_type(link_type),
            "relevance": "relevant"
        }
    except Exception as e:
        # Last-resort fallback: never let analysis break message handling
        logger.error(f" Analysis error: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return {
            "summary": title,
            "tag": "interesting",
            "relevance": "relevant"
        }
def extract_simple_summary(text, title, link_type):
    """Fallback summary: first sentence of reasonable length, else the title."""
    for candidate in re.split(r'[.!?]', text):
        candidate = candidate.strip()
        # Skip fragments that are too short to mean anything or too long
        # to be a single readable sentence.
        if 20 < len(candidate) < 300:
            return candidate[:200]
    return title
def get_tag_from_type(link_type):
    """Map a link-type label to its Tududi tag; unknown types get "to-read"."""
    if link_type == "GitHub":
        return "project"
    if link_type == "YouTube":
        return "video"
    if link_type in ("Reddit", "Twitter/X"):
        return "discussion"
    if link_type in ("Medium", "Dev.to"):
        return "article"
    if link_type == "arXiv":
        return "learning"
    return "to-read"
# Send to Tududi inbox
def add_to_tududi(title, url, link_type, summary="", tag=""):
    """Post a formatted link entry to the Tududi inbox.

    Returns True on HTTP 200/201, False on any error or when the API key
    is missing (logged, never raised).
    """
    logger.debug(f" š Adding to Tududi: {title}")
    if not TUDUDI_API_KEY:
        logger.warning(" TUDUDI_API_KEY not set")
        return False
    # Assemble the inbox entry: header line, then optional summary and tag
    parts = [f"š **{link_type}**: {title}\nš {url}"]
    if summary:
        parts.append(f"\n\nš” **Summary**:\n{summary}")
    if tag:
        parts.append(f"\n\nš·ļø **Tag**: {tag}")
    try:
        resp = requests.post(
            f"{TUDUDI_API_URL}/inbox",
            headers={
                "Authorization": f"Bearer {TUDUDI_API_KEY}",
                "Content-Type": "application/json"
            },
            json={"content": "".join(parts)},
            timeout=5
        )
    except Exception as e:
        logger.error(f" Tududi error: {e}")
        return False
    if resp.status_code in (200, 201):  # 200 or 201 are both OK
        logger.info(f" ā Added to Tududi inbox with tag: {tag}")
        return True
    logger.warning(f" Tududi error: {resp.status_code}")
    return False
# Discord bot
# Discord gateway intents: message_content is a privileged intent and must
# also be enabled in the Discord developer portal for the bot to read text.
intents = discord.Intents.default()
intents.message_content = True
class LinkAnalyzerBot(discord.Client):
    """Watches #remora, summarizes posted links, files them to Tududi,
    replies in-channel, and records everything in the JSON tracker."""

    async def on_ready(self):
        logger.info(f"ā Bot logged in as {self.user}")
        logger.info(f"š Watching channel #remora ({CHANNEL_ID})")

    async def on_message(self, message):
        # Ignore bot's own messages
        if message.author == self.user:
            return
        # Only process #remora channel
        if message.channel.id != CHANNEL_ID:
            return
        # Check for URLs
        urls = extract_urls(message.content)
        if not urls:
            logger.debug(f"No URLs in message from {message.author}")
            return
        # Skip if already processed (tracker persists across restarts)
        tracker = load_tracker()
        if message.id in tracker["processed_message_ids"]:
            logger.debug(f"Skipping already-processed message {message.id}")
            return
        logger.info(f"š New link(s) from {message.author}: {message.content}")
        # Process each URL
        for url in urls:
            try:
                logger.info(f"Processing: {url}")
                link_type = detect_link_type(url)
                # Fix: fetch/analyze/post use blocking `requests` calls with
                # up-to-15s timeouts; run them in worker threads so the
                # Discord event loop (and heartbeat) is not stalled.
                fetch_result = await asyncio.to_thread(fetch_url_content, url)
                title = fetch_result["title"]
                # Analyze content only if the fetch succeeded
                analysis_data = None
                logger.debug(f" š Fetch status: {fetch_result['status']}")
                if fetch_result["status"] == "ok":
                    logger.debug(f" š Starting analysis...")
                    analysis_data = await asyncio.to_thread(
                        analyze_content, url, title,
                        fetch_result.get("content", ""), link_type)
                    logger.debug(f" Analysis result: {analysis_data}")
                else:
                    logger.debug(f" ā ļø Fetch failed, skipping analysis")
                # Prepare summary and tag for Tududi / the Discord reply
                summary_text = ""
                tag = "interesting"
                if analysis_data:
                    summary_text = analysis_data.get("summary", "")
                    tag = analysis_data.get("tag", "interesting")
                    logger.debug(f" ā Got summary: {summary_text[:80]}")
                else:
                    logger.warning(f" ā No analysis data returned")
                # Add to Tududi with summary (also blocking -> thread)
                tududi_ok = await asyncio.to_thread(
                    add_to_tududi, title, url, link_type, summary_text, tag)
                # Format and post the in-channel reply
                response_text = f"š **{link_type}**: {title}"
                if summary_text:
                    response_text += f"\n\nš” {summary_text}"
                if tag:
                    response_text += f"\n\nš·ļø Tag: `{tag}`"
                logger.debug(f"Posting response: {response_text}")
                await message.reply(response_text, mention_author=False)
                # Record the processed link in the history
                tracker["links"].append({
                    "url": url,
                    "title": title,
                    "type": link_type,
                    "author": str(message.author),
                    "message_id": message.id,
                    "date": datetime.now().isoformat(),
                    "analysis": analysis_data,
                    "tududi": tududi_ok,
                    "fetch_status": fetch_result["status"]
                })
                logger.info(f"ā Processed: {url}")
            except Exception as e:
                logger.error(f"ā Error processing {url}: {e}")
                await message.reply(f"ā Error analyzing link: {e}", mention_author=False)
        # Mark the message processed and persist, even if individual URLs
        # failed, so a crashing link is not retried on every restart.
        tracker["processed_message_ids"].append(message.id)
        save_tracker(tracker)
        logger.info(f"Updated tracker, total links: {len(tracker['links'])}")
# Main
if __name__ == "__main__":
token = os.getenv("DISCORD_BOT_TOKEN")
if not token:
logger.error("ā DISCORD_BOT_TOKEN not set!")
exit(1)
logger.info("Starting bot...")
bot = LinkAnalyzerBot(intents=intents)
bot.run(token)