From e4984d607d4b8fcb844147715c6782c9090e9a03 Mon Sep 17 00:00:00 2001 From: Remora Date: Mon, 9 Feb 2026 19:32:25 +0100 Subject: [PATCH] Simplify: Send raw content directly to Haiku (let it handle parsing) --- bot.py | 313 +++++++++++---------------------------------------------- 1 file changed, 57 insertions(+), 256 deletions(-) diff --git a/bot.py b/bot.py index 46ca311..3ba7807 100644 --- a/bot.py +++ b/bot.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """ -Discord bot for #remora channel - analyzes links in real-time with web_fetch + AI -Posts summaries, adds to Tududi inbox, maintains JSON history + logs +Discord bot for #remora channel - analyzes links in real-time with Haiku +Fetches content, sends to gateway for AI analysis, adds to Tududi inbox """ import discord @@ -88,9 +88,9 @@ def detect_link_type(url): else: return "Article" -# Fetch URL content using requests +# Fetch URL content def fetch_url_content(url): - """Fetch URL and return title + excerpt""" + """Fetch URL and return content""" logger.debug(f" šŸ“„ Fetching: {url}") try: @@ -104,229 +104,78 @@ def fetch_url_content(url): allow_redirects=True ) response.raise_for_status() - content = response.text[:4000] # First 4k chars + content = response.text[:5000] # First 5k chars - # Try multiple patterns for title + # Try to find title title = None - - # Pattern 1: tag title_match = re.search(r'<title[^>]*>\s*([^<]+?)\s*', content, re.IGNORECASE) if title_match: title = title_match.group(1).strip() - # Pattern 2: og:title meta tag (for GitHub, etc.) if not title: og_match = re.search(r']*>([^<]+)', content, re.IGNORECASE) - if h1_match: - title = h1_match.group(1).strip() - - # Fallback if not title: title = url.split('/')[-1] or "Untitled" - # Extract meta description - desc_match = re.search(r' 50: - logger.debug(f" Got README: {len(content)} chars after cleaning") - return content[:2000] - - # Fallback: get repo info JSON - response = requests.get( - f"https://api.github.com/repos/{owner}/{repo}", - timeout=5 - ) - - if response.status_code == 200: - data = response.json() - # Collect useful info - parts = [] - if data.get("description"): - parts.append(data["description"]) - if data.get("topics"): - parts.append(f"Topics: {', '.join(data['topics'][:3])}") - if data.get("language"): - parts.append(f"Language: {data['language']}") - - content = " ".join(parts) - if content: - logger.debug(f" Got repo info: {len(content)} chars") - return content - - except Exception as e: - logger.warning(f" GitHub API error: {e}") - - return None - -# Extract clean text from HTML -def extract_text_from_html(html): - """Extract readable text from HTML""" - # Remove DOCTYPE, comments - text = re.sub(r']*>', '', html, flags=re.IGNORECASE) - text = re.sub(r'', '', text, flags=re.DOTALL) - - # Remove scripts, styles, noscript - text = re.sub(r']*>.*?', ' ', text, flags=re.DOTALL | re.IGNORECASE) - text = re.sub(r']*>.*?', ' ', text, flags=re.DOTALL | re.IGNORECASE) - text = re.sub(r']*>.*?', ' ', text, flags=re.DOTALL | re.IGNORECASE) - - # Try to extract main content areas first - main_match = re.search(r'<(main|article|div[^>]*class="[^"]*content[^"]*"[^>]*)>.*?', text, flags=re.DOTALL | re.IGNORECASE) - if main_match: - text = main_match.group(0) - - # Remove common nav/footer/sidebar patterns - text = re.sub(r'<(nav|footer|aside|header)[^>]*>.*?', ' ', text, flags=re.DOTALL | re.IGNORECASE) - - # Remove remaining HTML tags but keep text - text = re.sub(r'', '\n', text, flags=re.IGNORECASE) - text = re.sub(r']*>', '\n', text, flags=re.IGNORECASE) - text = re.sub(r']*>', '\n', text, flags=re.IGNORECASE) - text = re.sub(r']+>', ' ', text) - - # Decode HTML entities - text = text.replace(' ', ' ') - text = text.replace('<', '<') - text = text.replace('>', '>') - text = text.replace('&', '&') - - # Clean up whitespace - text = re.sub(r'\s+', ' ', text) - text = text.strip() - - # Return first meaningful chunk - if len(text) < 100: - return None - - return text[:2000] # First 2000 chars of clean text - -# Analyze content with Haiku via gateway +# Analyze with Haiku via gateway def analyze_content(url, title, content, link_type): - """Analyze content with AI to create intelligent summary""" - logger.debug(f" šŸ¤– Analyzing content: {url}") + """Send raw content to Haiku for analysis""" + logger.debug(f" šŸ¤– Analyzing with Haiku: {url}") try: - # Special handling for GitHub - clean_text = None - if link_type == "GitHub": - clean_text = get_github_content(url) - - # Fallback: extract from HTML - if not clean_text: - clean_text = extract_text_from_html(content) - - if not clean_text: - logger.warning(f" Could not extract content") - return { - "summary": f"GitHub project: {title}", - "tag": "project", - "relevance": "relevant" - } - - logger.debug(f" Extracted {len(clean_text)} chars of content") - - # Build analysis prompt - prompt = f"""Analyze this webpage and create a brief summary for Laurent. + # Simple prompt - send raw content + prompt = f"""Analyze this webpage content quickly. -**Title**: {title} **URL**: {url} -**Link Type**: {link_type} +**Title**: {title} -**Content** (first 1500 chars): -{clean_text[:1500]} +**RAW PAGE CONTENT** (first 3000 chars): +{content[:3000]} --- -Create a 2-3 sentence summary that answers: -1. What is this page about? -2. Why would Laurent find it useful? +Write a 2-3 sentence summary explaining: +1. What is this about? +2. Why would this be useful? -Keep it practical and concise. Do NOT include the URL or title in the summary. -""" +Be concise. Skip marketing. No URLs or titles in summary.""" - # Call gateway with a simple POST - logger.debug(f" Sending to gateway for analysis...") + logger.debug(f" Sending to Haiku...") response = requests.post( - "http://127.0.0.1:18789/sessions/turn", + f"{GATEWAY_URL}/sessions/turn", json={ "message": prompt, "session": "main" }, - timeout=15, + timeout=20, headers={"Authorization": f"Bearer {GATEWAY_TOKEN}"} if GATEWAY_TOKEN else {} ) if response.status_code == 200: result = response.json() - # Extract the summary from response - summary = result.get("message", "") or result.get("content", "") - if isinstance(summary, list): - summary = summary[0].get("text", "") if summary else "" - summary = summary.strip()[:300] + summary = result.get("message", "") + if isinstance(summary, list) and summary: + summary = summary[0].get("text", "") if isinstance(summary[0], dict) else summary[0] + summary = str(summary).strip()[:300] - logger.info(f" āœ“ Got summary from gateway: {summary[:60]}") + logger.info(f" āœ“ Got analysis: {summary[:50]}") - # Determine tag from link type tag = "to-read" if link_type == "GitHub": tag = "project" @@ -341,61 +190,19 @@ Keep it practical and concise. Do NOT include the URL or title in the summary. return { "summary": summary, - "tag": tag, - "relevance": "relevant" + "tag": tag } else: - logger.warning(f" Gateway error {response.status_code}, falling back to heuristic") - # Fallback: use simple heuristic - return { - "summary": extract_simple_summary(clean_text, title, link_type), - "tag": get_tag_from_type(link_type), - "relevance": "relevant" - } + logger.warning(f" Gateway error: {response.status_code}") + return None - except requests.Timeout: - logger.warning(f" Gateway timeout, using fallback") - return { - "summary": extract_simple_summary(content, title, link_type), - "tag": get_tag_from_type(link_type), - "relevance": "relevant" - } except Exception as e: logger.error(f" Analysis error: {e}") - import traceback - logger.error(traceback.format_exc()) - return { - "summary": title, - "tag": "interesting", - "relevance": "relevant" - } - -def extract_simple_summary(text, title, link_type): - """Fallback: extract a simple summary from text""" - # Get first non-empty sentence/paragraph - sentences = re.split(r'[.!?]', text) - for sent in sentences: - sent = sent.strip() - if len(sent) > 20 and len(sent) < 300: - return sent[:200] - return title - -def get_tag_from_type(link_type): - """Get tag based on link type""" - tags = { - "GitHub": "project", - "YouTube": "video", - "Reddit": "discussion", - "Medium": "article", - "Dev.to": "article", - "arXiv": "learning", - "Twitter/X": "discussion" - } - return tags.get(link_type, "to-read") + return None # Send to Tududi inbox def add_to_tududi(title, url, link_type, summary="", tag=""): - """Add to Tududi inbox with intelligent summary""" + """Add to Tududi inbox""" logger.debug(f" šŸ“Œ Adding to Tududi: {title}") try: @@ -403,14 +210,11 @@ def add_to_tududi(title, url, link_type, summary="", tag=""): logger.warning(" TUDUDI_API_KEY not set") return False - # Format the inbox content content = f"šŸ“Œ **{link_type}**: {title}\nšŸ”— {url}" - if summary: - content += f"\n\nšŸ’” **Summary**:\n{summary}" - + content += f"\n\nšŸ’” {summary}" if tag: - content += f"\n\nšŸ·ļø **Tag**: {tag}" + content += f"\n\nšŸ·ļø {tag}" response = requests.post( f"{TUDUDI_API_URL}/inbox", @@ -422,8 +226,8 @@ def add_to_tududi(title, url, link_type, summary="", tag=""): timeout=5 ) - if response.status_code in [200, 201]: # 200 or 201 are both OK - logger.info(f" āœ“ Added to Tududi inbox with tag: {tag}") + if response.status_code in [200, 201]: + logger.info(f" āœ“ Added to Tududi") return True else: logger.warning(f" Tududi error: {response.status_code}") @@ -474,38 +278,31 @@ class LinkAnalyzerBot(discord.Client): fetch_result = fetch_url_content(url) title = fetch_result["title"] - # Analyze content if fetch was successful + # Analyze with Haiku analysis_data = None - logger.debug(f" šŸ“Š Fetch status: {fetch_result['status']}") - if fetch_result["status"] == "ok": - logger.debug(f" šŸ” Starting analysis...") - analysis_data = analyze_content(url, title, fetch_result.get("content", ""), link_type) - logger.debug(f" Analysis result: {analysis_data}") - else: - logger.debug(f" āš ļø Fetch failed, skipping analysis") + logger.debug(f" Analyzing...") + analysis_data = analyze_content(url, title, fetch_result["content"], link_type) - # Prepare summary for Tududi + # Prepare summary summary_text = "" tag = "interesting" if analysis_data: summary_text = analysis_data.get("summary", "") tag = analysis_data.get("tag", "interesting") - logger.debug(f" āœ“ Got summary: {summary_text[:80]}") - else: - logger.warning(f" āŒ No analysis data returned") + logger.debug(f" Summary: {summary_text[:60]}") - # Add to Tududi with summary - tududi_ok = add_to_tududi(title, url, link_type, summary_text, tag) + # Add to Tududi + add_to_tududi(title, url, link_type, summary_text, tag) - # Format response for Discord + # Format response response_text = f"šŸ“Œ **{link_type}**: {title}" if summary_text: response_text += f"\n\nšŸ’” {summary_text}" if tag: - response_text += f"\n\nšŸ·ļø Tag: `{tag}`" + response_text += f"\n\nšŸ·ļø `{tag}`" - logger.debug(f"Posting response: {response_text}") + logger.debug(f"Posting response...") # Post in channel await message.reply(response_text, mention_author=False) @@ -518,21 +315,25 @@ class LinkAnalyzerBot(discord.Client): "author": str(message.author), "message_id": message.id, "date": datetime.now().isoformat(), - "analysis": analysis_data, - "tududi": tududi_ok, - "fetch_status": fetch_result["status"] + "summary": summary_text, + "tag": tag }) logger.info(f"āœ“ Processed: {url}") except Exception as e: - logger.error(f"āŒ Error processing {url}: {e}") - await message.reply(f"āŒ Error analyzing link: {e}", mention_author=False) + logger.error(f"āŒ Error: {e}") + import traceback + logger.error(traceback.format_exc()) + try: + await message.reply(f"āŒ Error: {str(e)[:100]}", mention_author=False) + except: + pass - # Update processed IDs + # Update tracker tracker["processed_message_ids"].append(message.id) save_tracker(tracker) - logger.info(f"Updated tracker, total links: {len(tracker['links'])}") + logger.info(f"Updated tracker: {len(tracker['links'])} links total") # Main if __name__ == "__main__":