Simplify: Send raw content directly to Haiku (let it handle parsing)

This commit is contained in:
Remora
2026-02-09 19:32:25 +01:00
parent 76f564ae83
commit e4984d607d

313
bot.py
View File

@@ -1,7 +1,7 @@
#!/usr/bin/env python3
"""
Discord bot for #remora channel - analyzes links in real-time with web_fetch + AI
Posts summaries, adds to Tududi inbox, maintains JSON history + logs
Discord bot for #remora channel - analyzes links in real-time with Haiku
Fetches content, sends to gateway for AI analysis, adds to Tududi inbox
"""
import discord
@@ -88,9 +88,9 @@ def detect_link_type(url):
else:
return "Article"
# Fetch URL content using requests
# Fetch URL content
def fetch_url_content(url):
"""Fetch URL and return title + excerpt"""
"""Fetch URL and return content"""
logger.debug(f" 📥 Fetching: {url}")
try:
@@ -104,229 +104,78 @@ def fetch_url_content(url):
allow_redirects=True
)
response.raise_for_status()
content = response.text[:4000] # First 4k chars
content = response.text[:5000] # First 5k chars
# Try multiple patterns for title
# Try to find title
title = None
# Pattern 1: <title> tag
title_match = re.search(r'<title[^>]*>\s*([^<]+?)\s*</title>', content, re.IGNORECASE)
if title_match:
title = title_match.group(1).strip()
# Pattern 2: og:title meta tag (for GitHub, etc.)
if not title:
og_match = re.search(r'<meta\s+property="og:title"\s+content="([^"]+)"', content, re.IGNORECASE)
if og_match:
title = og_match.group(1).strip()
# Pattern 3: h1 tag (for GitHub README)
if not title:
h1_match = re.search(r'<h1[^>]*>([^<]+)</h1>', content, re.IGNORECASE)
if h1_match:
title = h1_match.group(1).strip()
# Fallback
if not title:
title = url.split('/')[-1] or "Untitled"
# Extract meta description
desc_match = re.search(r'<meta\s+name="description"\s+content="([^"]+)"', content, re.IGNORECASE)
description = desc_match.group(1) if desc_match else ""
# Extract og:description
if not description:
og_desc = re.search(r'<meta\s+property="og:description"\s+content="([^"]+)"', content, re.IGNORECASE)
if og_desc:
description = og_desc.group(1)
logger.debug(f" ✓ Fetched: {title}")
return {
"title": title,
"description": description,
"content": content,
"status": "ok"
}
except requests.Timeout:
logger.warning(f" ⏱️ Timeout: {url}")
return {"title": "Request timeout", "status": "timeout", "content": ""}
except requests.HTTPError as e:
logger.warning(f" ❌ HTTP {e.response.status_code}: {url}")
return {"title": f"HTTP {e.response.status_code}", "status": "http_error", "content": ""}
except Exception as e:
logger.error(f" ❌ Error: {e}")
return {"title": "Fetch failed", "status": "error", "error": str(e), "content": ""}
return {"title": "Fetch failed", "status": "error", "content": ""}
# Get GitHub repo info from API
def get_github_content(url):
    """Fetch GitHub repo README via API.

    Returns the cleaned README text (at most 2000 chars), or a short info
    string built from the repo's description/topics/language when no usable
    README is found, or None when the URL is not a repo link or the API
    calls fail.
    """
    logger.debug(f"   Fetching GitHub repo info from API")
    # Pull "owner/repo" out of the URL; anything that doesn't match is not a
    # repository link, so there is nothing to fetch.
    match = re.search(r'github\.com/([^/]+)/([^/]+?)(/|$)', url)
    if not match:
        return None
    owner, repo = match.groups()[:2]
    repo = repo.rstrip('/')
    try:
        # Try to get README as raw markdown
        response = requests.get(
            f"https://api.github.com/repos/{owner}/{repo}/readme",
            headers={"Accept": "application/vnd.github.v3.raw"},
            timeout=5
        )
        if response.status_code == 200:
            content = response.text
            # Clean markdown: remove images, code blocks, links
            content = re.sub(r'!\[.*?\]\(.*?\)', '', content)  # Remove images
            content = re.sub(r'```.*?```', '', content, flags=re.DOTALL)  # Remove code blocks
            content = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', content)  # Convert markdown links to text
            content = re.sub(r'#{1,6}\s+', '', content)  # Remove headers
            content = re.sub(r'[*_-]{3,}', '', content)  # Remove horizontal rules
            content = re.sub(r'\s+', ' ', content).strip()  # Clean whitespace
            # Only use the README when enough text survived the cleaning;
            # otherwise fall through to the repo-metadata fallback below.
            if len(content) > 50:
                logger.debug(f"   Got README: {len(content)} chars after cleaning")
                return content[:2000]
        # Fallback: get repo info JSON
        response = requests.get(
            f"https://api.github.com/repos/{owner}/{repo}",
            timeout=5
        )
        if response.status_code == 200:
            data = response.json()
            # Collect useful info
            parts = []
            if data.get("description"):
                parts.append(data["description"])
            if data.get("topics"):
                parts.append(f"Topics: {', '.join(data['topics'][:3])}")
            if data.get("language"):
                parts.append(f"Language: {data['language']}")
            content = " ".join(parts)
            if content:
                logger.debug(f"   Got repo info: {len(content)} chars")
                return content
    except Exception as e:
        # Best-effort: a failed API call just means the caller falls back to
        # generic HTML extraction, so log and return None instead of raising.
        logger.warning(f"   GitHub API error: {e}")
    return None
# Extract clean text from HTML
def extract_text_from_html(html):
    """Extract readable text from HTML.

    Strips scripts/styles/boilerplate, prefers the main content region when
    one can be located, and returns up to 2000 chars of plain text — or None
    when fewer than 100 chars survive (usually a JS-rendered page).
    """
    # Remove DOCTYPE, comments
    text = re.sub(r'<!DOCTYPE[^>]*>', '', html, flags=re.IGNORECASE)
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)
    # Remove scripts, styles, noscript
    text = re.sub(r'<script[^>]*>.*?</script>', ' ', text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r'<style[^>]*>.*?</style>', ' ', text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r'<noscript[^>]*>.*?</noscript>', ' ', text, flags=re.DOTALL | re.IGNORECASE)
    # Try to extract main content areas first.
    # BUG FIX: the old single pattern captured the div's attributes inside
    # group 1, so the back-reference built an impossible closing tag such as
    # </div class="content"> and the div branch could never match. Capture
    # only the tag name for main/article, and match content-class divs with
    # a dedicated pattern whose closing tag is a literal </div>.
    main_match = re.search(r'<(main|article)\b[^>]*>.*?</\1>', text,
                           flags=re.DOTALL | re.IGNORECASE)
    if not main_match:
        main_match = re.search(r'<div\b[^>]*class="[^"]*content[^"]*"[^>]*>.*?</div>', text,
                               flags=re.DOTALL | re.IGNORECASE)
    if main_match:
        text = main_match.group(0)
    # Remove common nav/footer/sidebar patterns
    text = re.sub(r'<(nav|footer|aside|header)[^>]*>.*?</\1>', ' ', text, flags=re.DOTALL | re.IGNORECASE)
    # Turn block-level openers into newlines, then drop every remaining tag
    text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
    text = re.sub(r'<p[^>]*>', '\n', text, flags=re.IGNORECASE)
    text = re.sub(r'<h[1-6][^>]*>', '\n', text, flags=re.IGNORECASE)
    text = re.sub(r'</?[^>]+>', ' ', text)
    # Decode the handful of entities we care about (&amp; last so a literal
    # "&amp;lt;" decodes to "&lt;" and not "<")
    text = text.replace('&nbsp;', ' ')
    text = text.replace('&lt;', '<')
    text = text.replace('&gt;', '>')
    text = text.replace('&amp;', '&')
    # Clean up whitespace
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    # Too little text usually means extraction failed
    if len(text) < 100:
        return None
    return text[:2000]  # First 2000 chars of clean text
# Analyze content with Haiku via gateway
# Analyze with Haiku via gateway
def analyze_content(url, title, content, link_type):
"""Analyze content with AI to create intelligent summary"""
logger.debug(f" 🤖 Analyzing content: {url}")
"""Send raw content to Haiku for analysis"""
logger.debug(f" 🤖 Analyzing with Haiku: {url}")
try:
# Special handling for GitHub
clean_text = None
if link_type == "GitHub":
clean_text = get_github_content(url)
# Simple prompt - send raw content
prompt = f"""Analyze this webpage content quickly.
# Fallback: extract from HTML
if not clean_text:
clean_text = extract_text_from_html(content)
if not clean_text:
logger.warning(f" Could not extract content")
return {
"summary": f"GitHub project: {title}",
"tag": "project",
"relevance": "relevant"
}
logger.debug(f" Extracted {len(clean_text)} chars of content")
# Build analysis prompt
prompt = f"""Analyze this webpage and create a brief summary for Laurent.
**Title**: {title}
**URL**: {url}
**Link Type**: {link_type}
**Title**: {title}
**Content** (first 1500 chars):
{clean_text[:1500]}
**RAW PAGE CONTENT** (first 3000 chars):
{content[:3000]}
---
Create a 2-3 sentence summary that answers:
1. What is this page about?
2. Why would Laurent find it useful?
Write a 2-3 sentence summary explaining:
1. What is this about?
2. Why would this be useful?
Keep it practical and concise. Do NOT include the URL or title in the summary.
"""
Be concise. Skip marketing. No URLs or titles in summary."""
# Call gateway with a simple POST
logger.debug(f" Sending to gateway for analysis...")
logger.debug(f" Sending to Haiku...")
response = requests.post(
"http://127.0.0.1:18789/sessions/turn",
f"{GATEWAY_URL}/sessions/turn",
json={
"message": prompt,
"session": "main"
},
timeout=15,
timeout=20,
headers={"Authorization": f"Bearer {GATEWAY_TOKEN}"} if GATEWAY_TOKEN else {}
)
if response.status_code == 200:
result = response.json()
# Extract the summary from response
summary = result.get("message", "") or result.get("content", "")
if isinstance(summary, list):
summary = summary[0].get("text", "") if summary else ""
summary = summary.strip()[:300]
summary = result.get("message", "")
if isinstance(summary, list) and summary:
summary = summary[0].get("text", "") if isinstance(summary[0], dict) else summary[0]
summary = str(summary).strip()[:300]
logger.info(f" ✓ Got summary from gateway: {summary[:60]}")
logger.info(f" ✓ Got analysis: {summary[:50]}")
# Determine tag from link type
tag = "to-read"
if link_type == "GitHub":
tag = "project"
@@ -341,61 +190,19 @@ Keep it practical and concise. Do NOT include the URL or title in the summary.
return {
"summary": summary,
"tag": tag,
"relevance": "relevant"
"tag": tag
}
else:
logger.warning(f" Gateway error {response.status_code}, falling back to heuristic")
# Fallback: use simple heuristic
return {
"summary": extract_simple_summary(clean_text, title, link_type),
"tag": get_tag_from_type(link_type),
"relevance": "relevant"
}
logger.warning(f" Gateway error: {response.status_code}")
return None
except requests.Timeout:
logger.warning(f" Gateway timeout, using fallback")
return {
"summary": extract_simple_summary(content, title, link_type),
"tag": get_tag_from_type(link_type),
"relevance": "relevant"
}
except Exception as e:
logger.error(f" Analysis error: {e}")
import traceback
logger.error(traceback.format_exc())
return {
"summary": title,
"tag": "interesting",
"relevance": "relevant"
}
def extract_simple_summary(text, title, link_type):
    """Fallback: extract a simple summary from text"""
    # Walk sentence-sized chunks and return the first one whose length is
    # plausible for a summary; fall back to the title when none qualifies.
    chunks = (chunk.strip() for chunk in re.split(r'[.!?]', text))
    for chunk in chunks:
        if 20 < len(chunk) < 300:
            return chunk[:200]
    return title
def get_tag_from_type(link_type):
    """Get tag based on link type"""
    # Grouped by tag rather than a flat mapping; unknown types default
    # to the generic "to-read" tag.
    for tag, types in (
        ("project", ("GitHub",)),
        ("video", ("YouTube",)),
        ("discussion", ("Reddit", "Twitter/X")),
        ("article", ("Medium", "Dev.to")),
        ("learning", ("arXiv",)),
    ):
        if link_type in types:
            return tag
    return "to-read"
return None
# Send to Tududi inbox
def add_to_tududi(title, url, link_type, summary="", tag=""):
"""Add to Tududi inbox with intelligent summary"""
"""Add to Tududi inbox"""
logger.debug(f" 📌 Adding to Tududi: {title}")
try:
@@ -403,14 +210,11 @@ def add_to_tududi(title, url, link_type, summary="", tag=""):
logger.warning(" TUDUDI_API_KEY not set")
return False
# Format the inbox content
content = f"📌 **{link_type}**: {title}\n🔗 {url}"
if summary:
content += f"\n\n💡 **Summary**:\n{summary}"
content += f"\n\n💡 {summary}"
if tag:
content += f"\n\n🏷️ **Tag**: {tag}"
content += f"\n\n🏷️ {tag}"
response = requests.post(
f"{TUDUDI_API_URL}/inbox",
@@ -422,8 +226,8 @@ def add_to_tududi(title, url, link_type, summary="", tag=""):
timeout=5
)
if response.status_code in [200, 201]: # 200 or 201 are both OK
logger.info(f" ✓ Added to Tududi inbox with tag: {tag}")
if response.status_code in [200, 201]:
logger.info(f" ✓ Added to Tududi")
return True
else:
logger.warning(f" Tududi error: {response.status_code}")
@@ -474,38 +278,31 @@ class LinkAnalyzerBot(discord.Client):
fetch_result = fetch_url_content(url)
title = fetch_result["title"]
# Analyze content if fetch was successful
# Analyze with Haiku
analysis_data = None
logger.debug(f" 📊 Fetch status: {fetch_result['status']}")
if fetch_result["status"] == "ok":
logger.debug(f" 🔍 Starting analysis...")
analysis_data = analyze_content(url, title, fetch_result.get("content", ""), link_type)
logger.debug(f" Analysis result: {analysis_data}")
else:
logger.debug(f" ⚠️ Fetch failed, skipping analysis")
logger.debug(f" Analyzing...")
analysis_data = analyze_content(url, title, fetch_result["content"], link_type)
# Prepare summary for Tududi
# Prepare summary
summary_text = ""
tag = "interesting"
if analysis_data:
summary_text = analysis_data.get("summary", "")
tag = analysis_data.get("tag", "interesting")
logger.debug(f" ✓ Got summary: {summary_text[:80]}")
else:
logger.warning(f" ❌ No analysis data returned")
logger.debug(f" Summary: {summary_text[:60]}")
# Add to Tududi with summary
tududi_ok = add_to_tududi(title, url, link_type, summary_text, tag)
# Add to Tududi
add_to_tududi(title, url, link_type, summary_text, tag)
# Format response for Discord
# Format response
response_text = f"📌 **{link_type}**: {title}"
if summary_text:
response_text += f"\n\n💡 {summary_text}"
if tag:
response_text += f"\n\n🏷️ Tag: `{tag}`"
response_text += f"\n\n🏷️ `{tag}`"
logger.debug(f"Posting response: {response_text}")
logger.debug(f"Posting response...")
# Post in channel
await message.reply(response_text, mention_author=False)
@@ -518,21 +315,25 @@ class LinkAnalyzerBot(discord.Client):
"author": str(message.author),
"message_id": message.id,
"date": datetime.now().isoformat(),
"analysis": analysis_data,
"tududi": tududi_ok,
"fetch_status": fetch_result["status"]
"summary": summary_text,
"tag": tag
})
logger.info(f"✓ Processed: {url}")
except Exception as e:
logger.error(f"❌ Error processing {url}: {e}")
await message.reply(f"❌ Error analyzing link: {e}", mention_author=False)
logger.error(f"❌ Error: {e}")
import traceback
logger.error(traceback.format_exc())
try:
await message.reply(f"❌ Error: {str(e)[:100]}", mention_author=False)
except:
pass
# Update processed IDs
# Update tracker
tracker["processed_message_ids"].append(message.id)
save_tracker(tracker)
logger.info(f"Updated tracker, total links: {len(tracker['links'])}")
logger.info(f"Updated tracker: {len(tracker['links'])} links total")
# Main
if __name__ == "__main__":