# link-analyzer/bot.py

#!/usr/bin/env python3
"""
Discord bot for #remora channel - analyzes links in real-time with web_fetch + AI
Posts summaries, adds to Tududi inbox, maintains JSON history + logs
"""

import asyncio
import json
import logging
import os
import re
from datetime import datetime
from html import unescape
from pathlib import Path
from urllib.parse import urlparse

import discord
import requests
from dotenv import load_dotenv

# Load .env file
load_dotenv()

# Setup logging
# Log to a file next to this script and mirror every record to the console.
log_file = Path(__file__).parent / "bot.log"
logging.basicConfig(
    level=logging.DEBUG,
    format='[%(asctime)s] [%(levelname)-8s] %(message)s',
    handlers=[
        # Log messages contain emoji; pin UTF-8 so writing them does not
        # depend on the platform's default file encoding.
        logging.FileHandler(log_file, encoding="utf-8"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Config
# Channel the bot watches, and the JSON history file kept next to this script.
CHANNEL_ID = 1467557082583535729
TRACKER_FILE = Path(__file__).with_name("tracker.json")

# Service endpoints and credentials, read from the environment (.env).
TUDUDI_API_URL = os.environ.get("TUDUDI_API_URL", "https://todo.dilain.com/api/v1")
TUDUDI_API_KEY = os.environ.get("TUDUDI_API_KEY")
GATEWAY_URL = os.environ.get("OPENCLAW_GATEWAY", "http://127.0.0.1:18789")
GATEWAY_TOKEN = os.environ.get("OPENCLAW_GATEWAY_TOKEN", "")

# Startup banner: one log record per line, framed by two rules.
_BANNER_RULE = "=" * 60
for _banner_line in (
    _BANNER_RULE,
    "Bot startup",
    f"  Channel ID: {CHANNEL_ID}",
    f"  Tududi API: {TUDUDI_API_URL}",
    f"  Gateway: {GATEWAY_URL}",
    _BANNER_RULE,
):
    logger.info(_banner_line)

# Load or init tracker
def load_tracker():
    """Load the persisted tracker state, or build a fresh one.

    Returns:
        A dict with the keys "channel_id", "processed_message_ids" (list)
        and "links" (list). Keys missing from an existing file are filled
        with their defaults so callers can index them directly.

    Raises:
        json.JSONDecodeError: if TRACKER_FILE exists but is not valid JSON.
    """
    tracker = {
        "channel_id": CHANNEL_ID,
        "processed_message_ids": [],
        "links": []
    }
    if TRACKER_FILE.exists():
        with open(TRACKER_FILE, encoding="utf-8") as f:
            # Stored values win; defaults only fill the gaps.
            tracker.update(json.load(f))
    return tracker

def save_tracker(data):
    """Persist the tracker state to TRACKER_FILE as indented JSON.

    The JSON is first written to a sibling temporary file and then moved
    over the real file, so a crash or a serialization error part-way
    through cannot leave a truncated tracker behind.

    Args:
        data: JSON-serializable tracker dict.
    """
    tmp_file = TRACKER_FILE.with_name(TRACKER_FILE.name + ".tmp")
    with open(tmp_file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
    tmp_file.replace(TRACKER_FILE)

# Detect links in text
def extract_urls(text):
    """Return the distinct http(s) URLs found in ``text``, in order of appearance.

    Punctuation that belongs to the surrounding sentence rather than the link
    (a trailing ``.``, ``,``, ``;``, ``:``, ``!``, ``?``, ``'`` or an
    unbalanced closing parenthesis) is trimmed, and a URL that occurs more
    than once is returned only once.

    Args:
        text: Message text to scan; ``None`` or an empty string yields ``[]``.

    Returns:
        List of URL strings.
    """
    url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+'
    trailing = ".,;:!?'"

    urls = []
    seen = set()
    for candidate in re.findall(url_pattern, text or ""):
        url = candidate.rstrip(trailing)
        # Drop a ")" only when it has no matching "(" inside the URL, so
        # links such as .../wiki/Foo_(bar) stay intact.
        while url.endswith(")") and url.count(")") > url.count("("):
            url = url[:-1].rstrip(trailing)
        if url not in seen:
            seen.add(url)
            urls.append(url)
    return urls

# Detect link type
def detect_link_type(url):
    """Classify a URL by the site it points to.

    The host must be the known domain itself or one of its subdomains.
    Plain substring matching is not used because unrelated hosts can contain
    a known domain as a substring (e.g. "netflix.com" contains "x.com").

    Args:
        url: Absolute URL string.

    Returns:
        One of "GitHub", "Reddit", "YouTube", "TikTok", "Twitter/X",
        "Medium", "Dev.to", "arXiv", or "Article" for anything else.
    """
    # .hostname drops any port / userinfo and is already lower-cased.
    host = urlparse(url).hostname or ""

    known_sites = (
        (("github.com",), "GitHub"),
        (("reddit.com",), "Reddit"),
        (("youtube.com", "youtu.be"), "YouTube"),
        (("tiktok.com",), "TikTok"),
        (("twitter.com", "x.com"), "Twitter/X"),
        (("medium.com",), "Medium"),
        (("dev.to",), "Dev.to"),
        (("arxiv.org",), "arXiv"),
    )
    for domains, label in known_sites:
        if any(host == domain or host.endswith("." + domain) for domain in domains):
            return label
    return "Article"

# Fetch URL content using requests
def fetch_url_content(url):
    """Fetch a URL and extract its title and description.

    Only the first 4000 characters of the response body are inspected.
    The title is taken from the first of <title>, og:title or <h1> that
    yields non-empty text, falling back to the last path segment of the URL
    (or "Untitled"). HTML entities in the title and description are decoded
    and whitespace runs inside the title are collapsed, so the text can be
    shown to users as-is.

    Args:
        url: Absolute URL to fetch.

    Returns:
        A dict that always contains "title", "description", "content" and
        "status". "status" is "ok", "timeout", "http_error" or "error";
        the "error" case additionally carries the message under "error".
        This function does not raise: every failure is reported via "status".
    """
    logger.debug(f"  📥 Fetching: {url}")

    try:
        response = requests.get(
            url,
            timeout=8,
            headers={
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
                'Accept': 'text/html,application/xhtml+xml'
            },
            allow_redirects=True
        )
        response.raise_for_status()
        content = response.text[:4000]  # First 4k chars

        # Title candidates in priority order: <title>, og:title (GitHub etc.),
        # then the first <h1>.
        title_patterns = (
            r'<title[^>]*>\s*([^<]+?)\s*</title>',
            r'<meta\s+property="og:title"\s+content="([^"]+)"',
            r'<h1[^>]*>([^<]+)</h1>',
        )
        title = None
        for pattern in title_patterns:
            match = re.search(pattern, content, re.IGNORECASE)
            if match:
                # Decode entities (&amp; -> &) and flatten multi-line titles.
                title = re.sub(r'\s+', ' ', unescape(match.group(1))).strip()
                if title:
                    break

        # Fallback
        if not title:
            title = url.split('/')[-1] or "Untitled"

        # Description: standard meta tag first, then og:description.
        description_patterns = (
            r'<meta\s+name="description"\s+content="([^"]+)"',
            r'<meta\s+property="og:description"\s+content="([^"]+)"',
        )
        description = ""
        for pattern in description_patterns:
            match = re.search(pattern, content, re.IGNORECASE)
            if match:
                description = unescape(match.group(1))
                if description:
                    break

        logger.debug(f"    ✓ Fetched: {title}")
        return {
            "title": title,
            "description": description,
            "content": content,
            "status": "ok"
        }
    except requests.Timeout:
        logger.warning(f"    ⏱️ Timeout: {url}")
        return {"title": "Request timeout", "description": "", "status": "timeout", "content": ""}
    except requests.HTTPError as e:
        logger.warning(f"    ❌ HTTP {e.response.status_code}: {url}")
        return {"title": f"HTTP {e.response.status_code}", "description": "", "status": "http_error", "content": ""}
    except Exception as e:
        # Best-effort boundary: any other failure is reported, not raised.
        logger.error(f"    ❌ Error: {e}")
        return {"title": "Fetch failed", "description": "", "status": "error", "error": str(e), "content": ""}

# Get GitHub repo info from API
def get_github_content(url):
    """Fetch descriptive text for a GitHub repository via the GitHub API.

    The README is requested first (as raw markdown) and reduced to plain
    text. If it is missing or shorter than about 50 characters after
    cleaning, the repository's description, topics and language are used
    instead.

    Args:
        url: A github.com URL. Owner and repository are taken from the first
            two path segments; a query string, fragment or ".git" suffix is
            not treated as part of the repository name.

    Returns:
        Up to 2000 characters of README text, or a short metadata string,
        or None when the URL does not name a repository or nothing usable
        could be retrieved. API errors are logged and yield None.
    """
    logger.debug(f"    Fetching GitHub repo info from API")

    # Stop at "/", "?" or "#" so ".../repo?tab=readme" and ".../repo#usage"
    # resolve to "repo".
    match = re.search(r'github\.com/([^/?#]+)/([^/?#]+)', url)
    if not match:
        return None

    owner, repo = match.groups()
    if repo.endswith(".git"):
        # Clone URLs carry a ".git" suffix the API does not expect.
        repo = repo[:-len(".git")]

    try:
        # Try to get README as raw markdown
        response = requests.get(
            f"https://api.github.com/repos/{owner}/{repo}/readme",
            headers={"Accept": "application/vnd.github.v3.raw"},
            timeout=5
        )

        if response.status_code == 200:
            content = response.text
            # Clean markdown: remove images, code blocks, links
            content = re.sub(r'!\[.*?\]\(.*?\)', '', content)  # Remove images
            content = re.sub(r'```.*?```', '', content, flags=re.DOTALL)  # Remove code blocks
            content = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', content)  # Convert markdown links to text
            content = re.sub(r'#{1,6}\s+', '', content)  # Remove headers
            content = re.sub(r'[*_-]{3,}', '', content)  # Remove horizontal rules
            content = re.sub(r'\s+', ' ', content).strip()  # Clean whitespace

            if len(content) > 50:
                logger.debug(f"    Got README: {len(content)} chars after cleaning")
                return content[:2000]

        # Fallback: get repo info JSON
        response = requests.get(
            f"https://api.github.com/repos/{owner}/{repo}",
            timeout=5
        )

        if response.status_code == 200:
            data = response.json()
            # Collect useful info
            parts = []
            if data.get("description"):
                parts.append(data["description"])
            if data.get("topics"):
                parts.append(f"Topics: {', '.join(data['topics'][:3])}")
            if data.get("language"):
                parts.append(f"Language: {data['language']}")

            content = " ".join(parts)
            if content:
                logger.debug(f"    Got repo info: {len(content)} chars")
                return content

    except Exception as e:
        # Best effort: the caller falls back to HTML extraction on None.
        logger.warning(f"    GitHub API error: {e}")

    return None

# Extract clean text from HTML
def extract_text_from_html(html):
    """Extract readable text from an HTML string using regex heuristics.

    Scripts, styles, comments and navigation-like sections are dropped. If
    a complete <main> or <article> element (with or without attributes) is
    present, only that element is kept. Remaining tags are stripped, HTML
    entities are decoded, and whitespace is collapsed.

    Args:
        html: HTML markup as a string.

    Returns:
        Up to 2000 characters of text, or None when fewer than 100
        characters of text remain.
    """
    # Remove DOCTYPE, comments
    text = re.sub(r'<!DOCTYPE[^>]*>', '', html, flags=re.IGNORECASE)
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)

    # Remove scripts, styles, noscript
    text = re.sub(r'<script[^>]*>.*?</script>', ' ', text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r'<style[^>]*>.*?</style>', ' ', text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r'<noscript[^>]*>.*?</noscript>', ' ', text, flags=re.DOTALL | re.IGNORECASE)

    # Try to extract main content areas first. Only the tag name is captured,
    # so the closing-tag backreference also works when the opening tag
    # carries attributes (e.g. <article class="post">).
    main_match = re.search(r'<(main|article)\b[^>]*>.*?</\1\s*>', text, flags=re.DOTALL | re.IGNORECASE)
    if main_match:
        text = main_match.group(0)

    # Remove common nav/footer/sidebar patterns
    text = re.sub(r'<(nav|footer|aside|header)[^>]*>.*?</\1>', ' ', text, flags=re.DOTALL | re.IGNORECASE)

    # Remove remaining HTML tags but keep text
    text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
    text = re.sub(r'<p[^>]*>', '\n', text, flags=re.IGNORECASE)
    text = re.sub(r'<h[1-6][^>]*>', '\n', text, flags=re.IGNORECASE)
    text = re.sub(r'</?[^>]+>', ' ', text)

    # Decode all HTML entities (named and numeric). &nbsp; becomes U+00A0,
    # which the whitespace clean-up below folds into a regular space.
    text = unescape(text)

    # Clean up whitespace
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()

    # Return first meaningful chunk
    if len(text) < 100:
        return None

    return text[:2000]  # First 2000 chars of clean text

# Analyze content with Haiku via gateway
def analyze_content(url, title, content, link_type):
    """Summarize a fetched page, preferring the AI gateway over heuristics.

    Text is taken from the GitHub API for GitHub links and otherwise
    extracted from the fetched HTML. It is then sent to the gateway
    configured in GATEWAY_URL. If the gateway answers with a non-200
    status, an empty answer, or times out, a sentence extracted from the
    clean text is used instead.

    Args:
        url: The link being analyzed.
        title: Page title, used in the prompt and as the last-resort summary.
        content: Raw HTML of the page (may be empty).
        link_type: Label produced by detect_link_type().

    Returns:
        A dict with "summary" (str), "tag" (str) and "relevance" (always
        "relevant"). The tag comes from get_tag_from_type() on every path
        except the unexpected-error path, which uses "interesting".
        This function does not raise.
    """
    logger.debug(f"  🤖 Analyzing content: {url}")

    # Bound before the try so the fallback handlers can use whatever clean
    # text was extracted before the failure.
    clean_text = None
    try:
        # Special handling for GitHub
        if link_type == "GitHub":
            clean_text = get_github_content(url)

        # Fallback: extract from HTML
        if not clean_text:
            clean_text = extract_text_from_html(content)

        if not clean_text:
            logger.warning(f"    Could not extract content")
            return {
                # Only label the link as a GitHub project when it is one.
                "summary": f"GitHub project: {title}" if link_type == "GitHub" else title,
                "tag": get_tag_from_type(link_type),
                "relevance": "relevant"
            }

        logger.debug(f"    Extracted {len(clean_text)} chars of content")

        # Build analysis prompt
        prompt = f"""Analyze this webpage and create a brief summary for Laurent.

**Title**: {title}
**URL**: {url}
**Link Type**: {link_type}

**Content** (first 1500 chars):
{clean_text[:1500]}

---

Create a 2-3 sentence summary that answers:
1. What is this page about?
2. Why would Laurent find it useful?

Keep it practical and concise. Do NOT include the URL or title in the summary.
"""

        # Call gateway with a simple POST (honours the OPENCLAW_GATEWAY setting)
        logger.debug(f"    Sending to gateway for analysis...")
        response = requests.post(
            f"{GATEWAY_URL.rstrip('/')}/sessions/turn",
            json={
                "message": prompt,
                "session": "main"
            },
            timeout=15,
            headers={"Authorization": f"Bearer {GATEWAY_TOKEN}"} if GATEWAY_TOKEN else {}
        )

        if response.status_code == 200:
            result = response.json()
            # Extract the summary from response
            summary = result.get("message", "") or result.get("content", "")
            if isinstance(summary, list):
                summary = summary[0].get("text", "") if summary else ""
            summary = summary.strip()[:300] if isinstance(summary, str) else ""

            if summary:
                logger.info(f"    ✓ Got summary from gateway: {summary[:60]}")
                return {
                    "summary": summary,
                    "tag": get_tag_from_type(link_type),
                    "relevance": "relevant"
                }
            logger.warning("    Gateway returned no summary text, falling back to heuristic")
        else:
            logger.warning(f"    Gateway error {response.status_code}, falling back to heuristic")

        # Fallback: use simple heuristic
        return {
            "summary": extract_simple_summary(clean_text, title, link_type),
            "tag": get_tag_from_type(link_type),
            "relevance": "relevant"
        }

    except requests.Timeout:
        logger.warning(f"    Gateway timeout, using fallback")
        return {
            # Summarize the cleaned text, not the raw HTML.
            "summary": extract_simple_summary(clean_text or "", title, link_type),
            "tag": get_tag_from_type(link_type),
            "relevance": "relevant"
        }
    except Exception as e:
        # logger.exception records the traceback along with the message.
        logger.exception(f"    Analysis error: {e}")
        return {
            "summary": title,
            "tag": "interesting",
            "relevance": "relevant"
        }

def extract_simple_summary(text, title, link_type):
    """Heuristic summary: the first reasonably sized sentence of ``text``.

    The text is split on '.', '!' and '?'. The first fragment whose trimmed
    length is strictly between 20 and 300 characters is returned, cut to
    200 characters. ``title`` is returned when no fragment qualifies.
    ``link_type`` is accepted for signature compatibility and is not used.
    """
    fragments = (piece.strip() for piece in re.split(r'[.!?]', text))
    chosen = next((frag for frag in fragments if 20 < len(frag) < 300), None)
    if chosen is None:
        return title
    return chosen[:200]

def get_tag_from_type(link_type):
    """Map a link type label to its inbox tag; unknown types get "to-read"."""
    tag_for = {
        **dict.fromkeys(("Medium", "Dev.to"), "article"),
        **dict.fromkeys(("Reddit", "Twitter/X"), "discussion"),
        "GitHub": "project",
        "YouTube": "video",
        "arXiv": "learning",
    }
    try:
        return tag_for[link_type]
    except KeyError:
        return "to-read"

# Send to Tududi inbox
def add_to_tududi(title, url, link_type, summary="", tag=""):
    """Create a Tududi inbox item for a link.

    The item text holds the link type, title and URL, followed by the
    summary and tag sections when those are non-empty.

    Returns:
        True when the API answers 200 or 201. False when no API key is
        configured, the API answers with another status, or the request
        raises; each failure is logged.
    """
    logger.debug(f"  📌 Adding to Tududi: {title}")

    try:
        if not TUDUDI_API_KEY:
            logger.warning("    TUDUDI_API_KEY not set")
            return False

        # Assemble the inbox text section by section.
        sections = [f"📌 **{link_type}**: {title}\n🔗 {url}"]
        if summary:
            sections.append(f"\n\n💡 **Summary**:\n{summary}")
        if tag:
            sections.append(f"\n\n🏷️ **Tag**: {tag}")
        content = "".join(sections)

        auth_headers = {
            "Authorization": f"Bearer {TUDUDI_API_KEY}",
            "Content-Type": "application/json"
        }
        response = requests.post(
            f"{TUDUDI_API_URL}/inbox",
            headers=auth_headers,
            json={"content": content},
            timeout=5
        )

        accepted = response.status_code in (200, 201)  # 200 or 201 are both OK
        if not accepted:
            logger.warning(f"    Tududi error: {response.status_code}")
            return False

        logger.info(f"    ✓ Added to Tududi inbox with tag: {tag}")
        return True
    except Exception as e:
        logger.error(f"    Tududi error: {e}")
        return False

# Discord bot
# Start from the default intents and additionally enable message content:
# on_message reads message.content to find URLs.
# NOTE(review): message content is presumably a privileged intent that must
# also be enabled for the bot in the Discord developer portal - confirm.
intents = discord.Intents.default()
intents.message_content = True

class LinkAnalyzerBot(discord.Client):
    """Discord client that summarizes links posted in the watched channel.

    For every URL in a new message in CHANNEL_ID the bot fetches the page,
    analyzes it, files it in the Tududi inbox, replies in the channel, and
    records the result in the JSON tracker.
    """

    # Maximum number of characters Discord accepts in one message.
    MAX_REPLY_LENGTH = 2000

    async def on_ready(self):
        """Log the account the bot logged in as and the channel it watches."""
        logger.info(f"✅ Bot logged in as {self.user}")
        logger.info(f"📍 Watching channel #remora ({CHANNEL_ID})")

    async def on_message(self, message):
        """Process every URL of a new message in the watched channel.

        Messages from the bot itself, from other channels, without URLs, or
        already listed in the tracker are ignored. A failure on one URL is
        logged and reported in the channel without stopping the others.
        """
        # Ignore bot's own messages
        if message.author == self.user:
            return

        # Only process #remora channel
        if message.channel.id != CHANNEL_ID:
            return

        # Check for URLs
        urls = extract_urls(message.content)
        if not urls:
            logger.debug(f"No URLs in message from {message.author}")
            return

        # Skip if already processed
        if message.id in load_tracker()["processed_message_ids"]:
            logger.debug(f"Skipping already-processed message {message.id}")
            return

        logger.info(f"🔗 New link(s) from {message.author}: {message.content}")

        # Process each URL, collecting the tracker records locally.
        new_links = []
        for url in urls:
            try:
                new_links.append(await self._process_url(message, url))
            except Exception as e:
                logger.error(f"❌ Error processing {url}: {e}")
                try:
                    await message.reply(
                        self._clip(f"❌ Error analyzing link: {e}"),
                        mention_author=False,
                    )
                except discord.HTTPException as reply_error:
                    # The error report is best effort; a failed reply must
                    # not abort the remaining URLs or the tracker update.
                    logger.error(f"❌ Could not post error reply: {reply_error}")

        # Re-read the tracker right before writing. Other on_message calls
        # may have saved while this one was awaiting, and there is no await
        # between this load and the save, so their entries are preserved.
        tracker = load_tracker()
        tracker["links"].extend(new_links)
        if message.id not in tracker["processed_message_ids"]:
            tracker["processed_message_ids"].append(message.id)
        save_tracker(tracker)
        logger.info(f"Updated tracker, total links: {len(tracker['links'])}")

    async def _process_url(self, message, url):
        """Analyze one URL, reply to ``message``, and return its tracker record."""
        # The fetch / analyze / Tududi helpers make blocking HTTP calls. They
        # run in the default executor so the event loop is not blocked.
        loop = asyncio.get_running_loop()

        logger.info(f"Processing: {url}")
        link_type = detect_link_type(url)

        # Fetch content
        fetch_result = await loop.run_in_executor(None, fetch_url_content, url)
        title = fetch_result["title"]

        # Analyze content if fetch was successful
        analysis_data = None
        logger.debug(f"  📊 Fetch status: {fetch_result['status']}")

        if fetch_result["status"] == "ok":
            logger.debug(f"  🔍 Starting analysis...")
            analysis_data = await loop.run_in_executor(
                None, analyze_content, url, title, fetch_result.get("content", ""), link_type
            )
            logger.debug(f"  Analysis result: {analysis_data}")
        else:
            logger.debug(f"  ⚠️ Fetch failed, skipping analysis")

        # Prepare summary for Tududi
        summary_text = ""
        tag = "interesting"
        if analysis_data:
            summary_text = analysis_data.get("summary", "")
            tag = analysis_data.get("tag", "interesting")
            logger.debug(f"  ✓ Got summary: {summary_text[:80]}")
        else:
            logger.warning(f"  ❌ No analysis data returned")

        # Add to Tududi with summary
        tududi_ok = await loop.run_in_executor(
            None, add_to_tududi, title, url, link_type, summary_text, tag
        )

        # Format response for Discord
        response_text = f"📌 **{link_type}**: {title}"
        if summary_text:
            response_text += f"\n\n💡 {summary_text}"
        if tag:
            response_text += f"\n\n🏷️ Tag: `{tag}`"

        logger.debug(f"Posting response: {response_text}")

        # Post in channel (clipped: page titles have no length bound)
        await message.reply(self._clip(response_text), mention_author=False)

        logger.info(f"✓ Processed: {url}")
        return {
            "url": url,
            "title": title,
            "type": link_type,
            "author": str(message.author),
            "message_id": message.id,
            "date": datetime.now().isoformat(),
            "analysis": analysis_data,
            "tududi": tududi_ok,
            "fetch_status": fetch_result["status"]
        }

    @classmethod
    def _clip(cls, text):
        """Shorten ``text`` to MAX_REPLY_LENGTH, marking a cut with an ellipsis."""
        if len(text) <= cls.MAX_REPLY_LENGTH:
            return text
        return text[:cls.MAX_REPLY_LENGTH - 1] + "…"

# Main
if __name__ == "__main__":
    # Script entry point: read the bot token from the environment and run
    # the client until it is stopped.
    token = os.getenv("DISCORD_BOT_TOKEN")
    if not token:
        logger.error("❌ DISCORD_BOT_TOKEN not set!")
        # The bare exit() helper is injected by the `site` module and is not
        # guaranteed to exist (e.g. under `python -S`); SystemExit always is.
        raise SystemExit(1)

    logger.info("Starting bot...")
    bot = LinkAnalyzerBot(intents=intents)
    bot.run(token)