From 03999875b52d73f865ca74d71ab5e153522098db Mon Sep 17 00:00:00 2001 From: Remora Date: Mon, 9 Feb 2026 18:46:45 +0100 Subject: [PATCH] Feat: Add web_fetch, AI analysis (Haiku), and comprehensive logging --- bot.py | 253 +++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 184 insertions(+), 69 deletions(-) diff --git a/bot.py b/bot.py index 0399a90..a375a32 100644 --- a/bot.py +++ b/bot.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """ -Discord bot for #remora channel - analyzes links in real-time -Posts summaries, adds to Tududi inbox, maintains JSON history +Discord bot for #remora channel - analyzes links in real-time with web_fetch + AI +Posts summaries, adds to Tududi inbox, maintains JSON history + logs """ import discord @@ -12,17 +12,38 @@ import requests from datetime import datetime from pathlib import Path from dotenv import load_dotenv +import logging +from urllib.parse import urlparse # Load .env file load_dotenv() +# Setup logging +log_file = Path(__file__).parent / "bot.log" +logging.basicConfig( + level=logging.DEBUG, + format='[%(asctime)s] [%(levelname)-8s] %(message)s', + handlers=[ + logging.FileHandler(log_file), + logging.StreamHandler() + ] +) +logger = logging.getLogger(__name__) + # Config CHANNEL_ID = 1467557082583535729 TRACKER_FILE = Path(__file__).parent / "tracker.json" TUDUDI_API_URL = os.getenv("TUDUDI_API_URL", "https://todo.dilain.com/api/v1") TUDUDI_API_KEY = os.getenv("TUDUDI_API_KEY") GATEWAY_URL = os.getenv("OPENCLAW_GATEWAY", "http://127.0.0.1:18789") -GATEWAY_TOKEN = os.getenv("OPENCLAW_GATEWAY_TOKEN") +GATEWAY_TOKEN = os.getenv("OPENCLAW_GATEWAY_TOKEN", "") + +logger.info("=" * 60) +logger.info("Bot startup") +logger.info(f" Channel ID: {CHANNEL_ID}") +logger.info(f" Tududi API: {TUDUDI_API_URL}") +logger.info(f" Gateway: {GATEWAY_URL}") +logger.info("=" * 60) # Load or init tracker def load_tracker(): @@ -44,56 +65,127 @@ def extract_urls(text): url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+' return 
re.findall(url_pattern, text) -# Fetch and analyze URL -def analyze_url(url): - """Fetch URL and create summary""" +# Detect link type +def detect_link_type(url): + domain = urlparse(url).netloc.lower() + + if "github.com" in domain: + return "GitHub" + elif "reddit.com" in domain: + return "Reddit" + elif "youtube.com" in domain or "youtu.be" in domain: + return "YouTube" + elif "tiktok.com" in domain: + return "TikTok" + elif "twitter.com" in domain or "x.com" in domain: + return "Twitter/X" + elif "medium.com" in domain: + return "Medium" + elif "dev.to" in domain: + return "Dev.to" + elif "arxiv.org" in domain: + return "arXiv" + else: + return "Article" + +# Fetch URL content using requests +def fetch_url_content(url): + """Fetch URL and return title + excerpt""" + logger.debug(f" šŸ“„ Fetching: {url}") + + try: - print(f" šŸ“„ Fetching: {url}") - response = requests.get(url, timeout=5, headers={ - 'User-Agent': 'Mozilla/5.0' - }) - content = response.text[:2000] # First 2k chars + response = requests.get( + url, + timeout=5, + headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'}, + allow_redirects=True + ) + response.raise_for_status() + content = response.text[:3000] # First 3k chars # Extract title title_match = re.search(r'<title[^>]*>([^<]+)</title>', content, re.IGNORECASE) - title = title_match.group(1).strip() if title_match else url.split('/')[-1] + title = title_match.group(1).strip() if title_match else "No title found" - # Simple content type detection - link_type = "webpage" - if "github.com" in url: - link_type = "GitHub" - elif "reddit.com" in url: - link_type = "Reddit" - elif "youtube.com" in url or "youtu.be" in url: - link_type = "YouTube" - elif "tiktok.com" in url: - link_type = "TikTok" - elif "twitter.com" in url or "x.com" in url: - link_type = "Twitter/X" + # Extract meta description + desc_match = re.search(r'