Feature: Real AI-powered summaries via gateway (explains content + utility)

2026-02-09 19:17:37 +01:00
parent e42e5ca563
commit d0ca96191c
1 changed file with 110 additions and 72 deletions
@@ -157,99 +157,137 @@ def fetch_url_content(url):
        logger.error(f"    ❌ Error: {e}")
        return {"title": "Fetch failed", "status": "error", "error": str(e), "content": ""}

-# Analyze content (local heuristic-based)
+# Extract clean text from HTML
+def extract_text_from_html(html):
+    """Extract readable text from HTML"""
+    # Remove scripts and styles
+    text = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.IGNORECASE)
+    text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.DOTALL | re.IGNORECASE)
+    # Remove HTML tags
+    text = re.sub(r'<[^>]+>', ' ', text)
+    # Clean up whitespace
+    text = re.sub(r'\s+', ' ', text)
+    text = text.strip()
+    return text[:2000]  # First 2000 chars of clean text
+
+# Analyze content with Haiku via gateway
 def analyze_content(url, title, content, link_type):
-    """Analyze content and suggest summary + tag locally"""
+    """Analyze content with AI to create intelligent summary"""
    logger.debug(f"  🤖 Analyzing content: {url}")
-    logger.debug(f"    Content length: {len(content)} chars")
-    logger.debug(f"    Link type: {link_type}")
    
    try:
-        # Extract useful info from HTML content
-        description = ""
+        # Extract clean text
+        clean_text = extract_text_from_html(content)
+        logger.debug(f"    Extracted {len(clean_text)} chars of clean text")
        
-        # Looking for meta description
-        desc_match = re.search(r'<meta\s+name="description"\s+content="([^"]+)"', content, re.IGNORECASE)
-        if desc_match:
-            description = desc_match.group(1).strip()
-            logger.debug(f"    Found meta description: {description[:80]}")
+        # Build analysis prompt
+        prompt = f"""Analyze this webpage and create a brief summary for Laurent.

-        # Looking for og:description
-        if not description:
-            og_desc = re.search(r'<meta\s+property="og:description"\s+content="([^"]+)"', content, re.IGNORECASE)
-            if og_desc:
-                description = og_desc.group(1).strip()
-                logger.debug(f"    Found og:description: {description[:80]}")
+**Title**: {title}
+**URL**: {url}
+**Link Type**: {link_type}

-        # Looking for first paragraph after title
-        if not description:
-            p_match = re.search(r'<p[^>]*>([^<]+)</p>', content, re.IGNORECASE)
-            if p_match:
-                description = p_match.group(1).strip()[:200]
+**Content** (first 1500 chars):
+{clean_text[:1500]}

-        # Determine tag based on content + URL + type
-        tag = "interesting"
-        summary = ""
+---

+Create a 2-3 sentence summary that answers:
+1. What is this page about?
+2. Why would Laurent find it useful?
+
+Keep it practical and concise. Do NOT include the URL or title in the summary.
+"""
+        
+        # Call gateway with a simple POST
+        logger.debug(f"    Sending to gateway for analysis...")
+        response = requests.post(
+            "http://127.0.0.1:18789/sessions/turn",
+            json={
+                "message": prompt,
+                "session": "main"
+            },
+            timeout=15,
+            headers={"Authorization": f"Bearer {GATEWAY_TOKEN}"} if GATEWAY_TOKEN else {}
+        )
+        
+        if response.status_code == 200:
+            result = response.json()
+            # Extract the summary from response
+            summary = result.get("message", "") or result.get("content", "")
+            if isinstance(summary, list):
+                summary = summary[0].get("text", "") if summary else ""
+            summary = summary.strip()[:300]
+            
+            logger.info(f"    ✓ Got summary from gateway: {summary[:60]}")
+            
+            # Determine tag from link type
+            tag = "to-read"
            if link_type == "GitHub":
                tag = "project"
-            summary = f"GitHub repository: {title}"
-            # Try to extract more info from README
-            readme_match = re.search(r'README[^<]*</h[1-3]>[^<]*<p[^>]*>([^<]+)', content, re.IGNORECASE)
-            if readme_match:
-                summary += f". {readme_match.group(1)[:100]}"
-        
            elif link_type == "YouTube":
                tag = "video"
-            summary = f"Video: {title}"
-            if description:
-                summary += f". {description[:80]}"
-        
            elif link_type == "Reddit":
                tag = "discussion"
-            summary = f"Reddit discussion: {title}"
-        
-        elif link_type == "Medium" or link_type == "Dev.to":
+            elif link_type in ["Medium", "Dev.to"]:
                tag = "article"
-            summary = f"Article: {title}"
-            if description:
-                summary += f". {description[:80]}"
-        
            elif link_type == "arXiv":
                tag = "learning"
-            summary = f"Research paper: {title}"
            
-        else:
-            # Generic web article
-            tag = "to-read"
-            summary = title
-            if description:
-                summary += f". {description[:100]}"
-        
-        # Truncate summary to reasonable length
-        summary = summary[:200]
-        
-        logger.info(f"    ✓ Analysis complete - Tag: {tag}, Summary: {summary[:60]}")
-        
-        result = {
+            return {
                "summary": summary,
                "tag": tag,
                "relevance": "relevant"
            }
-        logger.debug(f"    Returning: {result}")
-        return result
+        else:
+            logger.warning(f"    Gateway error {response.status_code}, falling back to heuristic")
+            # Fallback: use simple heuristic
+            return {
+                "summary": extract_simple_summary(clean_text, title, link_type),
+                "tag": get_tag_from_type(link_type),
+                "relevance": "relevant"
+            }
        
+    except requests.Timeout:
+        logger.warning(f"    Gateway timeout, using fallback")
+        return {
+            "summary": extract_simple_summary(content, title, link_type),
+            "tag": get_tag_from_type(link_type),
+            "relevance": "relevant"
+        }
    except Exception as e:
        logger.error(f"    Analysis error: {e}")
        import traceback
        logger.error(traceback.format_exc())
-        # Return minimal analysis
        return {
            "summary": title,
            "tag": "interesting",
            "relevance": "relevant"
        }

+def extract_simple_summary(text, title, link_type):
+    """Fallback: extract a simple summary from text"""
+    # Get first non-empty sentence/paragraph
+    sentences = re.split(r'[.!?]', text)
+    for sent in sentences:
+        sent = sent.strip()
+        if len(sent) > 20 and len(sent) < 300:
+            return sent[:200]
+    return title
+
+def get_tag_from_type(link_type):
+    """Get tag based on link type"""
+    tags = {
+        "GitHub": "project",
+        "YouTube": "video",
+        "Reddit": "discussion",
+        "Medium": "article",
+        "Dev.to": "article",
+        "arXiv": "learning",
+        "Twitter/X": "discussion"
+    }
+    return tags.get(link_type, "to-read")
+
 # Send to Tududi inbox
 def add_to_tududi(title, url, link_type, summary="", tag=""):
    """Add to Tududi inbox with intelligent summary"""