Improve: Local heuristic-based analysis (no subprocess needed, fast)

2026-02-09 19:05:31 +01:00
parent eaaa297d9e
commit e4421d7bc9
1 changed file with 76 additions and 67 deletions
@@ -14,8 +14,6 @@ from pathlib import Path
 from dotenv import load_dotenv
 import logging
 from urllib.parse import urlparse
-import subprocess
-import sys

 # Load .env file
 load_dotenv()
@@ -159,79 +157,90 @@ def fetch_url_content(url):
        logger.error(f"    ❌ Error: {e}")
        return {"title": "Fetch failed", "status": "error", "error": str(e), "content": ""}

-# Analyze content with AI (Haiku via gateway)
+# Analyze content (local heuristic-based)
 def analyze_content(url, title, content, link_type):
-    """Analyze content and create intelligent summary"""
+    """Analyze content and suggest summary + tag locally"""
    logger.debug(f"  🤖 Analyzing content: {url}")
    
-    # Build analysis prompt
-    analysis_prompt = f"""Analyze this link and create a brief summary useful for Laurent.
-
-**Link**: {link_type} - {title}
-**URL**: {url}
-
-**Content (first 1500 chars)**:
-{content[:1500]}
-
---
-
-Respond in JSON format ONLY (no markdown, no explanation):
-{{
-  "summary": "1-2 sentences max: What is it? Why would Laurent find it useful?",
-  "tag": "one of: to-read, tool, inspiration, learning, reference, interesting, project, tutorial, article, code, security",
-  "relevance": "very-relevant OR relevant OR nice-to-have"
-}}
-
-Be concise and practical."""
-    
    try:
-        # Use OpenClaw CLI to invoke sessions_spawn
-        # This spawns a sub-agent that analyzes the content
-        result = subprocess.run(
-            [
-                sys.executable, "-m", "openclaw",
-                "sessions", "spawn",
-                "--task", analysis_prompt,
-                "--model", "openrouter/anthropic/claude-haiku-4.5",
-                "--thinking", "off",
-                "--timeout", "15"
-            ],
-            capture_output=True,
-            text=True,
-            timeout=20
-        )
+        # Extract useful info from HTML content
+        description = ""
+        
+        # Looking for meta description
+        desc_match = re.search(r'<meta\s+name="description"\s+content="([^"]+)"', content, re.IGNORECASE)
+        if desc_match:
+            description = desc_match.group(1).strip()
+        
+        # Looking for og:description
+        if not description:
+            og_desc = re.search(r'<meta\s+property="og:description"\s+content="([^"]+)"', content, re.IGNORECASE)
+            if og_desc:
+                description = og_desc.group(1).strip()
+        
+        # Fall back to the first <p> element containing only plain text (no nested tags)
+        if not description:
+            p_match = re.search(r'<p[^>]*>([^<]+)</p>', content, re.IGNORECASE)
+            if p_match:
+                description = p_match.group(1).strip()[:200]
+        
+        # Determine tag based on content + URL + type
+        tag = "interesting"
+        summary = ""
+        
+        if link_type == "GitHub":
+            tag = "project"
+            summary = f"GitHub repository: {title}"
+            # Try to extract more info from README
+            readme_match = re.search(r'README[^<]*</h[1-3]>[^<]*<p[^>]*>([^<]+)', content, re.IGNORECASE)
+            if readme_match:
+                summary += f". {readme_match.group(1)[:100]}"
+        
+        elif link_type == "YouTube":
+            tag = "video"
+            summary = f"Video: {title}"
+            if description:
+                summary += f". {description[:80]}"
+        
+        elif link_type == "Reddit":
+            tag = "discussion"
+            summary = f"Reddit discussion: {title}"
+        
+        elif link_type == "Medium" or link_type == "Dev.to":
+            tag = "article"
+            summary = f"Article: {title}"
+            if description:
+                summary += f". {description[:80]}"
+        
+        elif link_type == "arXiv":
+            tag = "learning"
+            summary = f"Research paper: {title}"
        
-        if result.returncode == 0:
-            output = result.stdout
-            logger.debug(f"    Sub-agent response: {output[:200]}")
-            
-            # Try to parse JSON
-            try:
-                json_match = re.search(r'\{[^{}]*"summary"[^{}]*\}', output, re.DOTALL)
-                if json_match:
-                    analysis_data = json.loads(json_match.group())
-                    logger.debug(f"    ✓ Analysis parsed successfully")
-                    return analysis_data
-            except json.JSONDecodeError:
-                pass
-            
-            # Fallback: extract summary from text
-            summary_line = output.split('\n')[0][:200]
-            return {
-                "summary": summary_line,
-                "tag": "interesting",
-                "relevance": "relevant"
-            }
        else:
-            logger.warning(f"    Sub-agent error: {result.stderr[:200]}")
-            return None
+            # Generic web article
+            tag = "to-read"
+            summary = title
+            if description:
+                summary += f". {description[:100]}"
+        
+        # Truncate summary to reasonable length
+        summary = summary[:200]
+        
+        logger.debug(f"    ✓ Tag: {tag}, Summary: {summary[:80]}")
+        
+        return {
+            "summary": summary,
+            "tag": tag,
+            "relevance": "relevant"
+        }
        
-    except subprocess.TimeoutExpired:
-        logger.warning(f"    Analysis timeout")
-        return None
    except Exception as e:
-        logger.warning(f"    Analysis error: {e}")
-        return None
+        logger.error(f"    Analysis error: {e}")
+        # Return minimal analysis
+        return {
+            "summary": title,
+            "tag": "interesting",
+            "relevance": "relevant"
+        }

 # Send to Tududi inbox
 def add_to_tududi(title, url, link_type, summary="", tag=""):