# Cortex-Inara/cortex/tools/web.py

"""
Web tools — search (DuckDuckGo), direct HTTP fetch, clean content extraction, and HTTP POST.
"""

import asyncio
import json
import logging
from urllib.parse import urlparse

import httpx
from google.genai import types

from config import settings
from persona import get_user

logger = logging.getLogger(__name__)


async def search(query: str, max_results: int | None = None) -> str:
    """Search DuckDuckGo and return results as a formatted string.

    Args:
        query: The search query string.
        max_results: Desired number of results. Falls back to
            ``settings.ddg_max_results`` when omitted or falsy; the effective
            value is always clamped to the range 1..10.

    Returns:
        A markdown-formatted numbered list (title, URL, optional snippet), or a
        "No results found" message when the search yields nothing. The
        orchestrator includes this text in the context it passes to the model.
    """
    requested = max_results or settings.ddg_max_results
    # Clamp like the sibling tools do: coerce to int and never pass a
    # non-positive count down to the search backend.
    n = min(max(int(requested), 1), 10)
    results = await asyncio.to_thread(_sync_search, query, n)
    if not results:
        return f"No results found for: {query}"

    lines = [f"Search results for: **{query}**\n"]
    for i, r in enumerate(results, 1):
        # Result dicts come from a third-party library; tolerate missing keys
        # instead of failing the whole tool call with a KeyError.
        href = r.get("href") or ""
        title = r.get("title") or href or "(untitled)"
        lines.append(f"{i}. [{title}]({href})" if href else f"{i}. {title}")
        if r.get("body"):
            lines.append(f"   {r['body']}")
        lines.append("")

    return "\n".join(lines).strip()


def _sync_search(query: str, max_results: int) -> list[dict]:
    """Blocking DuckDuckGo text search, intended to be run via asyncio.to_thread.

    Returns the result dicts produced by ``DDGS.text`` as a list. Any exception
    raised while creating the client or searching is logged at WARNING level
    and turned into an empty list.
    """
    from ddgs import DDGS

    # Paid account — a configured API key is forwarded as a bearer token for
    # higher rate limits; otherwise the client is created with no options.
    client_options = {}
    if settings.ddg_api_key:
        auth_value = f"Bearer {settings.ddg_api_key}"
        client_options["headers"] = {"Authorization": auth_value}

    try:
        with DDGS(**client_options) as client:
            hits = client.text(query, max_results=max_results)
            return list(hits)
    except Exception as exc:
        logger.warning("DuckDuckGo search error: %s", exc)
        return []


async def http_fetch(
    url: str,
    method: str = "GET",
    body: str | None = None,
    timeout: int = 15,
    max_chars: int = 8192,
) -> str:
    """Fetch a URL directly and return the raw response body.

    Unlike web_search, this hits a specific URL — useful for health checks,
    API probing, JSON endpoints, webhook testing, or reading raw page source.
    For readable article content, use web_read instead.

    Args:
        url: Full URL to request.
        method: HTTP method; upper-cased before use.
        body: Optional request body, sent as-is.
        timeout: Request timeout in seconds, clamped to 1..60.
        max_chars: Maximum number of response characters returned, clamped to
            100..131072 (default 8192).

    Returns:
        ``"HTTP <status> <final url>"`` followed by the (possibly truncated)
        response text, or an ``"HTTP error: ..."`` / ``"Error: ..."`` string if
        the request fails. Redirects are followed.
    """
    method = method.upper()
    timeout = min(max(int(timeout), 1), 60)
    max_chars = min(max(int(max_chars), 100), 131072)
    # NOTE(review): any method (including POST with a body) is sent to any URL
    # here, so the per-user allowlist enforced by http_post does not apply to
    # this tool — confirm that this is intended.
    try:
        async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
            resp = await client.request(method, url, content=body)
            text = resp.text
            suffix = f"\n\n[… truncated at {max_chars} chars]" if len(text) > max_chars else ""
            return f"HTTP {resp.status_code} {resp.url}\n\n{text[:max_chars]}{suffix}"
    except httpx.HTTPError as e:
        return f"HTTP error: {e}"
    except Exception as e:
        logger.warning("http_fetch error for %s: %s", url, e)
        return f"Error: {e}"


async def web_read(url: str, max_chars: int = 16000) -> str:
    """Download a page and return its main readable text.

    The extraction itself is done by ``_sync_web_read`` (trafilatura-based),
    which runs in a worker thread so the event loop is not blocked. Suited to
    blog posts, documentation and news articles; for raw responses (JSON APIs,
    health checks) use http_fetch instead.

    ``max_chars`` is coerced to int and clamped to 1000..131072 before use.
    """
    lower_bound, upper_bound = 1000, 131072
    limit = int(max_chars)
    if limit < lower_bound:
        limit = lower_bound
    elif limit > upper_bound:
        limit = upper_bound
    return await asyncio.to_thread(_sync_web_read, url, limit)


def _sync_web_read(url: str, max_chars: int) -> str:
    """Blocking implementation of web_read: download, extract, truncate.

    Returns a human-readable message instead of raising when trafilatura is not
    installed, the download fails, or no readable content can be extracted.
    """
    try:
        import trafilatura
    except ImportError:
        return "web_read requires trafilatura — run: pip install trafilatura"

    page = trafilatura.fetch_url(url)
    if page is None:
        return f"Failed to download content from: {url}"

    # Try markdown output with links first; if that yields nothing, fall back
    # to trafilatura's default extraction.
    content = (
        trafilatura.extract(page, output_format="markdown", include_links=True, url=url)
        or trafilatura.extract(page, url=url)
    )
    if not content:
        return f"Could not extract readable content from: {url}"

    if len(content) > max_chars:
        notice = f"\n\n[… truncated at {max_chars} chars — pass a larger max_chars (up to 131072) to see more]"
        content = content[:max_chars] + notice
    return f"Content from {url}:\n\n{content}"


def _load_http_allowlist(username: str) -> list[str]:
    """Load the per-user HTTP POST allowlist (URL prefixes).

    Reads ``<home_root>/<username>/http_allowlist.json``, which must contain a
    JSON array of strings. An empty list means every POST is blocked.

    Returns:
        The stripped, non-empty string entries. A missing file, unreadable or
        invalid JSON, or a top-level value that is not an array all yield an
        empty list (fail closed); problems other than a missing file are
        logged at WARNING level.
    """
    path = settings.home_root() / username / "http_allowlist.json"
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except FileNotFoundError:
        return []
    except Exception as e:
        logger.warning("failed to read http_allowlist.json for %s: %s", username, e)
        return []

    # Iterating a JSON string would yield single characters ("h", "t", ...),
    # each of which would then act as a prefix matching almost any URL;
    # iterating an object would yield its keys. Accept a real array only.
    if not isinstance(data, list):
        logger.warning(
            "http_allowlist.json for %s must be a JSON array of strings, got %s; ignoring",
            username,
            type(data).__name__,
        )
        return []

    prefixes: list[str] = []
    for entry in data:
        if not isinstance(entry, str):
            # Don't stringify null/numbers/objects into bogus prefixes.
            logger.warning(
                "ignoring non-string entry in http_allowlist.json for %s: %r", username, entry
            )
            continue
        entry = entry.strip()
        if entry:
            prefixes.append(entry)
    return prefixes


def _http_post_allowed(url: str, allowlist: list[str]) -> bool:
    """Return True if url matches any allowlist entry (boundary-aware prefix match).

    An entry that already reaches into the path, query or fragment (or ends in
    ``/``, ``?`` or ``#``) is matched as a plain string prefix. An entry that
    stops at the host, such as ``https://api.example.com``, only matches when
    the URL ends there or continues with ``/``, ``?`` or ``#``. Without that
    boundary check the entry would also match look-alike hosts such as
    ``https://api.example.com.evil.com/`` or ``https://api.example.com@evil.com/``.

    Empty and unparseable entries never match.
    """
    for prefix in allowlist:
        if not prefix or not url.startswith(prefix):
            continue
        try:
            parts = urlparse(prefix)
        except ValueError:
            # Malformed entry (e.g. an unterminated IPv6 literal): fail closed.
            continue
        reaches_past_host = bool(
            parts.path or parts.params or parts.query or parts.fragment
        ) or prefix[-1] in "/?#"
        if not parts.netloc or reaches_past_host:
            return True
        # Entry ends inside the authority component: require a real boundary.
        remainder = url[len(prefix):]
        if not remainder or remainder[0] in "/?#":
            return True
    return False


async def http_post(
    url: str,
    body: str = "",
    headers: dict | None = None,
    max_chars: int = 4096,
) -> str:
    """POST to an external URL. Requires the URL to match home/{user}/http_allowlist.json.

    Args:
        url: Full URL to POST to; checked against the current user's allowlist.
        body: Request body. A string is sent as-is, ``None`` is sent as an
            empty body, and any other value is serialized with ``json.dumps``.
        headers: Optional extra request headers. A Content-Type given here (in
            any letter case) replaces the auto-detected one.
        max_chars: Maximum number of response characters returned, clamped to
            100..131072 (default 4096).

    Returns:
        ``"HTTP <status> <final url>"`` followed by the (possibly truncated)
        response text; a "blocked" explanation when the allowlist is empty or
        does not match; or an ``"HTTP error: ..."`` / ``"Error: ..."`` string
        if the request fails.
    """
    username = get_user()
    allowlist = _load_http_allowlist(username)
    if not allowlist:
        return (
            "http_post blocked — no allowlist configured. "
            f"Add allowed URL prefixes to home/{username}/http_allowlist.json as a JSON array. "
            'Example: ["https://api.example.com"]'
        )
    if not _http_post_allowed(url, allowlist):
        return (
            f"http_post blocked — {url} does not match any allowlist entry for {username}. "
            f"Add the URL prefix to home/{username}/http_allowlist.json."
        )

    max_chars = min(max(int(max_chars), 100), 131072)

    if body is None:
        # An explicit null means "no body", not the JSON document `null`.
        body_str = ""
    elif isinstance(body, str):
        body_str = body
    else:
        body_str = json.dumps(body)

    req_headers = dict(headers) if headers else {}
    # Header names are case-insensitive. Only add the auto-detected type when
    # the caller has not supplied one, otherwise e.g. "content-type" would be
    # sent in addition to "Content-Type" rather than overriding it.
    if not any(str(name).lower() == "content-type" for name in req_headers):
        try:
            json.loads(body_str)
            req_headers["Content-Type"] = "application/json"
        except ValueError:  # includes json.JSONDecodeError
            req_headers["Content-Type"] = "text/plain"

    # NOTE(review): follow_redirects=True means only the first hop is checked
    # against the allowlist; a 307/308 from an allowed host would re-send the
    # body to the redirect target. Confirm this is acceptable.
    try:
        async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
            resp = await client.post(url, content=body_str.encode(), headers=req_headers)
            text = resp.text
            suffix = f"\n\n[… truncated at {max_chars} chars]" if len(text) > max_chars else ""
            return f"HTTP {resp.status_code} {resp.url}\n\n{text[:max_chars]}{suffix}"
    except httpx.HTTPError as e:
        return f"HTTP error: {e}"
    except Exception as e:
        logger.warning("http_post error for %s: %s", url, e)
        return f"Error: {e}"


# Function declarations (google.genai schema) describing the tools in this
# module: web_search, http_fetch, web_read and http_post.
# NOTE(review): "default 5" for web_search assumes settings.ddg_max_results is
# 5 — confirm against the configuration.
# NOTE(review): http_post() accepts a `headers` dict, but no "headers" property
# is declared below even though the description mentions overriding headers —
# confirm whether it should be exposed.
DECLARATIONS = [
    types.FunctionDeclaration(
        name="web_search",
        description=(
            "Search the web for current information. Use this when you need up-to-date "
            "facts, news, documentation, or anything not in your training data."
        ),
        parameters=types.Schema(
            type=types.Type.OBJECT,
            properties={
                "query": types.Schema(type=types.Type.STRING, description="The search query string"),
                "max_results": types.Schema(type=types.Type.INTEGER, description="Number of results to return (default 5, max 10)"),
            },
            required=["query"],
        ),
    ),
    types.FunctionDeclaration(
        name="http_fetch",
        description=(
            "Fetch a specific URL and return the raw response body. Unlike web_search, this hits "
            "a direct URL — useful for health checks, JSON API endpoints, webhook testing, "
            "or inspecting raw page source. For readable article/doc content, use web_read instead. "
            # The limit stated here must match the clamp in http_fetch (131072).
            "Response body is capped at max_chars (default 8192, max 131072)."
        ),
        parameters=types.Schema(
            type=types.Type.OBJECT,
            properties={
                "url": types.Schema(type=types.Type.STRING, description="Full URL to fetch"),
                "method": types.Schema(type=types.Type.STRING, description="HTTP method: GET (default), POST, HEAD"),
                "body": types.Schema(type=types.Type.STRING, description="Optional request body (for POST requests)"),
                "timeout": types.Schema(type=types.Type.INTEGER, description="Request timeout in seconds (default 15, max 60)"),
                "max_chars": types.Schema(type=types.Type.INTEGER, description="Max characters to return (default 8192, max 131072)"),
            },
            required=["url"],
        ),
    ),
    types.FunctionDeclaration(
        name="web_read",
        description=(
            "Fetch a URL and extract clean readable text, stripping ads, navigation, sidebars, "
            "and other boilerplate. Returns the main article/document content as markdown. "
            "Use this for blog posts, documentation, news articles, GitHub READMEs, or any page "
            "where you want the content without surrounding noise. "
            "For raw HTTP responses (JSON APIs, health checks, source inspection), use http_fetch."
        ),
        parameters=types.Schema(
            type=types.Type.OBJECT,
            properties={
                "url": types.Schema(type=types.Type.STRING, description="Full URL to fetch and extract"),
                "max_chars": types.Schema(type=types.Type.INTEGER, description="Max characters to return (default 16000, max 131072)"),
            },
            required=["url"],
        ),
    ),
    types.FunctionDeclaration(
        name="http_post",
        description=(
            "POST to an external URL. Requires the URL to match the user's http_allowlist.json. "
            "Use for calling webhooks, triggering automations, posting to APIs, or any HTTP action. "
            "body is a string — JSON or plain text are both accepted (Content-Type auto-detected). "
            "Override headers as needed. Response capped at max_chars (default 4096, max 131072)."
        ),
        parameters=types.Schema(
            type=types.Type.OBJECT,
            properties={
                "url": types.Schema(type=types.Type.STRING, description="Full URL to POST to"),
                "body": types.Schema(type=types.Type.STRING, description="Request body — JSON string or plain text"),
                "max_chars": types.Schema(type=types.Type.INTEGER, description="Max response chars (default 4096, max 131072)"),
            },
            required=["url"],
        ),
    ),
]