# Cortex-Inara/cortex/tools/web.py

"""
Web tools — search (DuckDuckGo), direct HTTP fetch, and clean content extraction.
"""

import asyncio
import logging

import httpx
from google.genai import types

from config import settings

logger = logging.getLogger(__name__)


async def search(query: str, max_results: int | None = None) -> str:
    """Search DuckDuckGo and return the hits as a markdown-formatted string.

    The blocking search itself runs in a worker thread via
    ``asyncio.to_thread`` so the event loop is not stalled.

    Args:
        query: Search query text.
        max_results: Desired number of results. ``None`` or ``0`` falls back
            to ``settings.ddg_max_results``. The value is coerced to ``int``
            and clamped to the range 1-10.

    Returns:
        A header line followed by a numbered list of ``[title](url)`` entries,
        each optionally followed by an indented snippet line; or a
        ``No results found for: ...`` message when the search yields nothing.
    """
    # Coerce and clamp: tool-call arguments may arrive as floats or negative
    # numbers, and the other tools in this module normalise their numeric
    # arguments the same way (int() + lower/upper bound).
    limit = min(max(int(max_results or settings.ddg_max_results), 1), 10)
    results = await asyncio.to_thread(_sync_search, query, limit)
    if not results:
        return f"No results found for: {query}"

    lines = [f"Search results for: **{query}**\n"]
    for index, hit in enumerate(results, 1):
        # Use .get() so a single malformed hit cannot abort the whole listing.
        url = hit.get("href") or ""
        title = hit.get("title") or url or "(untitled)"
        lines.append(f"{index}. [{title}]({url})")
        snippet = hit.get("body")
        if snippet:
            lines.append(f"   {snippet}")
        lines.append("")  # blank line between entries

    return "\n".join(lines).strip()


def _sync_search(query: str, max_results: int) -> list[dict]:
    """Run a blocking DuckDuckGo text search (call via ``asyncio.to_thread``).

    Returns the result dicts as a list. If the client raises, the error is
    logged at WARNING level and an empty list is returned instead of
    propagating the exception.
    """
    from ddgs import DDGS

    # Paid account — pass token for higher rate limits.
    # NOTE(review): confirm the installed DDGS constructor accepts ``headers``;
    # a TypeError from it would be caught below and surface only as an empty
    # result list.
    token = settings.ddg_api_key
    client_options = {"headers": {"Authorization": f"Bearer {token}"}} if token else {}

    try:
        with DDGS(**client_options) as client:
            hits = client.text(query, max_results=max_results)
            return list(hits)
    except Exception as exc:
        logger.warning("DuckDuckGo search error: %s", exc)
        return []


async def http_fetch(
    url: str,
    method: str = "GET",
    body: str | None = None,
    timeout: int = 15,
    max_chars: int = 8192,
) -> str:
    """Fetch a URL directly and return the raw response body.

    Unlike web_search, this hits a specific URL — useful for health checks,
    API probing, JSON endpoints, webhook testing, or reading raw page source.
    For readable article content, use web_read instead.

    Args:
        url: Full URL to request.
        method: HTTP method; upper-cased before use. ``None`` or an empty
            string is treated as ``"GET"``.
        body: Optional request body, sent as the request content.
        timeout: Timeout in seconds, coerced to ``int`` and clamped to 1-60.
        max_chars: Maximum number of response characters to return, coerced
            to ``int`` and clamped to 100-32768 (default 8192).

    Returns:
        ``HTTP <status> <final url>``, a blank line, then the (possibly
        truncated) response text. Redirects are followed. Request failures are
        reported as an ``HTTP error: ...`` or ``Error: ...`` string rather
        than raised.
    """
    # A tool call may pass an explicit null/empty method; fall back to GET
    # instead of failing on ``None.upper()``.
    method = (method or "GET").upper()
    timeout = min(max(int(timeout), 1), 60)
    max_chars = min(max(int(max_chars), 100), 32768)
    try:
        async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
            resp = await client.request(method, url, content=body)
            text = resp.text
            suffix = (
                f"\n\n[… truncated at {max_chars} chars]" if len(text) > max_chars else ""
            )
            return f"HTTP {resp.status_code} {resp.url}\n\n{text[:max_chars]}{suffix}"
    except httpx.HTTPError as e:
        # Some httpx exceptions (notably timeouts) can carry an empty message;
        # fall back to the exception class name so the caller still learns
        # what went wrong.
        logger.warning("http_fetch HTTP error for %s: %r", url, e)
        return f"HTTP error: {str(e) or type(e).__name__}"
    except Exception as e:
        logger.warning("http_fetch error for %s: %s", url, e)
        return f"Error: {str(e) or type(e).__name__}"


async def web_read(url: str, max_chars: int = 16000) -> str:
    """Download a page and return its main readable text.

    The blocking download and extraction are delegated to ``_sync_web_read``
    (trafilatura-based) on a worker thread, which strips navigation, ads and
    other boilerplate — suited to articles, documentation and blog posts.
    For raw responses (JSON APIs, health checks), use http_fetch instead.

    ``max_chars`` is coerced to ``int`` and clamped to 1000-32000 before use.
    """
    limit = int(max_chars)
    if limit < 1000:
        limit = 1000
    elif limit > 32000:
        limit = 32000
    return await asyncio.to_thread(_sync_web_read, url, limit)


def _sync_web_read(url: str, max_chars: int) -> str:
    """Blocking worker for ``web_read``: download *url* and extract its main text.

    Every outcome is reported as a string: a missing trafilatura install, a
    failed download, an empty extraction, or the extracted content prefixed
    with ``Content from <url>:`` and cut to ``max_chars`` characters (with a
    truncation notice appended) when it is longer.
    """
    try:
        import trafilatura
    except ImportError:
        return "web_read requires trafilatura — run: pip install trafilatura"

    page = trafilatura.fetch_url(url)
    if page is None:
        return f"Failed to download content from: {url}"

    # Prefer markdown output with links; if that yields nothing, retry with
    # trafilatura's default extraction settings.
    content = (
        trafilatura.extract(page, output_format="markdown", include_links=True, url=url)
        or trafilatura.extract(page, url=url)
    )
    if not content:
        return f"Could not extract readable content from: {url}"

    if len(content) > max_chars:
        notice = f"\n\n[… truncated at {max_chars} chars — pass a larger max_chars to see more]"
        content = content[:max_chars] + notice
    return f"Content from {url}:\n\n{content}"


# Function-calling schemas (google.genai ``types.FunctionDeclaration``) for the
# three tools implemented in this module. The declared names are
# ``web_search``, ``http_fetch`` and ``web_read``; the parameter names match the
# keyword arguments of ``search``, ``http_fetch`` and ``web_read`` above.
# NOTE(review): the name -> coroutine dispatch is not in this file — presumably
# ``web_search`` is routed to ``search``; confirm against the caller.
DECLARATIONS = [
    # web_search: only ``query`` is required.
    types.FunctionDeclaration(
        name="web_search",
        description=(
            "Search the web for current information. Use this when you need up-to-date "
            "facts, news, documentation, or anything not in your training data."
        ),
        parameters=types.Schema(
            type=types.Type.OBJECT,
            properties={
                "query": types.Schema(type=types.Type.STRING, description="The search query string"),
                # NOTE(review): the effective default comes from
                # settings.ddg_max_results — confirm it is 5 as stated here.
                "max_results": types.Schema(type=types.Type.INTEGER, description="Number of results to return (default 5, max 10)"),
            },
            required=["query"],
        ),
    ),
    # http_fetch: only ``url`` is required; the limits quoted in the
    # descriptions mirror the clamps applied inside ``http_fetch``.
    types.FunctionDeclaration(
        name="http_fetch",
        description=(
            "Fetch a specific URL and return the raw response body. Unlike web_search, this hits "
            "a direct URL — useful for health checks, JSON API endpoints, webhook testing, "
            "or inspecting raw page source. For readable article/doc content, use web_read instead. "
            "Response body is capped at max_chars (default 8192, max 32768)."
        ),
        parameters=types.Schema(
            type=types.Type.OBJECT,
            properties={
                "url": types.Schema(type=types.Type.STRING, description="Full URL to fetch"),
                # NOTE(review): ``http_fetch`` upper-cases and forwards whatever
                # method it is given; the list below is advisory, not enforced.
                "method": types.Schema(type=types.Type.STRING, description="HTTP method: GET (default), POST, HEAD"),
                "body": types.Schema(type=types.Type.STRING, description="Optional request body (for POST requests)"),
                "timeout": types.Schema(type=types.Type.INTEGER, description="Request timeout in seconds (default 15, max 60)"),
                "max_chars": types.Schema(type=types.Type.INTEGER, description="Max characters to return (default 8192, max 32768)"),
            },
            required=["url"],
        ),
    ),
    # web_read: only ``url`` is required; ``max_chars`` limits mirror the
    # clamp applied inside ``web_read``.
    types.FunctionDeclaration(
        name="web_read",
        description=(
            "Fetch a URL and extract clean readable text, stripping ads, navigation, sidebars, "
            "and other boilerplate. Returns the main article/document content as markdown. "
            "Use this for blog posts, documentation, news articles, GitHub READMEs, or any page "
            "where you want the content without surrounding noise. "
            "For raw HTTP responses (JSON APIs, health checks, source inspection), use http_fetch."
        ),
        parameters=types.Schema(
            type=types.Type.OBJECT,
            properties={
                "url": types.Schema(type=types.Type.STRING, description="Full URL to fetch and extract"),
                "max_chars": types.Schema(type=types.Type.INTEGER, description="Max characters to return (default 16000, max 32000)"),
            },
            required=["url"],
        ),
    ),
]