""" Web tools — search (DuckDuckGo), direct HTTP fetch, and clean content extraction. """ import asyncio import logging import httpx from google.genai import types from config import settings logger = logging.getLogger(__name__) async def search(query: str, max_results: int | None = None) -> str: """Search DuckDuckGo and return results as a formatted string. Returns a markdown-formatted list of results: title, URL, and snippet. The orchestrator includes this in the context it passes to Claude. """ n = min(max_results or settings.ddg_max_results, 10) results = await asyncio.to_thread(_sync_search, query, n) if not results: return f"No results found for: {query}" lines = [f"Search results for: **{query}**\n"] for i, r in enumerate(results, 1): lines.append(f"{i}. [{r['title']}]({r['href']})") if r.get("body"): lines.append(f" {r['body']}") lines.append("") return "\n".join(lines).strip() def _sync_search(query: str, max_results: int) -> list[dict]: """Synchronous DuckDuckGo search — run via asyncio.to_thread.""" from ddgs import DDGS kwargs = {} if settings.ddg_api_key: # Paid account — pass token for higher rate limits kwargs["headers"] = {"Authorization": f"Bearer {settings.ddg_api_key}"} try: with DDGS(**kwargs) as ddgs: return list(ddgs.text(query, max_results=max_results)) except Exception as e: logger.warning("DuckDuckGo search error: %s", e) return [] async def http_fetch( url: str, method: str = "GET", body: str | None = None, timeout: int = 15, max_chars: int = 8192, ) -> str: """Fetch a URL directly and return the raw response body. Unlike web_search, this hits a specific URL — useful for health checks, API probing, JSON endpoints, webhook testing, or reading raw page source. For readable article content, use web_read instead. Response body is capped at max_chars (default 8192, max 32768). """ method = method.upper() timeout = min(max(int(timeout), 1), 60) max_chars = min(max(int(max_chars), 100), 131072) try: async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client: resp = await client.request(method, url, content=body) body_text = resp.text[:max_chars] truncated = len(resp.text) > max_chars suffix = f"\n\n[… truncated at {max_chars} chars]" if truncated else "" return f"HTTP {resp.status_code} {resp.url}\n\n{body_text}{suffix}" except httpx.HTTPError as e: return f"HTTP error: {e}" except Exception as e: logger.warning("http_fetch error for %s: %s", url, e) return f"Error: {e}" async def web_read(url: str, max_chars: int = 16000) -> str: """Fetch a URL and extract clean readable text, stripping ads, navigation, and boilerplate. Uses trafilatura to extract the main article content — ideal for blog posts, documentation, news articles, and any page where you want the text without surrounding noise. Returns markdown-formatted output. For raw responses (JSON APIs, health checks), use http_fetch instead. """ max_chars = min(max(int(max_chars), 1000), 131072) return await asyncio.to_thread(_sync_web_read, url, max_chars) def _sync_web_read(url: str, max_chars: int) -> str: try: import trafilatura except ImportError: return "web_read requires trafilatura — run: pip install trafilatura" downloaded = trafilatura.fetch_url(url) if downloaded is None: return f"Failed to download content from: {url}" text = trafilatura.extract(downloaded, output_format="markdown", include_links=True, url=url) if not text: text = trafilatura.extract(downloaded, url=url) if not text: return f"Could not extract readable content from: {url}" if len(text) > max_chars: text = text[:max_chars] + f"\n\n[… truncated at {max_chars} chars — pass a larger max_chars (up to 131072) to see more]" return f"Content from {url}:\n\n{text}" DECLARATIONS = [ types.FunctionDeclaration( name="web_search", description=( "Search the web for current information. Use this when you need up-to-date " "facts, news, documentation, or anything not in your training data." ), parameters=types.Schema( type=types.Type.OBJECT, properties={ "query": types.Schema(type=types.Type.STRING, description="The search query string"), "max_results": types.Schema(type=types.Type.INTEGER, description="Number of results to return (default 5, max 10)"), }, required=["query"], ), ), types.FunctionDeclaration( name="http_fetch", description=( "Fetch a specific URL and return the raw response body. Unlike web_search, this hits " "a direct URL — useful for health checks, JSON API endpoints, webhook testing, " "or inspecting raw page source. For readable article/doc content, use web_read instead. " "Response body is capped at max_chars (default 8192, max 32768)." ), parameters=types.Schema( type=types.Type.OBJECT, properties={ "url": types.Schema(type=types.Type.STRING, description="Full URL to fetch"), "method": types.Schema(type=types.Type.STRING, description="HTTP method: GET (default), POST, HEAD"), "body": types.Schema(type=types.Type.STRING, description="Optional request body (for POST requests)"), "timeout": types.Schema(type=types.Type.INTEGER, description="Request timeout in seconds (default 15, max 60)"), "max_chars": types.Schema(type=types.Type.INTEGER, description="Max characters to return (default 8192, max 131072)"), }, required=["url"], ), ), types.FunctionDeclaration( name="web_read", description=( "Fetch a URL and extract clean readable text, stripping ads, navigation, sidebars, " "and other boilerplate. Returns the main article/document content as markdown. " "Use this for blog posts, documentation, news articles, GitHub READMEs, or any page " "where you want the content without surrounding noise. " "For raw HTTP responses (JSON APIs, health checks, source inspection), use http_fetch." ), parameters=types.Schema( type=types.Type.OBJECT, properties={ "url": types.Schema(type=types.Type.STRING, description="Full URL to fetch and extract"), "max_chars": types.Schema(type=types.Type.INTEGER, description="Max characters to return (default 16000, max 131072)"), }, required=["url"], ), ), ]