- http_post: POST to external URLs with per-user URL prefix allowlist
(home/{user}/http_allowlist.json); admin-only, confirm-required
- nc_talk_history: read recent NC Talk messages via Basic Auth (requires
nc_username + nc_app_password in channels.json under nextcloud)
- openai_orchestrator: _chat_with_retry() wraps both API calls with
exponential backoff (3 attempts, 1s/2s) on connection errors and
transient status codes (429, 500, 502, 503, 504)
- Docs updated: CLAUDE.md, HELP.md, TODO, MASTER, ROADMAP (50 tools)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
268 lines
11 KiB
Python
268 lines
11 KiB
Python
"""
|
|
Web tools — search (DuckDuckGo), direct HTTP fetch, clean content extraction, and HTTP POST.
|
|
"""
|
|
|
|
import asyncio
|
|
import json
|
|
import logging
|
|
from urllib.parse import urlparse
|
|
|
|
import httpx
|
|
from google.genai import types
|
|
|
|
from config import settings
|
|
from persona import get_user
|
|
|
|
# Module-level logger, named after this module via __name__; used by the
# tool functions below to report search / fetch / allowlist failures.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
async def search(query: str, max_results: int | None = None) -> str:
    """Search DuckDuckGo and return results as a formatted string.

    The blocking DuckDuckGo client is run in a worker thread via
    ``asyncio.to_thread`` so the event loop is not stalled.
    The orchestrator includes the returned text in the context it passes to
    the model.

    Args:
        query: The search query string.
        max_results: Desired number of results. ``None`` or ``0`` falls back
            to ``settings.ddg_max_results``. The effective value is coerced
            to ``int`` and clamped to the range 1-10.

    Returns:
        A markdown-formatted numbered list (linked title, plus the snippet on
        the following line when one is present), or a "No results found"
        message when the search yields nothing. A failed search also ends up
        here, because ``_sync_search`` returns an empty list on error.
    """
    # Clamp on both ends (same min/max/int pattern as http_fetch / web_read):
    # a negative or non-integer value coming from a tool call must not reach
    # the DDGS client unchanged.
    n = min(max(int(max_results or settings.ddg_max_results), 1), 10)
    results = await asyncio.to_thread(_sync_search, query, n)
    if not results:
        return f"No results found for: {query}"

    lines = [f"Search results for: **{query}**\n"]
    for i, r in enumerate(results, 1):
        lines.append(f"{i}. [{r['title']}]({r['href']})")
        if r.get("body"):
            lines.append(f" {r['body']}")
        # Blank line separates consecutive results.
        lines.append("")

    return "\n".join(lines).strip()
|
|
|
|
|
|
def _sync_search(query: str, max_results: int) -> list[dict]:
    """Run a blocking DuckDuckGo text search — call it through asyncio.to_thread.

    Returns the result dicts produced by the ddgs client as a list, or an
    empty list if the search raises (the error is logged as a warning).
    """
    from ddgs import DDGS

    # Paid account — pass token for higher rate limits
    client_options = (
        {"headers": {"Authorization": f"Bearer {settings.ddg_api_key}"}}
        if settings.ddg_api_key
        else {}
    )

    try:
        with DDGS(**client_options) as client:
            hits = client.text(query, max_results=max_results)
            return list(hits)
    except Exception as exc:
        # Best-effort: the caller treats an empty list as "no results".
        logger.warning("DuckDuckGo search error: %s", exc)
        return []
|
|
|
|
|
|
async def http_fetch(
    url: str,
    method: str = "GET",
    body: str | None = None,
    timeout: int = 15,
    max_chars: int = 8192,
) -> str:
    """Fetch a URL directly and return the raw response body.

    Unlike web_search, this hits a specific URL — useful for health checks,
    API probing, JSON endpoints, webhook testing, or reading raw page source.
    For readable article content, use web_read instead.

    Args:
        url: Full URL to request.
        method: HTTP method; upper-cased before use. ``None`` or an empty
            string is treated as ``"GET"``.
        body: Optional raw request body, sent as-is.
        timeout: Request timeout in seconds, clamped to 1-60.
        max_chars: Maximum number of response characters returned
            (default 8192), clamped to 100-131072.

    Returns:
        ``"HTTP <status> <final url>"`` followed by a blank line and the
        (possibly truncated) response text, or an ``"HTTP error: ..."`` /
        ``"Error: ..."`` string when the request fails. Redirects are followed.
    """
    # NOTE(review): the method is not restricted here, so a POST issued through
    # http_fetch is not subject to the per-user allowlist that http_post
    # enforces — confirm this is intended.
    method = (method or "GET").upper()
    timeout = min(max(int(timeout), 1), 60)
    max_chars = min(max(int(max_chars), 100), 131072)
    try:
        async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
            resp = await client.request(method, url, content=body)
            text = resp.text
            body_text = text[:max_chars]
            truncated = len(text) > max_chars
            suffix = f"\n\n[… truncated at {max_chars} chars]" if truncated else ""
            return f"HTTP {resp.status_code} {resp.url}\n\n{body_text}{suffix}"
    except httpx.HTTPError as e:
        return f"HTTP error: {e}"
    except Exception as e:
        logger.warning("http_fetch error for %s: %s", url, e)
        return f"Error: {e}"
|
|
|
|
|
|
async def web_read(url: str, max_chars: int = 16000) -> str:
    """Download a page and return its main readable content.

    The extraction itself is done by ``_sync_web_read`` (trafilatura-based),
    which runs in a worker thread so the event loop stays responsive. It is
    aimed at blog posts, documentation, news articles and similar pages where
    only the text matters. For raw responses (JSON APIs, health checks), use
    http_fetch instead.

    ``max_chars`` is coerced to ``int`` and clamped to 1000-131072 before
    being handed to the worker.
    """
    limit = int(max_chars)
    if limit < 1000:
        limit = 1000
    elif limit > 131072:
        limit = 131072
    return await asyncio.to_thread(_sync_web_read, url, limit)
|
|
|
|
|
|
def _sync_web_read(url: str, max_chars: int) -> str:
    """Blocking worker for web_read: download ``url`` and extract its text.

    Returns the extracted content prefixed with ``"Content from <url>:"``,
    truncated to ``max_chars`` characters with a notice when longer. Returns
    a plain explanatory message instead when trafilatura is not installed,
    the download fails, or nothing readable can be extracted.
    """
    try:
        import trafilatura
    except ImportError:
        return "web_read requires trafilatura — run: pip install trafilatura"

    page = trafilatura.fetch_url(url)
    if page is None:
        return f"Failed to download content from: {url}"

    # First try markdown output with links; if that yields nothing, retry
    # with trafilatura's default extraction settings.
    content = (
        trafilatura.extract(page, output_format="markdown", include_links=True, url=url)
        or trafilatura.extract(page, url=url)
    )
    if not content:
        return f"Could not extract readable content from: {url}"

    if len(content) > max_chars:
        notice = f"\n\n[… truncated at {max_chars} chars — pass a larger max_chars (up to 131072) to see more]"
        content = content[:max_chars] + notice
    return f"Content from {url}:\n\n{content}"
|
|
|
|
|
|
def _load_http_allowlist(username: str) -> list[str]:
    """Load per-user HTTP POST allowlist (URL prefixes). Empty list = all blocked.

    Reads ``<home_root>/<username>/http_allowlist.json``, which must contain a
    JSON array of strings.

    Args:
        username: User whose allowlist file is read.

    Returns:
        The whitespace-stripped, non-empty string entries of the array.
        An empty list is returned when the file is missing, unreadable, not
        valid JSON, or not a JSON array. Non-string entries are dropped.
    """
    path = settings.home_root() / username / "http_allowlist.json"
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except FileNotFoundError:
        # No file simply means nothing is allowed; not worth a warning.
        return []
    except Exception as e:
        logger.warning("failed to read http_allowlist.json for %s: %s", username, e)
        return []

    # Fail closed on the wrong shape. Iterating a bare JSON string would yield
    # single characters (e.g. "h"), which as prefixes match almost any URL;
    # iterating an object would yield its keys.
    if not isinstance(data, list):
        logger.warning(
            "http_allowlist.json for %s must be a JSON array of strings, got %s — ignoring",
            username,
            type(data).__name__,
        )
        return []

    # Only genuine strings can be URL prefixes; numbers, null and nested
    # values are dropped instead of being stringified.
    return [entry.strip() for entry in data if isinstance(entry, str) and entry.strip()]
|
|
|
|
|
|
def _http_post_allowed(url: str, allowlist: list[str]) -> bool:
    """Return True if url matches any allowlist entry (host-aware prefix match).

    A URL matches an entry when it starts with the entry text AND, if the
    entry names a host, the URL's parsed network location is exactly the
    entry's. The second condition closes the classic raw-prefix bypasses:
    with the entry ``https://api.example.com``, neither
    ``https://api.example.com.evil.test/`` (longer host) nor
    ``https://api.example.com@evil.test/`` (userinfo trick) is accepted.
    A different port is also treated as a different host.

    Entries without a host component (for example a bare scheme such as
    ``https://``) keep plain prefix semantics.

    Args:
        url: Target URL to check.
        allowlist: URL prefixes the user has approved.

    Returns:
        True if at least one entry matches, otherwise False. An unparseable
        URL is never allowed.
    """
    try:
        target_host = urlparse(url).netloc
    except ValueError:
        # e.g. malformed IPv6 literal — refuse rather than guess.
        return False

    for prefix in allowlist:
        if not url.startswith(prefix):
            continue
        try:
            allowed_host = urlparse(prefix).netloc
        except ValueError:
            # A malformed entry cannot authorize anything.
            continue
        # Since url starts with prefix, the scheme already agrees whenever the
        # entry contains a host; only the host boundary needs verifying.
        if not allowed_host or target_host == allowed_host:
            return True
    return False
|
|
|
|
|
|
async def http_post(
    url: str,
    body: str = "",
    headers: dict | None = None,
    max_chars: int = 4096,
) -> str:
    """POST to an external URL. Requires the URL to match home/{user}/http_allowlist.json.

    body may be a JSON string or plain text. If body is valid JSON, Content-Type is set
    to application/json; otherwise text/plain. Override via the headers param.
    Response is capped at max_chars (default 4096, max 131072).

    Args:
        url: Full URL to POST to; checked against the current user's allowlist.
        body: Request body. A non-string value is serialized with
            ``json.dumps``; ``None`` is treated as an empty body.
        headers: Extra request headers. A Content-Type given here (in any
            letter case) replaces the auto-detected one.
        max_chars: Maximum number of response characters returned, clamped
            to 100-131072.

    Returns:
        ``"HTTP <status> <final url>"`` followed by a blank line and the
        (possibly truncated) response text; a "blocked" explanation when the
        allowlist is empty or does not match; or an ``"HTTP error: ..."`` /
        ``"Error: ..."`` string when the request fails.
    """
    username = get_user()
    allowlist = _load_http_allowlist(username)
    if not allowlist:
        return (
            f"http_post blocked — no allowlist configured. "
            f"Add allowed URL prefixes to home/{username}/http_allowlist.json as a JSON array. "
            f"Example: [\"https://api.example.com\"]"
        )
    if not _http_post_allowed(url, allowlist):
        return (
            f"http_post blocked — {url} does not match any allowlist entry for {username}. "
            f"Add the URL prefix to home/{username}/http_allowlist.json."
        )

    max_chars = min(max(int(max_chars), 100), 131072)

    # An explicit null body means "no body", not the JSON literal "null".
    if body is None:
        body = ""

    # Auto-detect content type from body
    body_str = body if isinstance(body, str) else json.dumps(body)
    try:
        json.loads(body_str)
        content_type = "application/json"
    except (json.JSONDecodeError, ValueError):
        content_type = "text/plain"

    # Header names are case-insensitive: only add the detected Content-Type
    # when the caller did not supply one under any spelling, otherwise two
    # Content-Type headers would be sent.
    req_headers = dict(headers) if headers else {}
    if not any(str(name).lower() == "content-type" for name in req_headers):
        req_headers["Content-Type"] = content_type

    try:
        # NOTE(review): follow_redirects=True means a redirect response can
        # forward this request to a host that was never checked against the
        # allowlist. Consider disabling redirects or validating each hop.
        async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
            resp = await client.post(url, content=body_str.encode(), headers=req_headers)
            text = resp.text
            body_text = text[:max_chars]
            truncated = len(text) > max_chars
            suffix = f"\n\n[… truncated at {max_chars} chars]" if truncated else ""
            return f"HTTP {resp.status_code} {resp.url}\n\n{body_text}{suffix}"
    except httpx.HTTPError as e:
        return f"HTTP error: {e}"
    except Exception as e:
        logger.warning("http_post error for %s: %s", url, e)
        return f"Error: {e}"
|
|
|
|
|
|
# Function declarations (google.genai schema objects) describing the web tools
# in this module: name, model-facing description, and parameter schema.
# The limits quoted in the descriptions must mirror the clamps applied in the
# corresponding functions (e.g. http_fetch clamps max_chars to 131072).
DECLARATIONS = [
    types.FunctionDeclaration(
        name="web_search",
        description=(
            "Search the web for current information. Use this when you need up-to-date "
            "facts, news, documentation, or anything not in your training data."
        ),
        parameters=types.Schema(
            type=types.Type.OBJECT,
            properties={
                "query": types.Schema(type=types.Type.STRING, description="The search query string"),
                "max_results": types.Schema(type=types.Type.INTEGER, description="Number of results to return (default 5, max 10)"),
            },
            required=["query"],
        ),
    ),
    types.FunctionDeclaration(
        name="http_fetch",
        description=(
            "Fetch a specific URL and return the raw response body. Unlike web_search, this hits "
            "a direct URL — useful for health checks, JSON API endpoints, webhook testing, "
            "or inspecting raw page source. For readable article/doc content, use web_read instead. "
            "Response body is capped at max_chars (default 8192, max 131072)."
        ),
        parameters=types.Schema(
            type=types.Type.OBJECT,
            properties={
                "url": types.Schema(type=types.Type.STRING, description="Full URL to fetch"),
                "method": types.Schema(type=types.Type.STRING, description="HTTP method: GET (default), POST, HEAD"),
                "body": types.Schema(type=types.Type.STRING, description="Optional request body (for POST requests)"),
                "timeout": types.Schema(type=types.Type.INTEGER, description="Request timeout in seconds (default 15, max 60)"),
                "max_chars": types.Schema(type=types.Type.INTEGER, description="Max characters to return (default 8192, max 131072)"),
            },
            required=["url"],
        ),
    ),
    types.FunctionDeclaration(
        name="web_read",
        description=(
            "Fetch a URL and extract clean readable text, stripping ads, navigation, sidebars, "
            "and other boilerplate. Returns the main article/document content as markdown. "
            "Use this for blog posts, documentation, news articles, GitHub READMEs, or any page "
            "where you want the content without surrounding noise. "
            "For raw HTTP responses (JSON APIs, health checks, source inspection), use http_fetch."
        ),
        parameters=types.Schema(
            type=types.Type.OBJECT,
            properties={
                "url": types.Schema(type=types.Type.STRING, description="Full URL to fetch and extract"),
                "max_chars": types.Schema(type=types.Type.INTEGER, description="Max characters to return (default 16000, max 131072)"),
            },
            required=["url"],
        ),
    ),
    types.FunctionDeclaration(
        name="http_post",
        description=(
            "POST to an external URL. Requires the URL to match the user's http_allowlist.json. "
            "Use for calling webhooks, triggering automations, posting to APIs, or any HTTP action. "
            "body is a string — JSON or plain text are both accepted (Content-Type auto-detected). "
            "Override headers as needed. Response capped at max_chars (default 4096, max 131072)."
        ),
        parameters=types.Schema(
            type=types.Type.OBJECT,
            properties={
                "url": types.Schema(type=types.Type.STRING, description="Full URL to POST to"),
                "body": types.Schema(type=types.Type.STRING, description="Request body — JSON string or plain text"),
                "max_chars": types.Schema(type=types.Type.INTEGER, description="Max response chars (default 4096, max 131072)"),
            },
            required=["url"],
        ),
    ),
]
|