feat: web_read (trafilatura), session_read, http_fetch max_chars
web_read(url, max_chars=16000) — fetches a URL and extracts clean article text via trafilatura, stripping ads/nav/boilerplate. Returns markdown.

session_read(date) — reads a full session log by YYYY-MM-DD date; lists available dates if the requested one is not found.

http_fetch gains a max_chars param (default 8192, max 32768) so the cap is configurable instead of hardcoded.

Tool count: 45 → 47.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
"""
|
||||
Web tools — search (DuckDuckGo) and direct HTTP fetch.
|
||||
Web tools — search (DuckDuckGo), direct HTTP fetch, and clean content extraction.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
@@ -56,20 +56,25 @@ async def http_fetch(
|
||||
method: str = "GET",
|
||||
body: str | None = None,
|
||||
timeout: int = 15,
|
||||
max_chars: int = 8192,
|
||||
) -> str:
|
||||
"""Fetch a URL directly and return the response body.
|
||||
"""Fetch a URL directly and return the raw response body.
|
||||
|
||||
Unlike web_search, this hits a specific URL — useful for health checks,
|
||||
API probing, JSON endpoints, webhook testing, etc.
|
||||
Response body is capped at 8 KB.
|
||||
API probing, JSON endpoints, webhook testing, or reading raw page source.
|
||||
For readable article content, use web_read instead.
|
||||
Response body is capped at max_chars (default 8192, max 32768).
|
||||
"""
|
||||
method = method.upper()
|
||||
timeout = min(max(int(timeout), 1), 60)
|
||||
max_chars = min(max(int(max_chars), 100), 32768)
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
|
||||
resp = await client.request(method, url, content=body)
|
||||
body_text = resp.text[:8192]
|
||||
return f"HTTP {resp.status_code} {resp.url}\n\n{body_text}"
|
||||
body_text = resp.text[:max_chars]
|
||||
truncated = len(resp.text) > max_chars
|
||||
suffix = f"\n\n[… truncated at {max_chars} chars]" if truncated else ""
|
||||
return f"HTTP {resp.status_code} {resp.url}\n\n{body_text}{suffix}"
|
||||
except httpx.HTTPError as e:
|
||||
return f"HTTP error: {e}"
|
||||
except Exception as e:
|
||||
@@ -77,6 +82,39 @@ async def http_fetch(
|
||||
return f"Error: {e}"
|
||||
|
||||
|
||||
async def web_read(url: str, max_chars: int = 16000) -> str:
    """Download a page and return only its main readable content.

    The heavy lifting is delegated to the blocking helper ``_sync_web_read``
    (trafilatura-based extraction), which is executed in a worker thread so
    the event loop is not blocked while the page is fetched and parsed.

    Intended for articles, blog posts and documentation pages. For raw
    responses (JSON APIs, health checks), use http_fetch instead.

    Args:
        url: Address of the page to fetch and extract.
        max_chars: Requested cap on the extracted text. Coerced to ``int``
            and clamped to the range 1000..32000 before use.

    Returns:
        The string produced by ``_sync_web_read`` — extracted content, or a
        human-readable failure message.
    """
    # Clamp the requested cap into [1000, 32000].
    limit = int(max_chars)
    if limit < 1000:
        limit = 1000
    elif limit > 32000:
        limit = 32000

    # trafilatura is synchronous; keep it off the event loop.
    return await asyncio.to_thread(_sync_web_read, url, limit)
|
||||
|
||||
|
||||
def _sync_web_read(url: str, max_chars: int) -> str:
|
||||
try:
|
||||
import trafilatura
|
||||
except ImportError:
|
||||
return "web_read requires trafilatura — run: pip install trafilatura"
|
||||
|
||||
downloaded = trafilatura.fetch_url(url)
|
||||
if downloaded is None:
|
||||
return f"Failed to download content from: {url}"
|
||||
|
||||
text = trafilatura.extract(downloaded, output_format="markdown", include_links=True, url=url)
|
||||
if not text:
|
||||
text = trafilatura.extract(downloaded, url=url)
|
||||
if not text:
|
||||
return f"Could not extract readable content from: {url}"
|
||||
|
||||
if len(text) > max_chars:
|
||||
text = text[:max_chars] + f"\n\n[… truncated at {max_chars} chars — pass a larger max_chars to see more]"
|
||||
return f"Content from {url}:\n\n{text}"
|
||||
|
||||
|
||||
DECLARATIONS = [
|
||||
types.FunctionDeclaration(
|
||||
name="web_search",
|
||||
@@ -96,10 +134,10 @@ DECLARATIONS = [
|
||||
types.FunctionDeclaration(
|
||||
name="http_fetch",
|
||||
description=(
|
||||
"Fetch a specific URL and return the response. Unlike web_search, this hits "
|
||||
"Fetch a specific URL and return the raw response body. Unlike web_search, this hits "
|
||||
"a direct URL — useful for health checks, JSON API endpoints, webhook testing, "
|
||||
"or reading a specific page when you already know the URL. "
|
||||
"Response body is capped at 8 KB."
|
||||
"or inspecting raw page source. For readable article/doc content, use web_read instead. "
|
||||
"Response body is capped at max_chars (default 8192, max 32768)."
|
||||
),
|
||||
parameters=types.Schema(
|
||||
type=types.Type.OBJECT,
|
||||
@@ -108,6 +146,25 @@ DECLARATIONS = [
|
||||
"method": types.Schema(type=types.Type.STRING, description="HTTP method: GET (default), POST, HEAD"),
|
||||
"body": types.Schema(type=types.Type.STRING, description="Optional request body (for POST requests)"),
|
||||
"timeout": types.Schema(type=types.Type.INTEGER, description="Request timeout in seconds (default 15, max 60)"),
|
||||
"max_chars": types.Schema(type=types.Type.INTEGER, description="Max characters to return (default 8192, max 32768)"),
|
||||
},
|
||||
required=["url"],
|
||||
),
|
||||
),
|
||||
types.FunctionDeclaration(
|
||||
name="web_read",
|
||||
description=(
|
||||
"Fetch a URL and extract clean readable text, stripping ads, navigation, sidebars, "
|
||||
"and other boilerplate. Returns the main article/document content as markdown. "
|
||||
"Use this for blog posts, documentation, news articles, GitHub READMEs, or any page "
|
||||
"where you want the content without surrounding noise. "
|
||||
"For raw HTTP responses (JSON APIs, health checks, source inspection), use http_fetch."
|
||||
),
|
||||
parameters=types.Schema(
|
||||
type=types.Type.OBJECT,
|
||||
properties={
|
||||
"url": types.Schema(type=types.Type.STRING, description="Full URL to fetch and extract"),
|
||||
"max_chars": types.Schema(type=types.Type.INTEGER, description="Max characters to return (default 16000, max 32000)"),
|
||||
},
|
||||
required=["url"],
|
||||
),
|
||||
|
||||
Reference in New Issue
Block a user