feat: web_read (trafilatura), session_read, http_fetch max_chars
web_read(url, max_chars=16000) — fetches a URL and extracts clean article text via trafilatura, stripping ads/nav/boilerplate. Returns markdown. session_read(date) — reads a full session log by YYYY-MM-DD date; lists available dates if the requested one is not found. http_fetch gains a max_chars param (default 8192, max 32768) so the cap is configurable instead of hardcoded. Tool count: 45 → 47. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -19,6 +19,9 @@ python-multipart>=0.0.9 # required by FastAPI for Form() data
|
|||||||
# Async HTTP client — used for local OpenAI-compatible backend (Open WebUI / Ollama)
|
# Async HTTP client — used for local OpenAI-compatible backend (Open WebUI / Ollama)
|
||||||
httpx>=0.27.0
|
httpx>=0.27.0
|
||||||
|
|
||||||
|
# Web content extraction — strips ads/nav/boilerplate, returns clean article text
|
||||||
|
trafilatura>=1.6.0
|
||||||
|
|
||||||
# OpenAI-compatible client — tool calling for OpenRouter / LiteLLM / any OAI-compat host
|
# OpenAI-compatible client — tool calling for OpenRouter / LiteLLM / any OAI-compat host
|
||||||
openai>=1.0.0
|
openai>=1.0.0
|
||||||
|
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ from google.genai import types
|
|||||||
|
|
||||||
# ── Callable imports ──────────────────────────────────────────────────────────
|
# ── Callable imports ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
from tools.web import search as _web_search, http_fetch as _http_fetch
|
from tools.web import search as _web_search, http_fetch as _http_fetch, web_read as _web_read
|
||||||
from tools.ae_knowledge import (
|
from tools.ae_knowledge import (
|
||||||
journal_list as _ae_journal_list,
|
journal_list as _ae_journal_list,
|
||||||
journal_search as _ae_journal_search,
|
journal_search as _ae_journal_search,
|
||||||
@@ -30,7 +30,7 @@ from tools.ae_knowledge import (
|
|||||||
journal_entry_prepend as _ae_journal_entry_prepend,
|
journal_entry_prepend as _ae_journal_entry_prepend,
|
||||||
)
|
)
|
||||||
from tools.ae_tasks import task_list as _ae_task_list
|
from tools.ae_tasks import task_list as _ae_task_list
|
||||||
from tools.files import file_read as _file_read, file_list as _file_list, file_write as _file_write, session_search as _session_search
|
from tools.files import file_read as _file_read, file_list as _file_list, file_write as _file_write, session_search as _session_search, session_read as _session_read
|
||||||
from tools.system import (
|
from tools.system import (
|
||||||
shell_exec as _shell_exec,
|
shell_exec as _shell_exec,
|
||||||
claude_allow_dir as _claude_allow_dir,
|
claude_allow_dir as _claude_allow_dir,
|
||||||
@@ -90,8 +90,8 @@ import tools.agents as _mod_agents
|
|||||||
# ── Tool categories — used by the Model Registry UI for grouped checkboxes ───
|
# ── Tool categories — used by the Model Registry UI for grouped checkboxes ───
|
||||||
|
|
||||||
TOOL_CATEGORIES: dict[str, list[str]] = {
|
TOOL_CATEGORIES: dict[str, list[str]] = {
|
||||||
"Web": ["web_search", "http_fetch"],
|
"Web": ["web_search", "http_fetch", "web_read"],
|
||||||
"Files": ["file_read", "file_list", "file_write", "session_search"],
|
"Files": ["file_read", "file_list", "file_write", "session_read", "session_search"],
|
||||||
"Shell": ["shell_exec", "claude_allow_dir"],
|
"Shell": ["shell_exec", "claude_allow_dir"],
|
||||||
"System": ["cortex_restart", "cortex_logs", "cortex_status", "cortex_update"],
|
"System": ["cortex_restart", "cortex_logs", "cortex_status", "cortex_update"],
|
||||||
"Tasks": ["task_list", "task_create", "task_update", "task_complete"],
|
"Tasks": ["task_list", "task_create", "task_update", "task_complete"],
|
||||||
@@ -116,6 +116,7 @@ TOOL_CATEGORIES: dict[str, list[str]] = {
|
|||||||
_CALLABLES: dict[str, callable] = {
|
_CALLABLES: dict[str, callable] = {
|
||||||
"web_search": _web_search,
|
"web_search": _web_search,
|
||||||
"http_fetch": _http_fetch,
|
"http_fetch": _http_fetch,
|
||||||
|
"web_read": _web_read,
|
||||||
"ae_journal_list": _ae_journal_list,
|
"ae_journal_list": _ae_journal_list,
|
||||||
"ae_journal_search": _ae_journal_search,
|
"ae_journal_search": _ae_journal_search,
|
||||||
"ae_journal_entry_read": _ae_journal_entry_read,
|
"ae_journal_entry_read": _ae_journal_entry_read,
|
||||||
@@ -129,6 +130,7 @@ _CALLABLES: dict[str, callable] = {
|
|||||||
"file_read": _file_read,
|
"file_read": _file_read,
|
||||||
"file_list": _file_list,
|
"file_list": _file_list,
|
||||||
"file_write": _file_write,
|
"file_write": _file_write,
|
||||||
|
"session_read": _session_read,
|
||||||
"session_search": _session_search,
|
"session_search": _session_search,
|
||||||
"shell_exec": _shell_exec,
|
"shell_exec": _shell_exec,
|
||||||
"claude_allow_dir": _claude_allow_dir,
|
"claude_allow_dir": _claude_allow_dir,
|
||||||
|
|||||||
@@ -230,6 +230,34 @@ def _sync_file_write(path: str, content: str, mode: str) -> str:
|
|||||||
_SEARCH_EXCERPT_CHARS = 150
|
_SEARCH_EXCERPT_CHARS = 150
|
||||||
|
|
||||||
|
|
||||||
|
async def session_read(date: str) -> str:
    """Return the complete session log for a single day.

    ``date`` is expected in YYYY-MM-DD form; surrounding whitespace is
    stripped before the lookup. The blocking file access is delegated to
    ``_sync_session_read`` on a worker thread so the event loop stays free.

    When no log exists for the requested date, the result lists the most
    recent available dates instead. Only the current user's own sessions are
    read (per-persona isolation via ContextVars).
    """
    requested = date.strip()
    return await asyncio.to_thread(_sync_session_read, requested)
|
||||||
|
|
||||||
|
|
||||||
|
def _sync_session_read(date: str) -> str:
    """Blocking implementation of ``session_read``.

    Looks for ``<persona>/sessions/<date>.md`` and returns its full text
    prefixed with a one-line header. If there is no log for ``date`` (or
    ``date`` is not a well-formed YYYY-MM-DD string), returns a message listing
    up to 15 available dates, most recent first.

    Args:
        date: Requested day, already stripped of surrounding whitespace.

    Returns:
        The session log, a "not found" message with available dates, or
        "No session logs found." when the directory is missing or empty.
    """
    import re

    from persona import persona_path

    sessions_dir = persona_path() / "sessions"
    if not sessions_dir.exists():
        return "No session logs found."

    # `date` comes from the model/user. Only a strict YYYY-MM-DD value may be
    # turned into a filename; anything else (e.g. "../other/2026-01-01") would
    # let the path escape this persona's sessions directory.
    if re.fullmatch(r"[0-9]{4}-[0-9]{2}-[0-9]{2}", date):
        target = sessions_dir / f"{date}.md"
        # is_file() rather than exists(): a directory with a matching name
        # must fall through to the listing instead of failing on read.
        if target.is_file():
            content = target.read_text()
            return f"Session log for {date} ({len(content)} chars):\n\n{content}"

    available = sorted((f.stem for f in sessions_dir.glob("*.md")), reverse=True)
    if not available:
        return "No session logs found."
    recent = "\n".join(f" {d}" for d in available[:15])
    return f"No session log found for '{date}'. Available dates (most recent first):\n{recent}"
|
||||||
|
|
||||||
|
|
||||||
async def session_search(query: str, limit: int = 5) -> str:
|
async def session_search(query: str, limit: int = 5) -> str:
|
||||||
"""Search past session logs for a keyword or phrase.
|
"""Search past session logs for a keyword or phrase.
|
||||||
|
|
||||||
@@ -329,6 +357,22 @@ DECLARATIONS = [
|
|||||||
required=["path", "content"],
|
required=["path", "content"],
|
||||||
),
|
),
|
||||||
),
|
),
|
||||||
|
types.FunctionDeclaration(
|
||||||
|
name="session_read",
|
||||||
|
description=(
|
||||||
|
"Read a full session log by date (YYYY-MM-DD). Returns the complete conversation "
|
||||||
|
"from that session — useful for continuity, recalling decisions, or reviewing "
|
||||||
|
"what was discussed on a specific day. If the date is not found, lists available dates. "
|
||||||
|
"Only reads this user's own sessions."
|
||||||
|
),
|
||||||
|
parameters=types.Schema(
|
||||||
|
type=types.Type.OBJECT,
|
||||||
|
properties={
|
||||||
|
"date": types.Schema(type=types.Type.STRING, description="Date in YYYY-MM-DD format (e.g. '2026-05-08')"),
|
||||||
|
},
|
||||||
|
required=["date"],
|
||||||
|
),
|
||||||
|
),
|
||||||
types.FunctionDeclaration(
|
types.FunctionDeclaration(
|
||||||
name="session_search",
|
name="session_search",
|
||||||
description=(
|
description=(
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
"""
|
"""
|
||||||
Web tools — search (DuckDuckGo) and direct HTTP fetch.
|
Web tools — search (DuckDuckGo), direct HTTP fetch, and clean content extraction.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
@@ -56,20 +56,25 @@ async def http_fetch(
|
|||||||
method: str = "GET",
|
method: str = "GET",
|
||||||
body: str | None = None,
|
body: str | None = None,
|
||||||
timeout: int = 15,
|
timeout: int = 15,
|
||||||
|
max_chars: int = 8192,
|
||||||
) -> str:
|
) -> str:
|
||||||
"""Fetch a URL directly and return the response body.
|
"""Fetch a URL directly and return the raw response body.
|
||||||
|
|
||||||
Unlike web_search, this hits a specific URL — useful for health checks,
|
Unlike web_search, this hits a specific URL — useful for health checks,
|
||||||
API probing, JSON endpoints, webhook testing, etc.
|
API probing, JSON endpoints, webhook testing, or reading raw page source.
|
||||||
Response body is capped at 8 KB.
|
For readable article content, use web_read instead.
|
||||||
|
Response body is capped at max_chars (default 8192, max 32768).
|
||||||
"""
|
"""
|
||||||
method = method.upper()
|
method = method.upper()
|
||||||
timeout = min(max(int(timeout), 1), 60)
|
timeout = min(max(int(timeout), 1), 60)
|
||||||
|
max_chars = min(max(int(max_chars), 100), 32768)
|
||||||
try:
|
try:
|
||||||
async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
|
async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
|
||||||
resp = await client.request(method, url, content=body)
|
resp = await client.request(method, url, content=body)
|
||||||
body_text = resp.text[:8192]
|
body_text = resp.text[:max_chars]
|
||||||
return f"HTTP {resp.status_code} {resp.url}\n\n{body_text}"
|
truncated = len(resp.text) > max_chars
|
||||||
|
suffix = f"\n\n[… truncated at {max_chars} chars]" if truncated else ""
|
||||||
|
return f"HTTP {resp.status_code} {resp.url}\n\n{body_text}{suffix}"
|
||||||
except httpx.HTTPError as e:
|
except httpx.HTTPError as e:
|
||||||
return f"HTTP error: {e}"
|
return f"HTTP error: {e}"
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -77,6 +82,39 @@ async def http_fetch(
|
|||||||
return f"Error: {e}"
|
return f"Error: {e}"
|
||||||
|
|
||||||
|
|
||||||
|
async def web_read(url: str, max_chars: int = 16000) -> str:
    """Fetch a page and return its main readable content.

    Extraction is done with trafilatura, which drops ads, navigation and other
    boilerplate — a good fit for blog posts, documentation and news articles.
    Output is markdown-formatted. For raw responses (JSON APIs, health
    checks), use http_fetch instead.

    ``max_chars`` is coerced to ``int`` and clamped to the range 1000–32000
    before the blocking work is handed to a worker thread.
    """
    requested = int(max_chars)
    limit = max(1000, min(32000, requested))
    return await asyncio.to_thread(_sync_web_read, url, limit)
|
||||||
|
|
||||||
|
|
||||||
|
def _sync_web_read(url: str, max_chars: int) -> str:
|
||||||
|
try:
|
||||||
|
import trafilatura
|
||||||
|
except ImportError:
|
||||||
|
return "web_read requires trafilatura — run: pip install trafilatura"
|
||||||
|
|
||||||
|
downloaded = trafilatura.fetch_url(url)
|
||||||
|
if downloaded is None:
|
||||||
|
return f"Failed to download content from: {url}"
|
||||||
|
|
||||||
|
text = trafilatura.extract(downloaded, output_format="markdown", include_links=True, url=url)
|
||||||
|
if not text:
|
||||||
|
text = trafilatura.extract(downloaded, url=url)
|
||||||
|
if not text:
|
||||||
|
return f"Could not extract readable content from: {url}"
|
||||||
|
|
||||||
|
if len(text) > max_chars:
|
||||||
|
text = text[:max_chars] + f"\n\n[… truncated at {max_chars} chars — pass a larger max_chars to see more]"
|
||||||
|
return f"Content from {url}:\n\n{text}"
|
||||||
|
|
||||||
|
|
||||||
DECLARATIONS = [
|
DECLARATIONS = [
|
||||||
types.FunctionDeclaration(
|
types.FunctionDeclaration(
|
||||||
name="web_search",
|
name="web_search",
|
||||||
@@ -96,10 +134,10 @@ DECLARATIONS = [
|
|||||||
types.FunctionDeclaration(
|
types.FunctionDeclaration(
|
||||||
name="http_fetch",
|
name="http_fetch",
|
||||||
description=(
|
description=(
|
||||||
"Fetch a specific URL and return the response. Unlike web_search, this hits "
|
"Fetch a specific URL and return the raw response body. Unlike web_search, this hits "
|
||||||
"a direct URL — useful for health checks, JSON API endpoints, webhook testing, "
|
"a direct URL — useful for health checks, JSON API endpoints, webhook testing, "
|
||||||
"or reading a specific page when you already know the URL. "
|
"or inspecting raw page source. For readable article/doc content, use web_read instead. "
|
||||||
"Response body is capped at 8 KB."
|
"Response body is capped at max_chars (default 8192, max 32768)."
|
||||||
),
|
),
|
||||||
parameters=types.Schema(
|
parameters=types.Schema(
|
||||||
type=types.Type.OBJECT,
|
type=types.Type.OBJECT,
|
||||||
@@ -108,6 +146,25 @@ DECLARATIONS = [
|
|||||||
"method": types.Schema(type=types.Type.STRING, description="HTTP method: GET (default), POST, HEAD"),
|
"method": types.Schema(type=types.Type.STRING, description="HTTP method: GET (default), POST, HEAD"),
|
||||||
"body": types.Schema(type=types.Type.STRING, description="Optional request body (for POST requests)"),
|
"body": types.Schema(type=types.Type.STRING, description="Optional request body (for POST requests)"),
|
||||||
"timeout": types.Schema(type=types.Type.INTEGER, description="Request timeout in seconds (default 15, max 60)"),
|
"timeout": types.Schema(type=types.Type.INTEGER, description="Request timeout in seconds (default 15, max 60)"),
|
||||||
|
"max_chars": types.Schema(type=types.Type.INTEGER, description="Max characters to return (default 8192, max 32768)"),
|
||||||
|
},
|
||||||
|
required=["url"],
|
||||||
|
),
|
||||||
|
),
|
||||||
|
types.FunctionDeclaration(
|
||||||
|
name="web_read",
|
||||||
|
description=(
|
||||||
|
"Fetch a URL and extract clean readable text, stripping ads, navigation, sidebars, "
|
||||||
|
"and other boilerplate. Returns the main article/document content as markdown. "
|
||||||
|
"Use this for blog posts, documentation, news articles, GitHub READMEs, or any page "
|
||||||
|
"where you want the content without surrounding noise. "
|
||||||
|
"For raw HTTP responses (JSON APIs, health checks, source inspection), use http_fetch."
|
||||||
|
),
|
||||||
|
parameters=types.Schema(
|
||||||
|
type=types.Type.OBJECT,
|
||||||
|
properties={
|
||||||
|
"url": types.Schema(type=types.Type.STRING, description="Full URL to fetch and extract"),
|
||||||
|
"max_chars": types.Schema(type=types.Type.INTEGER, description="Max characters to return (default 16000, max 32000)"),
|
||||||
},
|
},
|
||||||
required=["url"],
|
required=["url"],
|
||||||
),
|
),
|
||||||
|
|||||||
Reference in New Issue
Block a user