diff --git a/cortex/requirements.txt b/cortex/requirements.txt index 816a287..fd2be4a 100644 --- a/cortex/requirements.txt +++ b/cortex/requirements.txt @@ -19,6 +19,9 @@ python-multipart>=0.0.9 # required by FastAPI for Form() data # Async HTTP client — used for local OpenAI-compatible backend (Open WebUI / Ollama) httpx>=0.27.0 +# Web content extraction — strips ads/nav/boilerplate, returns clean article text +trafilatura>=1.6.0 + # OpenAI-compatible client — tool calling for OpenRouter / LiteLLM / any OAI-compat host openai>=1.0.0 diff --git a/cortex/tools/__init__.py b/cortex/tools/__init__.py index aa9ac00..e4d1e0a 100644 --- a/cortex/tools/__init__.py +++ b/cortex/tools/__init__.py @@ -17,7 +17,7 @@ from google.genai import types # ── Callable imports ────────────────────────────────────────────────────────── -from tools.web import search as _web_search, http_fetch as _http_fetch +from tools.web import search as _web_search, http_fetch as _http_fetch, web_read as _web_read from tools.ae_knowledge import ( journal_list as _ae_journal_list, journal_search as _ae_journal_search, @@ -30,7 +30,7 @@ from tools.ae_knowledge import ( journal_entry_prepend as _ae_journal_entry_prepend, ) from tools.ae_tasks import task_list as _ae_task_list -from tools.files import file_read as _file_read, file_list as _file_list, file_write as _file_write, session_search as _session_search +from tools.files import file_read as _file_read, file_list as _file_list, file_write as _file_write, session_search as _session_search, session_read as _session_read from tools.system import ( shell_exec as _shell_exec, claude_allow_dir as _claude_allow_dir, @@ -90,8 +90,8 @@ import tools.agents as _mod_agents # ── Tool categories — used by the Model Registry UI for grouped checkboxes ─── TOOL_CATEGORIES: dict[str, list[str]] = { - "Web": ["web_search", "http_fetch"], - "Files": ["file_read", "file_list", "file_write", "session_search"], + "Web": ["web_search", "http_fetch", "web_read"], + "Files": ["file_read", "file_list", "file_write", "session_read", "session_search"], "Shell": ["shell_exec", "claude_allow_dir"], "System": ["cortex_restart", "cortex_logs", "cortex_status", "cortex_update"], "Tasks": ["task_list", "task_create", "task_update", "task_complete"], @@ -116,6 +116,7 @@ TOOL_CATEGORIES: dict[str, list[str]] = { _CALLABLES: dict[str, callable] = { "web_search": _web_search, "http_fetch": _http_fetch, + "web_read": _web_read, "ae_journal_list": _ae_journal_list, "ae_journal_search": _ae_journal_search, "ae_journal_entry_read": _ae_journal_entry_read, @@ -129,6 +130,7 @@ _CALLABLES: dict[str, callable] = { "file_read": _file_read, "file_list": _file_list, "file_write": _file_write, + "session_read": _session_read, "session_search": _session_search, "shell_exec": _shell_exec, "claude_allow_dir": _claude_allow_dir, diff --git a/cortex/tools/files.py b/cortex/tools/files.py index 198c613..6493ac1 100644 --- a/cortex/tools/files.py +++ b/cortex/tools/files.py @@ -230,6 +230,34 @@ def _sync_file_write(path: str, content: str, mode: str) -> str: _SEARCH_EXCERPT_CHARS = 150 +async def session_read(date: str) -> str: + """Read a full session log by date (YYYY-MM-DD). + + Returns the complete session log for that date. If the date is not found, + lists the most recent available dates instead. + Only reads the current user's own sessions (per-persona isolation via ContextVars). + """ + return await asyncio.to_thread(_sync_session_read, date.strip()) + + +def _sync_session_read(date: str) -> str: + from persona import persona_path + sessions_dir = persona_path() / "sessions" + if not sessions_dir.exists(): + return "No session logs found." + + target = sessions_dir / f"{date}.md" + if target.exists(): + content = target.read_text() + return f"Session log for {date} ({len(content)} chars):\n\n{content}" + + available = sorted([f.stem for f in sessions_dir.glob("*.md")], reverse=True) + if not available: + return "No session logs found." + recent = "\n".join(f" {d}" for d in available[:15]) + return f"No session log found for '{date}'. Available dates (most recent first):\n{recent}" + + async def session_search(query: str, limit: int = 5) -> str: """Search past session logs for a keyword or phrase. @@ -329,6 +357,22 @@ DECLARATIONS = [ required=["path", "content"], ), ), + types.FunctionDeclaration( + name="session_read", + description=( + "Read a full session log by date (YYYY-MM-DD). Returns the complete conversation " + "from that session — useful for continuity, recalling decisions, or reviewing " + "what was discussed on a specific day. If the date is not found, lists available dates. " + "Only reads this user's own sessions." + ), + parameters=types.Schema( + type=types.Type.OBJECT, + properties={ + "date": types.Schema(type=types.Type.STRING, description="Date in YYYY-MM-DD format (e.g. '2026-05-08')"), + }, + required=["date"], + ), + ), types.FunctionDeclaration( name="session_search", description=( diff --git a/cortex/tools/web.py b/cortex/tools/web.py index d9c78db..a553620 100644 --- a/cortex/tools/web.py +++ b/cortex/tools/web.py @@ -1,5 +1,5 @@ """ -Web tools — search (DuckDuckGo) and direct HTTP fetch. +Web tools — search (DuckDuckGo), direct HTTP fetch, and clean content extraction. """ import asyncio @@ -56,20 +56,25 @@ async def http_fetch( method: str = "GET", body: str | None = None, timeout: int = 15, + max_chars: int = 8192, ) -> str: - """Fetch a URL directly and return the response body. + """Fetch a URL directly and return the raw response body. Unlike web_search, this hits a specific URL — useful for health checks, - API probing, JSON endpoints, webhook testing, etc. - Response body is capped at 8 KB. + API probing, JSON endpoints, webhook testing, or reading raw page source. + For readable article content, use web_read instead. + Response body is capped at max_chars (default 8192, max 32768). """ method = method.upper() timeout = min(max(int(timeout), 1), 60) + max_chars = min(max(int(max_chars), 100), 32768) try: async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client: resp = await client.request(method, url, content=body) - body_text = resp.text[:8192] - return f"HTTP {resp.status_code} {resp.url}\n\n{body_text}" + body_text = resp.text[:max_chars] + truncated = len(resp.text) > max_chars + suffix = f"\n\n[… truncated at {max_chars} chars]" if truncated else "" + return f"HTTP {resp.status_code} {resp.url}\n\n{body_text}{suffix}" except httpx.HTTPError as e: return f"HTTP error: {e}" except Exception as e: @@ -77,6 +82,39 @@ async def http_fetch( return f"Error: {e}" +async def web_read(url: str, max_chars: int = 16000) -> str: + """Fetch a URL and extract clean readable text, stripping ads, navigation, and boilerplate. + + Uses trafilatura to extract the main article content — ideal for blog posts, + documentation, news articles, and any page where you want the text without + surrounding noise. Returns markdown-formatted output. + For raw responses (JSON APIs, health checks), use http_fetch instead. + """ + max_chars = min(max(int(max_chars), 1000), 32000) + return await asyncio.to_thread(_sync_web_read, url, max_chars) + + +def _sync_web_read(url: str, max_chars: int) -> str: + try: + import trafilatura + except ImportError: + return "web_read requires trafilatura — run: pip install trafilatura" + + downloaded = trafilatura.fetch_url(url) + if downloaded is None: + return f"Failed to download content from: {url}" + + text = trafilatura.extract(downloaded, output_format="markdown", include_links=True, url=url) + if not text: + text = trafilatura.extract(downloaded, url=url) + if not text: + return f"Could not extract readable content from: {url}" + + if len(text) > max_chars: + text = text[:max_chars] + f"\n\n[… truncated at {max_chars} chars — pass a larger max_chars to see more]" + return f"Content from {url}:\n\n{text}" + + DECLARATIONS = [ types.FunctionDeclaration( name="web_search", @@ -96,10 +134,10 @@ DECLARATIONS = [ types.FunctionDeclaration( name="http_fetch", description=( - "Fetch a specific URL and return the response. Unlike web_search, this hits " + "Fetch a specific URL and return the raw response body. Unlike web_search, this hits " "a direct URL — useful for health checks, JSON API endpoints, webhook testing, " - "or reading a specific page when you already know the URL. " - "Response body is capped at 8 KB." + "or inspecting raw page source. For readable article/doc content, use web_read instead. " + "Response body is capped at max_chars (default 8192, max 32768)." ), parameters=types.Schema( type=types.Type.OBJECT, @@ -108,6 +146,25 @@ DECLARATIONS = [ "method": types.Schema(type=types.Type.STRING, description="HTTP method: GET (default), POST, HEAD"), "body": types.Schema(type=types.Type.STRING, description="Optional request body (for POST requests)"), "timeout": types.Schema(type=types.Type.INTEGER, description="Request timeout in seconds (default 15, max 60)"), + "max_chars": types.Schema(type=types.Type.INTEGER, description="Max characters to return (default 8192, max 32768)"), + }, + required=["url"], + ), + ), + types.FunctionDeclaration( + name="web_read", + description=( + "Fetch a URL and extract clean readable text, stripping ads, navigation, sidebars, " + "and other boilerplate. Returns the main article/document content as markdown. " + "Use this for blog posts, documentation, news articles, GitHub READMEs, or any page " + "where you want the content without surrounding noise. " + "For raw HTTP responses (JSON APIs, health checks, source inspection), use http_fetch." + ), + parameters=types.Schema( + type=types.Type.OBJECT, + properties={ + "url": types.Schema(type=types.Type.STRING, description="Full URL to fetch and extract"), + "max_chars": types.Schema(type=types.Type.INTEGER, description="Max characters to return (default 16000, max 32000)"), }, required=["url"], ),