diff --git a/cortex/model_registry.py b/cortex/model_registry.py
index aa92a99..f2e7b02 100644
--- a/cortex/model_registry.py
+++ b/cortex/model_registry.py
@@ -122,13 +122,26 @@ def _empty() -> dict:
     return {"version": 1, "hosts": [], "models": [], "roles": {}}
 
 
+def _normalize(data: dict) -> dict:
+    """Back-fill any missing fields introduced by schema additions.
+
+    Mutates and returns data. Tolerates "hosts": null and skips entries that
+    are not dicts, so a hand-edited registry cannot raise past _load's
+    JSON/OS error handling.
+    """
+    for h in data.get("hosts") or []:
+        if isinstance(h, dict):
+            h.setdefault("host_type", "openwebui")
+    return data
+
+
 def _load(username: str) -> dict:
     path = _registry_path(username)
     if path.exists():
         try:
             data = json.loads(path.read_text())
             if isinstance(data, dict) and "version" in data:
-                return data
+                return _normalize(data)
         except (json.JSONDecodeError, OSError):
             logger.warning("model_registry.json for %s is unreadable — starting fresh", username)
     return _empty()
diff --git a/cortex/openai_orchestrator.py b/cortex/openai_orchestrator.py
new file mode 100644
index 0000000..24c3280
--- /dev/null
+++ b/cortex/openai_orchestrator.py
@@ -0,0 +1,217 @@
+"""
+OpenAI-compatible orchestrator engine.
+
+Implements the same ReAct tool loop as orchestrator_engine.py but uses the
+OpenAI tool calling format, which works with any OpenAI-compatible endpoint:
+OpenRouter, LiteLLM, Open WebUI, Ollama (tool-capable models), etc.
+
+The model both runs the tool loop AND writes the final user-facing response —
+no separate handoff step needed when a single capable model handles everything.
+
+Flow:
+  1. POST to {api_url}/chat/completions with tools + user message
+  2. If the assistant message carries tool_calls: execute tools, feed results back, repeat
+  3. Otherwise: the assistant message is the user-facing response
+
+Used when the "orchestrator" role in the model registry resolves to a local_openai
+type model. The Gemini engine (orchestrator_engine.py) is used otherwise.
+"""
+
+import asyncio
+import json
+import logging
+
+from openai import AsyncOpenAI
+
+from config import settings
+from orchestrator_engine import OrchestratorResult
+from tools import OPENAI_TOOL_SCHEMAS, call_tool
+
+logger = logging.getLogger(__name__)
+
+# Appended to the persona system prompt so the model knows it has tools.
+# Kept brief — capable models handle tool use without much coaching.
+_TOOL_INSTRUCTION = (
+    "\n\nYou have access to tools. Use them when you need current information, "
+    "need to read files, or need to take actions on the user's behalf. "
+    "Respond naturally after gathering what you need."
+)
+
+
+async def run(
+    task: str,
+    system_prompt: str = "",
+    session_messages: list[dict] | None = None,
+    model_cfg: dict | None = None,
+    respond_with_final: bool = True,
+) -> OrchestratorResult:
+    """
+    Run a tool-enabled task using an OpenAI-compatible API.
+
+    Args:
+        task: The user's request (plain text)
+        system_prompt: Persona system prompt from context_loader (passed through)
+        session_messages: Recent conversation history for session continuity
+        model_cfg: Resolved model config from model_registry (local_openai type)
+        respond_with_final: Currently not consulted by this engine — the same model
+                            always writes the final response, so there is no separate step to skip.
+
+    Returns:
+        OrchestratorResult — same shape as the Gemini engine for drop-in compatibility
+
+    Raises:
+        RuntimeError: model_cfg is missing or incomplete, or the endpoint returned
+                      a completion with no choices
+    """
+    if not model_cfg:
+        raise RuntimeError("model_cfg is required for the OpenAI orchestrator")
+
+    api_url = model_cfg.get("api_url", "")
+    api_key = model_cfg.get("api_key", "") or "none"
+    model_name = model_cfg.get("model_name", "")
+
+    if not api_url or not model_name:
+        # Name the config by label/id only — never format the whole dict, it holds api_key
+        cfg_label = model_cfg.get("label") or model_cfg.get("id") or "<unlabelled>"
+        raise RuntimeError(f"model_cfg missing api_url or model_name: {cfg_label}")
+
+    # System prompt: persona context + brief tool instruction
+    sys_content = (system_prompt or "") + _TOOL_INSTRUCTION
+
+    # Build messages: [system, ...recent_session, current_task]
+    messages: list[dict] = [{"role": "system", "content": sys_content}]
+    if session_messages:
+        messages.extend(session_messages[-6:])  # last 3 turns for context
+    messages.append({"role": "user", "content": task})
+
+    tool_call_log: list[dict] = []
+    final_response = ""
+
+    # The context manager closes the client's HTTP connections on exit, even when
+    # a request raises.
+    async with AsyncOpenAI(base_url=api_url, api_key=api_key) as client:
+        for round_num in range(settings.orchestrator_max_rounds):
+            logger.info("OpenAI orchestrator round %d / %d model=%s",
+                        round_num + 1, settings.orchestrator_max_rounds, model_name)
+
+            response = await client.chat.completions.create(
+                model=model_name,
+                messages=messages,
+                tools=OPENAI_TOOL_SCHEMAS,
+                tool_choice="auto",
+            )
+
+            if not response.choices:
+                raise RuntimeError(
+                    f"{model_name} returned a completion with no choices"
+                )
+            msg = response.choices[0].message
+
+            # Append the assistant turn (MUST include tool_calls if present so the
+            # next request is valid — OpenAI requires the full history to be consistent)
+            assistant_msg: dict = {"role": "assistant"}
+            if msg.content:
+                assistant_msg["content"] = msg.content
+            if msg.tool_calls:
+                assistant_msg["tool_calls"] = [
+                    {
+                        "id": tc.id,
+                        "type": "function",
+                        "function": {
+                            "name": tc.function.name,
+                            "arguments": tc.function.arguments,
+                        },
+                    }
+                    for tc in msg.tool_calls
+                ]
+            messages.append(assistant_msg)
+
+            if not msg.tool_calls:
+                # No tool requests — model is done
+                final_response = msg.content or ""
+                logger.info(
+                    "OpenAI orchestrator done after %d round(s). Tools used: %d",
+                    round_num + 1, len(tool_call_log),
+                )
+                break
+
+            # Branch on tool_calls rather than finish_reason: not every
+            # OpenAI-compatible server reports "tool_calls" when tools are requested.
+            # Execute all tool calls in parallel, then feed results back
+            tool_tasks = [
+                _execute_tool(tc.function.name, tc.function.arguments)
+                for tc in msg.tool_calls
+            ]
+            results = await asyncio.gather(*tool_tasks, return_exceptions=True)
+
+            for tc, result in zip(msg.tool_calls, results):
+                result_str = (
+                    str(result)
+                    if not isinstance(result, Exception)
+                    else f"Tool error: {result}"
+                )
+                logger.info("Tool %s → %d chars", tc.function.name, len(result_str))
+
+                try:
+                    args_parsed = json.loads(tc.function.arguments)
+                except (json.JSONDecodeError, TypeError):
+                    args_parsed = {"raw": tc.function.arguments}
+
+                tool_call_log.append({
+                    "tool": tc.function.name,
+                    "args": args_parsed,
+                    "result": result_str,
+                })
+
+                # Tool result message — tools array must be re-sent on every request
+                messages.append({
+                    "role": "tool",
+                    "tool_call_id": tc.id,
+                    "content": result_str,
+                })
+
+        else:
+            # Hit the round limit
+            logger.warning(
+                "OpenAI orchestrator hit max rounds (%d)", settings.orchestrator_max_rounds
+            )
+            final_response = (
+                f"Reached the tool iteration limit ({settings.orchestrator_max_rounds} rounds). "
+                "Here is what was gathered:\n\n"
+                + "\n\n".join(
+                    f"**{t['tool']}**: {t['result'][:500]}" for t in tool_call_log
+                )
+            )
+
+    model_label = model_cfg.get("label") or model_name
+    logger.info("OpenAI orchestrator complete — model=%s tools=%d", model_label, len(tool_call_log))
+
+    return OrchestratorResult(
+        response=final_response,
+        tool_calls=tool_call_log,
+        backend="local",
+        gemini_summary=final_response,  # reused for UI display; same content in single-model mode
+    )
+
+
+async def _execute_tool(name: str, arguments_json: str) -> str:
+    """Parse tool arguments and execute, returning a string result.
+
+    Bad arguments and tool failures are reported as "Tool error: ..." text
+    rather than raised, so the model sees them in the next round.
+    """
+    if not (arguments_json or "").strip():
+        args = {}  # zero-argument call
+    else:
+        try:
+            args = json.loads(arguments_json)
+        except json.JSONDecodeError as e:
+            logger.warning("Tool %s got malformed JSON arguments: %s", name, e)
+            return f"Tool error: arguments were not valid JSON ({e})"
+        if not isinstance(args, dict):
+            return "Tool error: arguments must be a JSON object"
+    try:
+        return await call_tool(name, args)
+    except Exception as e:
+        logger.warning("Tool %s failed: %s", name, e)
+        return f"Tool error: {e}"
diff --git a/cortex/requirements.txt b/cortex/requirements.txt
index dc7ec97..0350613 100644
--- a/cortex/requirements.txt
+++ b/cortex/requirements.txt
@@ -19,5 +19,8 @@ python-multipart>=0.0.9 # required by FastAPI for Form() data
 # Async HTTP client — used for local OpenAI-compatible backend (Open WebUI / Ollama)
 httpx>=0.27.0
 
+# OpenAI-compatible client — tool calling for OpenRouter / LiteLLM / any OAI-compat host
+openai>=1.0.0
+
 # anthropic SDK not needed — using claude CLI subprocess for auth
 # anthropic>=0.40.0
diff
--git a/cortex/routers/chat.py b/cortex/routers/chat.py index 6977db8..eb1608e 100644 --- a/cortex/routers/chat.py +++ b/cortex/routers/chat.py @@ -18,14 +18,14 @@ import event_bus router = APIRouter() -def _backend_label(backend: str, username: str) -> str: +def _backend_label(backend: str, username: str, role: str = "chat") -> str: """Human-readable label for the model that handled a request.""" if backend == "claude": return "Claude" if backend == "gemini": return "Gemini" if backend == "local": - cfg = model_registry.get_best_local_model(username) + cfg = model_registry.get_best_local_model(username, role) if cfg: return cfg.get("label") or cfg.get("model_name") or "Local" return "Local" @@ -113,14 +113,16 @@ async def _stream_chat(req: ChatRequest): if not req.off_record: log_turn(session_id, req.message, response_text) - requested = req.model or settings.primary_backend + # fallback_used only makes sense for explicit backend selections. + # In auto mode (req.model is None), just report what responded. 
+ fallback_used = bool(req.model and actual_backend != req.model) payload = { "type": "response", "response": response_text, "session_id": session_id, "backend": actual_backend, - "backend_label": _backend_label(actual_backend, user), - "fallback_used": actual_backend != requested, + "backend_label": _backend_label(actual_backend, user, role="chat"), + "fallback_used": fallback_used, } yield f"data: {json.dumps(payload)}\n\n" diff --git a/cortex/routers/orchestrator.py b/cortex/routers/orchestrator.py index 330ccd3..82017c5 100644 --- a/cortex/routers/orchestrator.py +++ b/cortex/routers/orchestrator.py @@ -22,7 +22,9 @@ from auth_utils import get_user_gemini_key from config import settings from context_loader import load_context from persona import set_context, validate as validate_persona +import model_registry import orchestrator_engine +import openai_orchestrator logger = logging.getLogger(__name__) router = APIRouter(prefix="/orchestrate", tags=["orchestrator"]) @@ -157,13 +159,25 @@ async def _run_job(job_id: str, req: OrchestrateRequest, user: str) -> None: history = load_session(session_id) session_messages = history or None - result = await orchestrator_engine.run( - task=req.task, - system_prompt=system_prompt, - session_messages=session_messages, - respond_with_claude=req.respond_with_claude, - gemini_api_key=get_user_gemini_key(user), - ) + # Choose engine based on the orchestrator role in the model registry + orch_model = model_registry.get_model_for_role(user, "orchestrator") + + if orch_model and orch_model.get("type") == "local_openai": + result = await openai_orchestrator.run( + task=req.task, + system_prompt=system_prompt, + session_messages=session_messages, + model_cfg=orch_model, + respond_with_final=req.respond_with_claude, + ) + else: + result = await orchestrator_engine.run( + task=req.task, + system_prompt=system_prompt, + session_messages=session_messages, + respond_with_claude=req.respond_with_claude, + 
gemini_api_key=get_user_gemini_key(user), + ) # Save the turn to the session store so it survives a page refresh history.append({"role": "user", "content": req.task}) diff --git a/cortex/static/app.js b/cortex/static/app.js index 9960580..45cbc27 100644 --- a/cortex/static/app.js +++ b/cortex/static/app.js @@ -84,7 +84,7 @@ if (helpLink) helpLink.href = `/help?persona=${encodeURIComponent(CORTEX_PERSONA)}`; let sessionId = null; - let primaryBackend = 'claude'; + let primaryBackend = null; // null = auto / role-based routing let activeController = null; let currentHistory = []; // mirrors backend session [{role, content}, ...] let talkThinkingDiv = null; // pending "thinking…" bubble for live Talk updates @@ -340,23 +340,30 @@ } // ── Backend toggle ─────────────────────────────────────────── + // null = "auto" — uses role-based routing from model registry + // 'claude' / 'gemini' / 'local' = explicit override - fetch('/backend').then(r => r.json()).then(d => setBackendUI(d)); + // On load only fetch local_model hint; don't override primaryBackend default (null) + fetch('/backend').then(r => r.json()).then(d => { + if (backendModelHint && d.local_model) { + // Pre-fill hint in case user is already in local mode + backendModelHint.textContent = d.local_model.label || d.local_model.model_name; + } + }); - const BACKEND_CYCLE = ['claude', 'gemini', 'local']; + const BACKEND_CYCLE = [null, 'claude', 'gemini', 'local']; const BACKEND_CLASS = { claude: '', gemini: 'mem-on', local: 'local-on' }; const backendModelHint = document.getElementById('backend-model-hint'); - function setBackendUI(d) { - const backend = d.primary || d; // accept full response obj or bare string + function setBackendUI(backend, localModel) { primaryBackend = backend; - backendToggle.textContent = backend; - const extra = BACKEND_CLASS[backend] || ''; + backendToggle.textContent = backend === null ? 'auto' : backend; + const extra = backend === null ? 
'' : (BACKEND_CLASS[backend] || ''); backendToggle.className = 'ctx-btn' + (extra ? ' ' + extra : ''); if (backendModelHint) { - if (backend === 'local' && d.local_model) { - backendModelHint.textContent = d.local_model.label || d.local_model.model_name; + if (backend === 'local' && localModel) { + backendModelHint.textContent = localModel.label || localModel.model_name; backendModelHint.style.display = ''; } else { backendModelHint.textContent = ''; @@ -365,17 +372,26 @@ } } + // Initialize to auto mode + setBackendUI(null, null); + backendToggle.addEventListener('click', async () => { const idx = BACKEND_CYCLE.indexOf(primaryBackend); const next = BACKEND_CYCLE[(idx + 1) % BACKEND_CYCLE.length]; - const res = await fetch('/backend', { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ primary: next }), - }); - const d = await res.json(); - setBackendUI(d); - addMessage('system', `Backend: ${d.primary} (fallback: ${d.fallback})`); + if (next === null) { + // Auto: role-based routing — no server call needed + setBackendUI(null, null); + addMessage('system', 'Backend: auto (role-based routing)'); + } else { + const res = await fetch('/backend', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ primary: next }), + }); + const d = await res.json(); + setBackendUI(next, d.local_model); + addMessage('system', `Backend: ${next} (fallback: ${d.fallback})`); + } }); // ── Sessions panel ─────────────────────────────────────────── @@ -917,42 +933,15 @@ if (activeController) activeController.abort(); }); - async function sendMessage() { - const text = inputEl.value.trim(); - if (!text || activeController) return; - - inputEl.value = ''; - syncHeight(); - sendBtn.style.display = 'none'; - stopBtn.style.display = 'flex'; - headerEmoji.classList.add('processing'); - - activeController = new AbortController(); - - const userHistIdx = currentHistory.length; - currentHistory.push({ role: 
'user', content: text }); - const userMsgDiv = addMessage('user', text); - attachHistoryControls(userMsgDiv, userHistIdx); - scrollToBottom(); - - const thinkingDiv = addMessage('assistant thinking', '✨ thinking…'); - + // ── Chat fetch + SSE handler ───────────────────────────────── + // Extracted so the retry button can call it without re-adding the + // user message to the DOM or currentHistory. + async function _doSend(payload, thinkingDiv) { try { const res = await fetch('/chat', { method: 'POST', headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ - message: text, - session_id: sessionId, - tier: currentTier, - include_long: memLong, - include_mid: memMid, - include_short: memShort, - off_record: current_mode === 'otr', - model: primaryBackend, - user: CORTEX_USER, - persona: CORTEX_PERSONA, - }), + body: JSON.stringify(payload), signal: activeController.signal, }); @@ -1004,10 +993,77 @@ thinkingDiv.className = 'message system'; thinkingDiv.textContent = 'Stopped.'; } else { + // Show error + retry button thinkingDiv.className = 'message error'; - thinkingDiv.textContent = `Error: ${err.message}`; + thinkingDiv.innerHTML = ''; + + const errSpan = document.createElement('span'); + errSpan.textContent = `Error: ${err.message}`; + thinkingDiv.appendChild(errSpan); + + const retryBtn = document.createElement('button'); + retryBtn.className = 'retry-btn'; + retryBtn.textContent = '↺ Retry'; + retryBtn.addEventListener('click', async () => { + // Roll back the failed user push, re-push, and try again + if (currentHistory.at(-1)?.role === 'user') currentHistory.pop(); + currentHistory.push({ role: 'user', content: payload.message }); + + thinkingDiv.className = 'message assistant thinking'; + thinkingDiv.textContent = '✨ thinking…'; + + activeController = new AbortController(); + sendBtn.style.display = 'none'; + stopBtn.style.display = 'flex'; + headerEmoji.classList.add('processing'); + + await _doSend(payload, thinkingDiv); + + 
activeController = null; + headerEmoji.classList.remove('processing'); + sendBtn.style.display = 'block'; + stopBtn.style.display = 'none'; + inputEl.focus(); + }); + thinkingDiv.appendChild(retryBtn); } } + } + + async function sendMessage() { + const text = inputEl.value.trim(); + if (!text || activeController) return; + + inputEl.value = ''; + syncHeight(); + sendBtn.style.display = 'none'; + stopBtn.style.display = 'flex'; + headerEmoji.classList.add('processing'); + + activeController = new AbortController(); + + const userHistIdx = currentHistory.length; + currentHistory.push({ role: 'user', content: text }); + const userMsgDiv = addMessage('user', text); + attachHistoryControls(userMsgDiv, userHistIdx); + scrollToBottom(); + + const thinkingDiv = addMessage('assistant thinking', '✨ thinking…'); + + const payload = { + message: text, + session_id: sessionId, + tier: currentTier, + include_long: memLong, + include_mid: memMid, + include_short: memShort, + off_record: current_mode === 'otr', + model: primaryBackend, + user: CORTEX_USER, + persona: CORTEX_PERSONA, + }; + + await _doSend(payload, thinkingDiv); activeController = null; headerEmoji.classList.remove('processing'); diff --git a/cortex/static/style.css b/cortex/static/style.css index 4c6716d..d6ae675 100644 --- a/cortex/static/style.css +++ b/cortex/static/style.css @@ -565,6 +565,26 @@ } .model-tag.fallback { color: #f59e0b; } + /* Retry button — shown in error message bubbles */ + .retry-btn { + display: inline-block; + margin-top: 0.6rem; + margin-left: 0.15rem; + padding: 0.25rem 0.7rem; + font-size: 0.78rem; + font-family: inherit; + background: transparent; + color: var(--error-text); + border: 1px solid var(--error-border); + border-radius: 4px; + cursor: pointer; + transition: background 0.15s, color 0.15s; + } + .retry-btn:hover { + background: var(--error-border); + color: #fff; + } + /* Note messages */ .message.note-private { align-self: flex-end; diff --git a/cortex/tools/__init__.py 
b/cortex/tools/__init__.py
index b400811..6677848 100644
--- a/cortex/tools/__init__.py
+++ b/cortex/tools/__init__.py
@@ -551,3 +551,82 @@ async def call_tool(name: str, args: dict) -> str:
     if fn is None:
         return f"Unknown tool: {name}"
     return await fn(**args)
+
+
+# ---------------------------------------------------------------------------
+# OpenAI JSON Schema format — auto-derived from the Gemini declarations above
+# so there is a single source of truth for tool definitions.
+# ---------------------------------------------------------------------------
+
+_GEMINI_TYPE_TO_JSON = {
+    "OBJECT": "object",
+    "STRING": "string",
+    "INTEGER": "integer",
+    "NUMBER": "number",
+    "BOOLEAN": "boolean",
+    "ARRAY": "array",
+}
+
+
+def _schema_to_json(schema) -> dict:
+    """Recursively convert a Gemini types.Schema to a JSON Schema dict.
+
+    Carries over type, description, enum, required, object properties and
+    array items. An array with no declared element schema gets string items,
+    since strict OpenAI-compatible validators reject arrays without "items".
+    """
+    raw_type = getattr(schema, "type", None)
+    # Normally an enum member with .name; tolerate a bare string such as "object"
+    type_name = getattr(raw_type, "name", None) or (
+        raw_type if isinstance(raw_type, str) else "STRING"
+    )
+    result: dict = {"type": _GEMINI_TYPE_TO_JSON.get(type_name.upper(), "string")}
+
+    if getattr(schema, "description", None):
+        result["description"] = schema.description
+
+    enum_values = getattr(schema, "enum", None)
+    if enum_values:
+        result["enum"] = list(enum_values)
+
+    if result["type"] == "object":
+        props = getattr(schema, "properties", None) or {}
+        result["properties"] = {k: _schema_to_json(v) for k, v in props.items()}
+    elif result["type"] == "array":
+        items = getattr(schema, "items", None)
+        result["items"] = _schema_to_json(items) if items is not None else {"type": "string"}
+
+    req = getattr(schema, "required", None)
+    if req:
+        result["required"] = list(req)
+
+    return result
+
+
+def _build_openai_tools() -> list[dict]:
+    """Convert TOOL_DECLARATIONS (Gemini format) to OpenAI tool schemas.
+
+    Walks every entry of TOOL_DECLARATIONS, not just the first, and skips
+    entries that carry no function declarations.
+    """
+    out = []
+    for tool in TOOL_DECLARATIONS:
+        for decl in getattr(tool, "function_declarations", None) or []:
+            params = (
+                _schema_to_json(decl.parameters)
+                if decl.parameters
+                else {"type": "object", "properties": {}}
+            )
+            out.append({
+                "type": "function",
+                "function": {
+                    "name": decl.name,
+                    "description": decl.description or "",
+                    "parameters": params,
+                },
+            })
+    return out
+
+
+# OpenAI-format tool list — pass to client.chat.completions.create(tools=...)
+OPENAI_TOOL_SCHEMAS: list[dict] = _build_openai_tools() diff --git a/documentation/ARCH__BACKENDS.md b/documentation/ARCH__BACKENDS.md index 0b549c5..e8e3b54 100644 --- a/documentation/ARCH__BACKENDS.md +++ b/documentation/ARCH__BACKENDS.md @@ -1,47 +1,130 @@ # Architecture: LLM Backends -> How Cortex talks to AI models. -> Last updated: 2026-04-03 +> How Cortex selects and talks to AI models. +> Last updated: 2026-04-06 --- -## Three Backends +## Backends -| Backend | Used for | Auth | Config | +| Backend | Type | Auth | Notes | |---|---|---|---| -| **Claude CLI** | Primary chat, all user-facing responses | OAuth token from `~/.claude/.credentials.json` | `DEFAULT_MODEL` in `.env` | -| **Gemini CLI** | Fallback when Claude unavailable | Gemini CLI credentials | Auto-fallback | -| **Local (Open WebUI)** | Private/offline tasks, cost-free use | API key per user in `local_llm.json` | `/settings/local` UI | - -The **Gemini API** (google-genai SDK) is also used — but only by the orchestrator tool loop, not as a general chat backend. See [`ARCH__FUTURE.md`](ARCH__FUTURE.md) for the orchestrator pattern. +| **Claude CLI** | `claude_cli` | OAuth token from `~/.claude/.credentials.json` | Primary chat; model set via `DEFAULT_MODEL` in `.env` | +| **Gemini CLI** | `gemini_cli` | Gemini CLI credentials | Fallback / explicit selection | +| **Gemini API** | `gemini_api` | `GEMINI_API_KEY` in `.env` | Orchestrator tool loop only — not general chat | +| **Local (OpenAI-compat)** | `local_openai` | API key per host in model registry | Open WebUI, Ollama, OpenRouter, LiteLLM, etc. | --- ## Backend Selection -User toggles backend in the UI: `claude → gemini → local` (cycles). The active backend is stored server-side; the UI reflects it with color coding (default / green / amber). +### Default: Role-Based Routing (Auto) -When local is active, the active model name appears below the toggle button. 
+When no explicit backend is selected, Cortex routes to the model configured for the +request's **role** in the user's model registry. Roles: `chat`, `orchestrator`, `distill`, +`coder`, `research` (extensible via `DEFINED_ROLES` in `.env`). -**Fallback chain** (automatic, on error): +Resolution order for a role: +1. User registry: `roles[role].primary → backup_1 → backup_2 → backup_3 → backup_4` +2. `.env` role default: `ROLE_CHAT=claude_cli`, `ROLE_DISTILL=gemini_api`, etc. +3. Hardcoded last-resort: `chat/distill/coder → claude_cli`, `orchestrator/research → gemini_api` + +### Explicit Override + +The UI backend toggle cycles: **auto → claude → gemini → local → auto** + +- **auto** (default): role-based routing as above; sends `model: null` to `/chat` +- **claude / gemini / local**: bypasses role routing; forces that specific backend +- When "local" is active, the configured model name appears below the toggle button + +**Fallback chain** (automatic, on any error): ``` claude → gemini gemini → claude local → claude ``` +Each response includes a model label (bottom-right of the message bubble) showing what +actually responded. Amber label with `⚡` = fallback was used. + Auth expiry on Claude triggers a UI banner + `claude_auth_expired` SSE event. --- +## Model Registry + +Per-user configuration stored in `home/{user}/model_registry.json`. + +Hosts and models are managed at **Settings → Model Registry** (`/settings/local`). 
+ +### Schema + +```json +{ + "version": 1, + "hosts": [ + { + "id": "abc123", + "label": "Home ML Laptop", + "api_url": "http://192.168.x.x:3000", + "api_key": "sk-...", + "host_type": "openwebui" + } + ], + "models": [ + { + "id": "def456", + "type": "local_openai", + "label": "Gemma Medium", + "model_name": "agent-support-gemma-medium", + "host_id": "abc123", + "context_k": 50, + "tags": ["chat", "fast"] + } + ], + "roles": { + "chat": { + "primary": "def456", + "backup_1": "claude_cli" + } + } +} +``` + +### host_type + +Controls which API path layout is used: + +| `host_type` | Chat endpoint | Models endpoint | Use for | +|---|---|---|---| +| `openwebui` (default) | `POST {url}/api/chat/completions` | `GET {url}/api/models` | Open WebUI, Ollama | +| `openai` | `POST {url}/chat/completions` | `GET {url}/models` | OpenRouter, LiteLLM, Anthropic-compat | + +Set `api_url` to the base path ending just before `/chat/completions`: +- OpenRouter: `https://openrouter.ai/api/v1` +- LiteLLM proxy: `http://host:port` + +### Built-in model IDs + +Always resolvable without a registry entry: + +| ID | Backend | +|---|---| +| `claude_cli` | Claude CLI subprocess | +| `gemini_cli` | Gemini CLI subprocess | +| `gemini_api` | Gemini API (SDK) — orchestrator only | + +--- + ## Claude Backend (`_claude()`) Runs `claude --print --no-session-persistence --output-format text` as a subprocess. - System prompt passed via `--system-prompt` - Conversation history formatted as `` block -- Token read live from `~/.claude/.credentials.json` on every call — never relies on the env var, which goes stale after `claude auth login` -- Model override via `--model` flag (e.g. 
`claude-opus-4-6`) +- Token read live from `~/.claude/.credentials.json` on every call — never relies on the + env var, which goes stale after `claude auth login` +- Model override via `--model` flag when a specific `model_name` is configured in the registry Timeout: `TIMEOUT_CLAUDE=60` seconds (`.env`) @@ -51,7 +134,7 @@ Timeout: `TIMEOUT_CLAUDE=60` seconds (`.env`) Runs `gemini --output-format text --extensions "" -p ` as a subprocess. -- `--extensions ""` disables all MCP extensions — prevents child processes from keeping pipes open after responding +- `--extensions ""` disables all MCP extensions — prevents child processes keeping pipes open - `start_new_session=True` puts the process in its own group for clean `os.killpg` on timeout - Output is cleaned to strip CLI noise lines (loading messages, retry notices, quota warnings) @@ -61,46 +144,33 @@ Timeout: `TIMEOUT_GEMINI=120` seconds (`.env`) ## Local Backend (`_local()`) -HTTP POST to Open WebUI's OpenAI-compatible endpoint: `{api_url}/api/chat/completions`. +HTTP POST to an OpenAI-compatible endpoint. Model config is resolved via the model registry. -Per-user config in `home/{user}/local_llm.json`: -```json -{ - "hosts": [{"id": "...", "label": "scott_gaming", "api_url": "http://192.168.32.19:3000", "api_key": "sk-..."}], - "models": [{"id": "...", "host_id": "...", "label": "Gemma 4 Small", "model_name": "agent-support-gemma-small"}], - "active_model_id": "..." -} +```python +# host_type "openwebui": POST {api_url}/api/chat/completions +# host_type "openai": POST {api_url}/chat/completions ``` -Resolution order for active model: -1. User's `active_model_id` in `local_llm.json` -2. `.env` server defaults (`LOCAL_API_URL` / `LOCAL_MODEL`) -3. Error — user is prompted to configure at `/settings/local` - Timeout: `TIMEOUT_LOCAL=300` seconds (`.env`) — local models may need to load from disk. 
-**Manage at:** `/settings/local` — supports multiple hosts and models per user, "Fetch from host" button to populate model list from the server. +--- + +## Distillation + +Memory distillation uses `role="distill"` for mid and long passes. Configure the distill +model via the Model Registry → Role Assignments → Distill role. + +`.env` override: `ROLE_DISTILL=claude_cli` (default). Set to any built-in ID or leave blank +to fall through to the hardcoded default (`claude_cli`). --- -## Distillation Backends +## Code locations -Memory distillation runs on a schedule and uses the LLM for mid and long distill passes. By default uses the primary backend (`claude`). Override in `.env`: - -``` -DISTILL_BACKEND_MID=local # saves API credits — Gemma handles summarization well -DISTILL_BACKEND_LONG= # empty = use primary (claude recommended for quality) -``` - ---- - -## Current Local Models (scott_gaming, 8 GB VRAM) - -| Model | Alias | Speed | Practical Context | -|---|---|---|---| -| Gemma 4 E4B | `agent-support-gemma-small` | ~25 t/s | **72k tokens** | -| Gemma 4 26B A4B (MoE) | `agent-support-gemma-medium` | ~9 t/s | **50k tokens** | - -Both support OpenAI `tools` / `tool_choice` function calling — required for the local orchestrator. - -Full Open WebUI API reference: [`docs/OPEN_WEBUI_API.md`](../docs/OPEN_WEBUI_API.md) +| File | Responsibility | +|---|---| +| `cortex/llm_client.py` | `complete()` — routing, dispatch, fallback | +| `cortex/model_registry.py` | Per-user registry CRUD and resolution | +| `cortex/routers/local_llm.py` | Settings UI routes + `/api/models/role` AJAX | +| `cortex/routers/chat.py` | `_backend_label()`, `fallback_used` flag | +| `cortex/config.py` | `ROLE_*` env defaults, `DEFINED_ROLES`, `PRIMARY_BACKEND` |