feat: OpenAI-compatible orchestrator + backend auto-routing
- openai_orchestrator.py — new ReAct tool loop engine for any OpenAI-compatible endpoint (OpenRouter, Open WebUI, Ollama, LiteLLM); model handles both tool loop and final response, no Claude handoff needed
- tools/__init__.py — auto-derive OpenAI JSON Schema from existing Gemini FunctionDeclarations so tool definitions have a single source of truth
- routers/orchestrator.py — route to openai_orchestrator when model registry "orchestrator" role resolves to a local_openai type host
- routers/chat.py — pass role to _backend_label(); fix fallback_used logic (only meaningful for explicit backend overrides, not auto-routing)
- static/app.js — add null/"auto" to backend cycle; fetch local model hint without overriding the auto default on page load
- model_registry.py — _normalize() back-fills host_type on old registry files
- requirements.txt — add openai>=1.0.0
- ARCH__BACKENDS.md — document OpenAI-compat backend and routing logic

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -122,13 +122,20 @@ def _empty() -> dict:
|
|||||||
return {"version": 1, "hosts": [], "models": [], "roles": {}}
|
return {"version": 1, "hosts": [], "models": [], "roles": {}}
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize(data: dict) -> dict:
    """Back-fill fields that were added to the registry schema after a file was written.

    Currently the only back-filled field is ``host_type`` on each host entry,
    which defaults to ``"openwebui"`` when absent. Existing values are kept.

    The registry file is plain JSON that may have been edited by hand, so
    malformed shapes are tolerated rather than raised on: a ``hosts`` value
    that is not a list, or a host entry that is not a dict, is left untouched.

    Args:
        data: Parsed registry dict. It is modified in place.

    Returns:
        The same ``data`` object, for call chaining.
    """
    hosts = data.get("hosts")
    if not isinstance(hosts, list):
        # Missing, null, or otherwise malformed — nothing to back-fill.
        return data

    for host in hosts:
        if isinstance(host, dict):
            host.setdefault("host_type", "openwebui")
    return data
|
||||||
|
|
||||||
|
|
||||||
def _load(username: str) -> dict:
|
def _load(username: str) -> dict:
|
||||||
path = _registry_path(username)
|
path = _registry_path(username)
|
||||||
if path.exists():
|
if path.exists():
|
||||||
try:
|
try:
|
||||||
data = json.loads(path.read_text())
|
data = json.loads(path.read_text())
|
||||||
if isinstance(data, dict) and "version" in data:
|
if isinstance(data, dict) and "version" in data:
|
||||||
return data
|
return _normalize(data)
|
||||||
except (json.JSONDecodeError, OSError):
|
except (json.JSONDecodeError, OSError):
|
||||||
logger.warning("model_registry.json for %s is unreadable — starting fresh", username)
|
logger.warning("model_registry.json for %s is unreadable — starting fresh", username)
|
||||||
return _empty()
|
return _empty()
|
||||||
|
|||||||
196
cortex/openai_orchestrator.py
Normal file
196
cortex/openai_orchestrator.py
Normal file
@@ -0,0 +1,196 @@
|
|||||||
|
"""
|
||||||
|
OpenAI-compatible orchestrator engine.
|
||||||
|
|
||||||
|
Implements the same ReAct tool loop as orchestrator_engine.py but uses the
|
||||||
|
OpenAI tool calling format, which works with any OpenAI-compatible endpoint:
|
||||||
|
OpenRouter, LiteLLM, Open WebUI, Ollama (tool-capable models), etc.
|
||||||
|
|
||||||
|
The model both runs the tool loop AND writes the final user-facing response —
|
||||||
|
no separate handoff step needed when a single capable model handles everything.
|
||||||
|
|
||||||
|
Flow:
|
||||||
|
1. POST to {api_url}/chat/completions with tools + user message
|
||||||
|
2. If finish_reason == "tool_calls": execute tools, feed results back, repeat
|
||||||
|
3. If finish_reason == "stop": final assistant message is the user-facing response
|
||||||
|
|
||||||
|
Used when the "orchestrator" role in the model registry resolves to a local_openai
|
||||||
|
type model. The Gemini engine (orchestrator_engine.py) is used otherwise.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from openai import AsyncOpenAI
|
||||||
|
|
||||||
|
from config import settings
|
||||||
|
from orchestrator_engine import OrchestratorResult
|
||||||
|
from tools import OPENAI_TOOL_SCHEMAS, call_tool
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Appended to the persona system prompt so the model knows it has tools.
|
||||||
|
# Kept brief — capable models handle tool use without much coaching.
|
||||||
|
_TOOL_INSTRUCTION = (
|
||||||
|
"\n\nYou have access to tools. Use them when you need current information, "
|
||||||
|
"need to read files, or need to take actions on the user's behalf. "
|
||||||
|
"Respond naturally after gathering what you need."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def run(
    task: str,
    system_prompt: str = "",
    session_messages: list[dict] | None = None,
    model_cfg: dict | None = None,
    respond_with_final: bool = True,
) -> OrchestratorResult:
    """
    Run a tool-enabled task using an OpenAI-compatible API.

    The model is called repeatedly: whenever a reply carries tool calls, the
    tools are executed concurrently and their results are appended to the
    conversation; the first reply without tool calls becomes the final
    response. The loop is bounded by ``settings.orchestrator_max_rounds``.

    Args:
        task: The user's request (plain text)
        system_prompt: Persona system prompt from context_loader (passed through)
        session_messages: Recent conversation history for session continuity;
            only the last 6 entries are sent
        model_cfg: Resolved model config from model_registry (local_openai type).
            Must provide ``api_url`` and ``model_name``; ``api_key`` and
            ``label`` are optional.
        respond_with_final: Accepted for signature parity with the Gemini
            engine. NOTE(review): this function does not currently read it —
            the same model always writes the final response.

    Returns:
        OrchestratorResult — same shape as the Gemini engine for drop-in compatibility

    Raises:
        RuntimeError: if ``model_cfg`` is missing or incomplete, or the
            endpoint returns a completion with no choices.
    """
    if not model_cfg:
        raise RuntimeError("model_cfg is required for the OpenAI orchestrator")

    api_url = model_cfg.get("api_url", "")
    # Placeholder key for hosts configured without one.
    api_key = model_cfg.get("api_key", "") or "none"
    model_name = model_cfg.get("model_name", "")

    if not api_url or not model_name:
        # Identify the model by label only: interpolating the whole config
        # would put the API key into the exception text and any log of it.
        raise RuntimeError(
            "model_cfg missing api_url or model_name: "
            f"{model_cfg.get('label') or '<unlabelled model>'}"
        )

    client = AsyncOpenAI(base_url=api_url, api_key=api_key)

    # System prompt: persona context + brief tool instruction
    sys_content = (system_prompt or "") + _TOOL_INSTRUCTION

    # Build messages: [system, ...recent_session, current_task]
    messages: list[dict] = [{"role": "system", "content": sys_content}]
    if session_messages:
        messages.extend(session_messages[-6:])  # last 3 turns for context
    messages.append({"role": "user", "content": task})

    tool_call_log: list[dict] = []
    final_response = ""
    max_rounds = settings.orchestrator_max_rounds

    for round_num in range(max_rounds):
        logger.info("OpenAI orchestrator round %d / %d  model=%s",
                    round_num + 1, max_rounds, model_name)

        # The tools array is re-sent on every request.
        response = await client.chat.completions.create(
            model=model_name,
            messages=messages,
            tools=OPENAI_TOOL_SCHEMAS,
            tool_choice="auto",
        )

        if not response.choices:
            raise RuntimeError(
                f"OpenAI-compatible endpoint returned no choices (model={model_name})"
            )
        msg = response.choices[0].message
        tool_calls = msg.tool_calls or []

        # Append the assistant turn (MUST include tool_calls if present so the
        # next request is valid — the tool messages below refer to these ids).
        assistant_msg: dict = {"role": "assistant"}
        if msg.content:
            assistant_msg["content"] = msg.content
        if tool_calls:
            assistant_msg["tool_calls"] = [
                {
                    "id": tc.id,
                    "type": "function",
                    "function": {
                        "name": tc.function.name,
                        "arguments": tc.function.arguments,
                    },
                }
                for tc in tool_calls
            ]
        messages.append(assistant_msg)

        if not tool_calls:
            # No tool calls requested — the model is done.
            final_response = msg.content or ""
            logger.info(
                "OpenAI orchestrator done after %d round(s). Tools used: %d",
                round_num + 1, len(tool_call_log),
            )
            break

        # Decide on the presence of tool calls rather than on
        # finish_reason == "tool_calls": a reply that carries tool calls under
        # a different finish reason must still be executed, otherwise the
        # calls are silently dropped and an empty answer is returned.
        results = await asyncio.gather(
            *(_execute_tool(tc.function.name, tc.function.arguments) for tc in tool_calls),
            return_exceptions=True,
        )

        for tc, result in zip(tool_calls, results):
            result_str = (
                str(result)
                if not isinstance(result, Exception)
                else f"Tool error: {result}"
            )
            logger.info("Tool %s → %d chars", tc.function.name, len(result_str))

            # Keep unparseable arguments in the log verbatim for debugging.
            try:
                args_parsed = json.loads(tc.function.arguments)
            except json.JSONDecodeError:
                args_parsed = {"raw": tc.function.arguments}

            tool_call_log.append({
                "tool": tc.function.name,
                "args": args_parsed,
                "result": result_str,
            })

            # One tool message per call, matched to the request by id.
            messages.append({
                "role": "tool",
                "tool_call_id": tc.id,
                "content": result_str,
            })

    else:
        # Hit the round limit without a final answer: summarise what was gathered.
        logger.warning("OpenAI orchestrator hit max rounds (%d)", max_rounds)
        final_response = (
            f"Reached the tool iteration limit ({max_rounds} rounds). "
            "Here is what was gathered:\n\n"
            + "\n\n".join(
                f"**{t['tool']}**: {t['result'][:500]}" for t in tool_call_log
            )
        )

    model_label = model_cfg.get("label") or model_name
    logger.info("OpenAI orchestrator complete — model=%s tools=%d", model_label, len(tool_call_log))

    return OrchestratorResult(
        response=final_response,
        tool_calls=tool_call_log,
        backend="local",
        gemini_summary=final_response,  # reused for UI display; same content in single-model mode
    )
|
||||||
|
|
||||||
|
|
||||||
|
async def _execute_tool(name: str, arguments_json: str) -> str:
    """Decode a tool call's JSON arguments, run the tool, and return its text result.

    Failures are reported as a ``"Tool error: ..."`` string instead of being
    raised, so the caller can feed the message back to the model.

    Args:
        name: Tool name as emitted by the model.
        arguments_json: Raw JSON-encoded arguments from the tool call. An
            empty/blank string or JSON ``null`` means "no arguments".

    Returns:
        The tool's result, or a ``"Tool error: ..."`` description.
    """
    if arguments_json and arguments_json.strip():
        try:
            args = json.loads(arguments_json)
        except json.JSONDecodeError as exc:
            # Do not run the tool with guessed (empty) arguments: report the
            # problem so the model can re-issue the call with valid JSON.
            return f"Tool error: arguments for {name} are not valid JSON ({exc})"
        if args is None:
            args = {}
        elif not isinstance(args, dict):
            return (
                f"Tool error: arguments for {name} must be a JSON object, "
                f"got {type(args).__name__}"
            )
    else:
        args = {}

    try:
        return await call_tool(name, args)
    except Exception as e:
        # Deliberate catch-all: one failing tool must not abort the whole loop.
        logger.warning("Tool %s failed: %s", name, e)
        return f"Tool error: {e}"
|
||||||
@@ -19,5 +19,8 @@ python-multipart>=0.0.9 # required by FastAPI for Form() data
|
|||||||
# Async HTTP client — used for local OpenAI-compatible backend (Open WebUI / Ollama)
|
# Async HTTP client — used for local OpenAI-compatible backend (Open WebUI / Ollama)
|
||||||
httpx>=0.27.0
|
httpx>=0.27.0
|
||||||
|
|
||||||
|
# OpenAI-compatible client — tool calling for OpenRouter / LiteLLM / any OAI-compat host
|
||||||
|
openai>=1.0.0
|
||||||
|
|
||||||
# anthropic SDK not needed — using claude CLI subprocess for auth
|
# anthropic SDK not needed — using claude CLI subprocess for auth
|
||||||
# anthropic>=0.40.0
|
# anthropic>=0.40.0
|
||||||
|
|||||||
@@ -18,14 +18,14 @@ import event_bus
|
|||||||
router = APIRouter()
|
router = APIRouter()
|
||||||
|
|
||||||
|
|
||||||
def _backend_label(backend: str, username: str) -> str:
|
def _backend_label(backend: str, username: str, role: str = "chat") -> str:
|
||||||
"""Human-readable label for the model that handled a request."""
|
"""Human-readable label for the model that handled a request."""
|
||||||
if backend == "claude":
|
if backend == "claude":
|
||||||
return "Claude"
|
return "Claude"
|
||||||
if backend == "gemini":
|
if backend == "gemini":
|
||||||
return "Gemini"
|
return "Gemini"
|
||||||
if backend == "local":
|
if backend == "local":
|
||||||
cfg = model_registry.get_best_local_model(username)
|
cfg = model_registry.get_best_local_model(username, role)
|
||||||
if cfg:
|
if cfg:
|
||||||
return cfg.get("label") or cfg.get("model_name") or "Local"
|
return cfg.get("label") or cfg.get("model_name") or "Local"
|
||||||
return "Local"
|
return "Local"
|
||||||
@@ -113,14 +113,16 @@ async def _stream_chat(req: ChatRequest):
|
|||||||
if not req.off_record:
|
if not req.off_record:
|
||||||
log_turn(session_id, req.message, response_text)
|
log_turn(session_id, req.message, response_text)
|
||||||
|
|
||||||
requested = req.model or settings.primary_backend
|
# fallback_used only makes sense for explicit backend selections.
|
||||||
|
# In auto mode (req.model is None), just report what responded.
|
||||||
|
fallback_used = bool(req.model and actual_backend != req.model)
|
||||||
payload = {
|
payload = {
|
||||||
"type": "response",
|
"type": "response",
|
||||||
"response": response_text,
|
"response": response_text,
|
||||||
"session_id": session_id,
|
"session_id": session_id,
|
||||||
"backend": actual_backend,
|
"backend": actual_backend,
|
||||||
"backend_label": _backend_label(actual_backend, user),
|
"backend_label": _backend_label(actual_backend, user, role="chat"),
|
||||||
"fallback_used": actual_backend != requested,
|
"fallback_used": fallback_used,
|
||||||
}
|
}
|
||||||
yield f"data: {json.dumps(payload)}\n\n"
|
yield f"data: {json.dumps(payload)}\n\n"
|
||||||
|
|
||||||
|
|||||||
@@ -22,7 +22,9 @@ from auth_utils import get_user_gemini_key
|
|||||||
from config import settings
|
from config import settings
|
||||||
from context_loader import load_context
|
from context_loader import load_context
|
||||||
from persona import set_context, validate as validate_persona
|
from persona import set_context, validate as validate_persona
|
||||||
|
import model_registry
|
||||||
import orchestrator_engine
|
import orchestrator_engine
|
||||||
|
import openai_orchestrator
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
router = APIRouter(prefix="/orchestrate", tags=["orchestrator"])
|
router = APIRouter(prefix="/orchestrate", tags=["orchestrator"])
|
||||||
@@ -157,13 +159,25 @@ async def _run_job(job_id: str, req: OrchestrateRequest, user: str) -> None:
|
|||||||
history = load_session(session_id)
|
history = load_session(session_id)
|
||||||
session_messages = history or None
|
session_messages = history or None
|
||||||
|
|
||||||
result = await orchestrator_engine.run(
|
# Choose engine based on the orchestrator role in the model registry
|
||||||
task=req.task,
|
orch_model = model_registry.get_model_for_role(user, "orchestrator")
|
||||||
system_prompt=system_prompt,
|
|
||||||
session_messages=session_messages,
|
if orch_model and orch_model.get("type") == "local_openai":
|
||||||
respond_with_claude=req.respond_with_claude,
|
result = await openai_orchestrator.run(
|
||||||
gemini_api_key=get_user_gemini_key(user),
|
task=req.task,
|
||||||
)
|
system_prompt=system_prompt,
|
||||||
|
session_messages=session_messages,
|
||||||
|
model_cfg=orch_model,
|
||||||
|
respond_with_final=req.respond_with_claude,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
result = await orchestrator_engine.run(
|
||||||
|
task=req.task,
|
||||||
|
system_prompt=system_prompt,
|
||||||
|
session_messages=session_messages,
|
||||||
|
respond_with_claude=req.respond_with_claude,
|
||||||
|
gemini_api_key=get_user_gemini_key(user),
|
||||||
|
)
|
||||||
|
|
||||||
# Save the turn to the session store so it survives a page refresh
|
# Save the turn to the session store so it survives a page refresh
|
||||||
history.append({"role": "user", "content": req.task})
|
history.append({"role": "user", "content": req.task})
|
||||||
|
|||||||
@@ -84,7 +84,7 @@
|
|||||||
if (helpLink) helpLink.href = `/help?persona=${encodeURIComponent(CORTEX_PERSONA)}`;
|
if (helpLink) helpLink.href = `/help?persona=${encodeURIComponent(CORTEX_PERSONA)}`;
|
||||||
|
|
||||||
let sessionId = null;
|
let sessionId = null;
|
||||||
let primaryBackend = 'claude';
|
let primaryBackend = null; // null = auto / role-based routing
|
||||||
let activeController = null;
|
let activeController = null;
|
||||||
let currentHistory = []; // mirrors backend session [{role, content}, ...]
|
let currentHistory = []; // mirrors backend session [{role, content}, ...]
|
||||||
let talkThinkingDiv = null; // pending "thinking…" bubble for live Talk updates
|
let talkThinkingDiv = null; // pending "thinking…" bubble for live Talk updates
|
||||||
@@ -340,23 +340,30 @@
|
|||||||
}
|
}
|
||||||
|
|
||||||
// ── Backend toggle ───────────────────────────────────────────
|
// ── Backend toggle ───────────────────────────────────────────
|
||||||
|
// null = "auto" — uses role-based routing from model registry
|
||||||
|
// 'claude' / 'gemini' / 'local' = explicit override
|
||||||
|
|
||||||
fetch('/backend').then(r => r.json()).then(d => setBackendUI(d));
|
// On load only fetch local_model hint; don't override primaryBackend default (null)
|
||||||
|
fetch('/backend').then(r => r.json()).then(d => {
|
||||||
|
if (backendModelHint && d.local_model) {
|
||||||
|
// Pre-fill hint in case user is already in local mode
|
||||||
|
backendModelHint.textContent = d.local_model.label || d.local_model.model_name;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
const BACKEND_CYCLE = ['claude', 'gemini', 'local'];
|
const BACKEND_CYCLE = [null, 'claude', 'gemini', 'local'];
|
||||||
const BACKEND_CLASS = { claude: '', gemini: 'mem-on', local: 'local-on' };
|
const BACKEND_CLASS = { claude: '', gemini: 'mem-on', local: 'local-on' };
|
||||||
const backendModelHint = document.getElementById('backend-model-hint');
|
const backendModelHint = document.getElementById('backend-model-hint');
|
||||||
|
|
||||||
function setBackendUI(d) {
|
function setBackendUI(backend, localModel) {
|
||||||
const backend = d.primary || d; // accept full response obj or bare string
|
|
||||||
primaryBackend = backend;
|
primaryBackend = backend;
|
||||||
backendToggle.textContent = backend;
|
backendToggle.textContent = backend === null ? 'auto' : backend;
|
||||||
const extra = BACKEND_CLASS[backend] || '';
|
const extra = backend === null ? '' : (BACKEND_CLASS[backend] || '');
|
||||||
backendToggle.className = 'ctx-btn' + (extra ? ' ' + extra : '');
|
backendToggle.className = 'ctx-btn' + (extra ? ' ' + extra : '');
|
||||||
|
|
||||||
if (backendModelHint) {
|
if (backendModelHint) {
|
||||||
if (backend === 'local' && d.local_model) {
|
if (backend === 'local' && localModel) {
|
||||||
backendModelHint.textContent = d.local_model.label || d.local_model.model_name;
|
backendModelHint.textContent = localModel.label || localModel.model_name;
|
||||||
backendModelHint.style.display = '';
|
backendModelHint.style.display = '';
|
||||||
} else {
|
} else {
|
||||||
backendModelHint.textContent = '';
|
backendModelHint.textContent = '';
|
||||||
@@ -365,17 +372,26 @@
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Initialize to auto mode
|
||||||
|
setBackendUI(null, null);
|
||||||
|
|
||||||
backendToggle.addEventListener('click', async () => {
|
backendToggle.addEventListener('click', async () => {
|
||||||
const idx = BACKEND_CYCLE.indexOf(primaryBackend);
|
const idx = BACKEND_CYCLE.indexOf(primaryBackend);
|
||||||
const next = BACKEND_CYCLE[(idx + 1) % BACKEND_CYCLE.length];
|
const next = BACKEND_CYCLE[(idx + 1) % BACKEND_CYCLE.length];
|
||||||
const res = await fetch('/backend', {
|
if (next === null) {
|
||||||
method: 'POST',
|
// Auto: role-based routing — no server call needed
|
||||||
headers: { 'Content-Type': 'application/json' },
|
setBackendUI(null, null);
|
||||||
body: JSON.stringify({ primary: next }),
|
addMessage('system', 'Backend: auto (role-based routing)');
|
||||||
});
|
} else {
|
||||||
const d = await res.json();
|
const res = await fetch('/backend', {
|
||||||
setBackendUI(d);
|
method: 'POST',
|
||||||
addMessage('system', `Backend: ${d.primary} (fallback: ${d.fallback})`);
|
headers: { 'Content-Type': 'application/json' },
|
||||||
|
body: JSON.stringify({ primary: next }),
|
||||||
|
});
|
||||||
|
const d = await res.json();
|
||||||
|
setBackendUI(next, d.local_model);
|
||||||
|
addMessage('system', `Backend: ${next} (fallback: ${d.fallback})`);
|
||||||
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
// ── Sessions panel ───────────────────────────────────────────
|
// ── Sessions panel ───────────────────────────────────────────
|
||||||
@@ -917,42 +933,15 @@
|
|||||||
if (activeController) activeController.abort();
|
if (activeController) activeController.abort();
|
||||||
});
|
});
|
||||||
|
|
||||||
async function sendMessage() {
|
// ── Chat fetch + SSE handler ─────────────────────────────────
|
||||||
const text = inputEl.value.trim();
|
// Extracted so the retry button can call it without re-adding the
|
||||||
if (!text || activeController) return;
|
// user message to the DOM or currentHistory.
|
||||||
|
async function _doSend(payload, thinkingDiv) {
|
||||||
inputEl.value = '';
|
|
||||||
syncHeight();
|
|
||||||
sendBtn.style.display = 'none';
|
|
||||||
stopBtn.style.display = 'flex';
|
|
||||||
headerEmoji.classList.add('processing');
|
|
||||||
|
|
||||||
activeController = new AbortController();
|
|
||||||
|
|
||||||
const userHistIdx = currentHistory.length;
|
|
||||||
currentHistory.push({ role: 'user', content: text });
|
|
||||||
const userMsgDiv = addMessage('user', text);
|
|
||||||
attachHistoryControls(userMsgDiv, userHistIdx);
|
|
||||||
scrollToBottom();
|
|
||||||
|
|
||||||
const thinkingDiv = addMessage('assistant thinking', '✨ thinking…');
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const res = await fetch('/chat', {
|
const res = await fetch('/chat', {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
headers: { 'Content-Type': 'application/json' },
|
headers: { 'Content-Type': 'application/json' },
|
||||||
body: JSON.stringify({
|
body: JSON.stringify(payload),
|
||||||
message: text,
|
|
||||||
session_id: sessionId,
|
|
||||||
tier: currentTier,
|
|
||||||
include_long: memLong,
|
|
||||||
include_mid: memMid,
|
|
||||||
include_short: memShort,
|
|
||||||
off_record: current_mode === 'otr',
|
|
||||||
model: primaryBackend,
|
|
||||||
user: CORTEX_USER,
|
|
||||||
persona: CORTEX_PERSONA,
|
|
||||||
}),
|
|
||||||
signal: activeController.signal,
|
signal: activeController.signal,
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -1004,10 +993,77 @@
|
|||||||
thinkingDiv.className = 'message system';
|
thinkingDiv.className = 'message system';
|
||||||
thinkingDiv.textContent = 'Stopped.';
|
thinkingDiv.textContent = 'Stopped.';
|
||||||
} else {
|
} else {
|
||||||
|
// Show error + retry button
|
||||||
thinkingDiv.className = 'message error';
|
thinkingDiv.className = 'message error';
|
||||||
thinkingDiv.textContent = `Error: ${err.message}`;
|
thinkingDiv.innerHTML = '';
|
||||||
|
|
||||||
|
const errSpan = document.createElement('span');
|
||||||
|
errSpan.textContent = `Error: ${err.message}`;
|
||||||
|
thinkingDiv.appendChild(errSpan);
|
||||||
|
|
||||||
|
const retryBtn = document.createElement('button');
|
||||||
|
retryBtn.className = 'retry-btn';
|
||||||
|
retryBtn.textContent = '↺ Retry';
|
||||||
|
retryBtn.addEventListener('click', async () => {
|
||||||
|
// Roll back the failed user push, re-push, and try again
|
||||||
|
if (currentHistory.at(-1)?.role === 'user') currentHistory.pop();
|
||||||
|
currentHistory.push({ role: 'user', content: payload.message });
|
||||||
|
|
||||||
|
thinkingDiv.className = 'message assistant thinking';
|
||||||
|
thinkingDiv.textContent = '✨ thinking…';
|
||||||
|
|
||||||
|
activeController = new AbortController();
|
||||||
|
sendBtn.style.display = 'none';
|
||||||
|
stopBtn.style.display = 'flex';
|
||||||
|
headerEmoji.classList.add('processing');
|
||||||
|
|
||||||
|
await _doSend(payload, thinkingDiv);
|
||||||
|
|
||||||
|
activeController = null;
|
||||||
|
headerEmoji.classList.remove('processing');
|
||||||
|
sendBtn.style.display = 'block';
|
||||||
|
stopBtn.style.display = 'none';
|
||||||
|
inputEl.focus();
|
||||||
|
});
|
||||||
|
thinkingDiv.appendChild(retryBtn);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async function sendMessage() {
|
||||||
|
const text = inputEl.value.trim();
|
||||||
|
if (!text || activeController) return;
|
||||||
|
|
||||||
|
inputEl.value = '';
|
||||||
|
syncHeight();
|
||||||
|
sendBtn.style.display = 'none';
|
||||||
|
stopBtn.style.display = 'flex';
|
||||||
|
headerEmoji.classList.add('processing');
|
||||||
|
|
||||||
|
activeController = new AbortController();
|
||||||
|
|
||||||
|
const userHistIdx = currentHistory.length;
|
||||||
|
currentHistory.push({ role: 'user', content: text });
|
||||||
|
const userMsgDiv = addMessage('user', text);
|
||||||
|
attachHistoryControls(userMsgDiv, userHistIdx);
|
||||||
|
scrollToBottom();
|
||||||
|
|
||||||
|
const thinkingDiv = addMessage('assistant thinking', '✨ thinking…');
|
||||||
|
|
||||||
|
const payload = {
|
||||||
|
message: text,
|
||||||
|
session_id: sessionId,
|
||||||
|
tier: currentTier,
|
||||||
|
include_long: memLong,
|
||||||
|
include_mid: memMid,
|
||||||
|
include_short: memShort,
|
||||||
|
off_record: current_mode === 'otr',
|
||||||
|
model: primaryBackend,
|
||||||
|
user: CORTEX_USER,
|
||||||
|
persona: CORTEX_PERSONA,
|
||||||
|
};
|
||||||
|
|
||||||
|
await _doSend(payload, thinkingDiv);
|
||||||
|
|
||||||
activeController = null;
|
activeController = null;
|
||||||
headerEmoji.classList.remove('processing');
|
headerEmoji.classList.remove('processing');
|
||||||
|
|||||||
@@ -565,6 +565,26 @@
|
|||||||
}
|
}
|
||||||
.model-tag.fallback { color: #f59e0b; }
|
.model-tag.fallback { color: #f59e0b; }
|
||||||
|
|
||||||
|
/* Retry button — shown in error message bubbles */
|
||||||
|
.retry-btn {
|
||||||
|
display: inline-block;
|
||||||
|
margin-top: 0.6rem;
|
||||||
|
margin-left: 0.15rem;
|
||||||
|
padding: 0.25rem 0.7rem;
|
||||||
|
font-size: 0.78rem;
|
||||||
|
font-family: inherit;
|
||||||
|
background: transparent;
|
||||||
|
color: var(--error-text);
|
||||||
|
border: 1px solid var(--error-border);
|
||||||
|
border-radius: 4px;
|
||||||
|
cursor: pointer;
|
||||||
|
transition: background 0.15s, color 0.15s;
|
||||||
|
}
|
||||||
|
.retry-btn:hover {
|
||||||
|
background: var(--error-border);
|
||||||
|
color: #fff;
|
||||||
|
}
|
||||||
|
|
||||||
/* Note messages */
|
/* Note messages */
|
||||||
.message.note-private {
|
.message.note-private {
|
||||||
align-self: flex-end;
|
align-self: flex-end;
|
||||||
|
|||||||
@@ -551,3 +551,61 @@ async def call_tool(name: str, args: dict) -> str:
|
|||||||
if fn is None:
|
if fn is None:
|
||||||
return f"Unknown tool: {name}"
|
return f"Unknown tool: {name}"
|
||||||
return await fn(**args)
|
return await fn(**args)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# OpenAI JSON Schema format — auto-derived from the Gemini declarations above
|
||||||
|
# so there is a single source of truth for tool definitions.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Gemini schema type names mapped to their JSON Schema equivalents.
_GEMINI_TYPE_TO_JSON = {
    "OBJECT": "object",
    "STRING": "string",
    "INTEGER": "integer",
    "NUMBER": "number",
    "BOOLEAN": "boolean",
    "ARRAY": "array",
}


def _schema_to_json(schema) -> dict:
    """Recursively convert a Gemini types.Schema to a JSON Schema dict.

    Only attributes that are present and non-empty on ``schema`` are copied:
    ``type``, ``description``, ``enum``, ``properties`` (objects), ``items``
    (arrays) and ``required``. Every attribute is read with ``getattr`` so a
    schema object lacking some of them is still accepted.

    Args:
        schema: Schema-like object. ``type`` may be an enum member (its
            ``name`` is used) or a plain string in either case; a missing or
            unknown type falls back to ``"string"``.

    Returns:
        A JSON Schema dict; objects always carry a ``properties`` key.
    """
    raw_type = getattr(schema, "type", None)
    type_name = getattr(raw_type, "name", None)
    if type_name is None and isinstance(raw_type, str):
        # Type supplied as a bare string rather than an enum member.
        type_name = raw_type
    json_type = _GEMINI_TYPE_TO_JSON.get(str(type_name or "STRING").upper(), "string")
    result: dict = {"type": json_type}

    description = getattr(schema, "description", None)
    if description:
        result["description"] = description

    enum_values = getattr(schema, "enum", None)
    if enum_values:
        result["enum"] = list(enum_values)

    if json_type == "object":
        props = getattr(schema, "properties", None) or {}
        result["properties"] = {k: _schema_to_json(v) for k, v in props.items()}
    elif json_type == "array":
        # Carry the element schema across; without it the array's element
        # type is lost in the converted tool definition.
        items = getattr(schema, "items", None)
        if items is not None:
            result["items"] = _schema_to_json(items)

    req = getattr(schema, "required", None)
    if req:
        result["required"] = list(req)

    return result
|
||||||
|
|
||||||
|
|
||||||
|
def _build_openai_tools() -> list[dict]:
    """Convert TOOL_DECLARATIONS (Gemini format) to OpenAI tool schemas.

    Every entry of ``TOOL_DECLARATIONS`` is walked, not just the first, so
    declarations split across several entries are all exported. An empty list
    or an entry without function declarations yields no tools instead of
    raising at import time.

    Returns:
        One ``{"type": "function", "function": {...}}`` dict per declared
        function, in declaration order. A function without parameters gets an
        empty object schema.
    """
    tools: list[dict] = []
    for tool in TOOL_DECLARATIONS:
        for decl in getattr(tool, "function_declarations", None) or []:
            if decl.parameters:
                params = _schema_to_json(decl.parameters)
            else:
                params = {"type": "object", "properties": {}}
            tools.append({
                "type": "function",
                "function": {
                    "name": decl.name,
                    "description": decl.description or "",
                    "parameters": params,
                },
            })
    return tools
|
||||||
|
|
||||||
|
|
||||||
|
# OpenAI-format tool list — pass to client.chat.completions.create(tools=...)
|
||||||
|
OPENAI_TOOL_SCHEMAS: list[dict] = _build_openai_tools()
|
||||||
|
|||||||
@@ -1,47 +1,130 @@
|
|||||||
# Architecture: LLM Backends
|
# Architecture: LLM Backends
|
||||||
|
|
||||||
> How Cortex talks to AI models.
|
> How Cortex selects and talks to AI models.
|
||||||
> Last updated: 2026-04-03
|
> Last updated: 2026-04-06
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Three Backends
|
## Backends
|
||||||
|
|
||||||
| Backend | Used for | Auth | Config |
|
| Backend | Type | Auth | Notes |
|
||||||
|---|---|---|---|
|
|---|---|---|---|
|
||||||
| **Claude CLI** | Primary chat, all user-facing responses | OAuth token from `~/.claude/.credentials.json` | `DEFAULT_MODEL` in `.env` |
|
| **Claude CLI** | `claude_cli` | OAuth token from `~/.claude/.credentials.json` | Primary chat; model set via `DEFAULT_MODEL` in `.env` |
|
||||||
| **Gemini CLI** | Fallback when Claude unavailable | Gemini CLI credentials | Auto-fallback |
|
| **Gemini CLI** | `gemini_cli` | Gemini CLI credentials | Fallback / explicit selection |
|
||||||
| **Local (Open WebUI)** | Private/offline tasks, cost-free use | API key per user in `local_llm.json` | `/settings/local` UI |
|
| **Gemini API** | `gemini_api` | `GEMINI_API_KEY` in `.env` | Orchestrator tool loop only — not general chat |
|
||||||
|
| **Local (OpenAI-compat)** | `local_openai` | API key per host in model registry | Open WebUI, Ollama, OpenRouter, LiteLLM, etc. |
|
||||||
The **Gemini API** (google-genai SDK) is also used — but only by the orchestrator tool loop, not as a general chat backend. See [`ARCH__FUTURE.md`](ARCH__FUTURE.md) for the orchestrator pattern.
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Backend Selection
|
## Backend Selection
|
||||||
|
|
||||||
User toggles backend in the UI: `claude → gemini → local` (cycles). The active backend is stored server-side; the UI reflects it with color coding (default / green / amber).
|
### Default: Role-Based Routing (Auto)
|
||||||
|
|
||||||
When local is active, the active model name appears below the toggle button.
|
When no explicit backend is selected, Cortex routes to the model configured for the
|
||||||
|
request's **role** in the user's model registry. Roles: `chat`, `orchestrator`, `distill`,
|
||||||
|
`coder`, `research` (extensible via `DEFINED_ROLES` in `.env`).
|
||||||
|
|
||||||
**Fallback chain** (automatic, on error):
|
Resolution order for a role:
|
||||||
|
1. User registry: `roles[role].primary → backup_1 → backup_2 → backup_3 → backup_4`
|
||||||
|
2. `.env` role default: `ROLE_CHAT=claude_cli`, `ROLE_DISTILL=claude_cli`, etc.
|
||||||
|
3. Hardcoded last-resort: `chat/distill/coder → claude_cli`, `orchestrator/research → gemini_api`
|
||||||
|
|
||||||
|
### Explicit Override
|
||||||
|
|
||||||
|
The UI backend toggle cycles: **auto → claude → gemini → local → auto**
|
||||||
|
|
||||||
|
- **auto** (default): role-based routing as above; sends `model: null` to `/chat`
|
||||||
|
- **claude / gemini / local**: bypasses role routing; forces that specific backend
|
||||||
|
- When "local" is active, the configured model name appears below the toggle button
|
||||||
|
|
||||||
|
**Fallback chain** (automatic, on any error):
|
||||||
```
|
```
|
||||||
claude → gemini
|
claude → gemini
|
||||||
gemini → claude
|
gemini → claude
|
||||||
local → claude
|
local → claude
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Each response includes a model label (bottom-right of the message bubble) showing which
|
||||||
|
model actually responded. An amber label with `⚡` means a fallback was used.
|
||||||
|
|
||||||
Auth expiry on Claude triggers a UI banner + `claude_auth_expired` SSE event.
|
Auth expiry on Claude triggers a UI banner + `claude_auth_expired` SSE event.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
## Model Registry
|
||||||
|
|
||||||
|
Per-user configuration stored in `home/{user}/model_registry.json`.
|
||||||
|
|
||||||
|
Hosts and models are managed at **Settings → Model Registry** (`/settings/local`).
|
||||||
|
|
||||||
|
### Schema
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"version": 1,
|
||||||
|
"hosts": [
|
||||||
|
{
|
||||||
|
"id": "abc123",
|
||||||
|
"label": "Home ML Laptop",
|
||||||
|
"api_url": "http://192.168.x.x:3000",
|
||||||
|
"api_key": "sk-...",
|
||||||
|
"host_type": "openwebui"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"models": [
|
||||||
|
{
|
||||||
|
"id": "def456",
|
||||||
|
"type": "local_openai",
|
||||||
|
"label": "Gemma Medium",
|
||||||
|
"model_name": "agent-support-gemma-medium",
|
||||||
|
"host_id": "abc123",
|
||||||
|
"context_k": 50,
|
||||||
|
"tags": ["chat", "fast"]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"roles": {
|
||||||
|
"chat": {
|
||||||
|
"primary": "def456",
|
||||||
|
"backup_1": "claude_cli"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### host_type
|
||||||
|
|
||||||
|
Controls which API path layout is used:
|
||||||
|
|
||||||
|
| `host_type` | Chat endpoint | Models endpoint | Use for |
|
||||||
|
|---|---|---|---|
|
||||||
|
| `openwebui` (default) | `POST {url}/api/chat/completions` | `GET {url}/api/models` | Open WebUI, Ollama |
|
||||||
|
| `openai` | `POST {url}/chat/completions` | `GET {url}/models` | OpenRouter, LiteLLM, Anthropic-compat |
|
||||||
|
|
||||||
|
For `host_type: openai`, set `api_url` to the base URL that ends just before `/chat/completions` (for `openwebui`, just before `/api/chat/completions`):
|
||||||
|
- OpenRouter: `https://openrouter.ai/api/v1`
|
||||||
|
- LiteLLM proxy: `http://host:port`
|
||||||
|
|
||||||
|
### Built-in model IDs
|
||||||
|
|
||||||
|
Always resolvable without a registry entry:
|
||||||
|
|
||||||
|
| ID | Backend |
|
||||||
|
|---|---|
|
||||||
|
| `claude_cli` | Claude CLI subprocess |
|
||||||
|
| `gemini_cli` | Gemini CLI subprocess |
|
||||||
|
| `gemini_api` | Gemini API (SDK) — orchestrator only |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
## Claude Backend (`_claude()`)
|
## Claude Backend (`_claude()`)
|
||||||
|
|
||||||
Runs `claude --print --no-session-persistence --output-format text` as a subprocess.
|
Runs `claude --print --no-session-persistence --output-format text` as a subprocess.
|
||||||
|
|
||||||
- System prompt passed via `--system-prompt`
|
- System prompt passed via `--system-prompt`
|
||||||
- Conversation history formatted as `<conversation>` block
|
- Conversation history formatted as `<conversation>` block
|
||||||
- Token read live from `~/.claude/.credentials.json` on every call — never relies on the env var, which goes stale after `claude auth login`
|
- Token read live from `~/.claude/.credentials.json` on every call — never relies on the
|
||||||
- Model override via `--model` flag (e.g. `claude-opus-4-6`)
|
env var, which goes stale after `claude auth login`
|
||||||
|
- Model override via `--model` flag when a specific `model_name` is configured in the registry
|
||||||
|
|
||||||
Timeout: `TIMEOUT_CLAUDE=60` seconds (`.env`)
|
Timeout: `TIMEOUT_CLAUDE=60` seconds (`.env`)
|
||||||
|
|
||||||
@@ -51,7 +134,7 @@ Timeout: `TIMEOUT_CLAUDE=60` seconds (`.env`)
|
|||||||
|
|
||||||
Runs `gemini --output-format text --extensions "" -p <prompt>` as a subprocess.
|
Runs `gemini --output-format text --extensions "" -p <prompt>` as a subprocess.
|
||||||
|
|
||||||
- `--extensions ""` disables all MCP extensions — prevents child processes from keeping pipes open after responding
|
- `--extensions ""` disables all MCP extensions — prevents child processes from keeping pipes open
|
||||||
- `start_new_session=True` puts the process in its own group for clean `os.killpg` on timeout
|
- `start_new_session=True` puts the process in its own group for clean `os.killpg` on timeout
|
||||||
- Output is cleaned to strip CLI noise lines (loading messages, retry notices, quota warnings)
|
- Output is cleaned to strip CLI noise lines (loading messages, retry notices, quota warnings)
|
||||||
|
|
||||||
@@ -61,46 +144,33 @@ Timeout: `TIMEOUT_GEMINI=120` seconds (`.env`)
|
|||||||
|
|
||||||
## Local Backend (`_local()`)
|
## Local Backend (`_local()`)
|
||||||
|
|
||||||
HTTP POST to Open WebUI's OpenAI-compatible endpoint: `{api_url}/api/chat/completions`.
|
HTTP POST to an OpenAI-compatible endpoint. Model config is resolved via the model registry.
|
||||||
|
|
||||||
Per-user config in `home/{user}/local_llm.json`:
|
```python
|
||||||
```json
|
# host_type "openwebui": POST {api_url}/api/chat/completions
|
||||||
{
|
# host_type "openai": POST {api_url}/chat/completions
|
||||||
"hosts": [{"id": "...", "label": "scott_gaming", "api_url": "http://192.168.32.19:3000", "api_key": "sk-..."}],
|
|
||||||
"models": [{"id": "...", "host_id": "...", "label": "Gemma 4 Small", "model_name": "agent-support-gemma-small"}],
|
|
||||||
"active_model_id": "..."
|
|
||||||
}
|
|
||||||
```
|
```
|
||||||
|
|
||||||
Resolution order for active model:
|
|
||||||
1. User's `active_model_id` in `local_llm.json`
|
|
||||||
2. `.env` server defaults (`LOCAL_API_URL` / `LOCAL_MODEL`)
|
|
||||||
3. Error — user is prompted to configure at `/settings/local`
|
|
||||||
|
|
||||||
Timeout: `TIMEOUT_LOCAL=300` seconds (`.env`) — local models may need to load from disk.
|
Timeout: `TIMEOUT_LOCAL=300` seconds (`.env`) — local models may need to load from disk.
|
||||||
|
|
||||||
**Manage at:** `/settings/local` — supports multiple hosts and models per user, "Fetch from host" button to populate model list from the server.
|
---
|
||||||
|
|
||||||
|
## Distillation
|
||||||
|
|
||||||
|
Memory distillation uses `role="distill"` for mid and long passes. Configure the distill
|
||||||
|
model via the Model Registry → Role Assignments → Distill role.
|
||||||
|
|
||||||
|
`.env` override: `ROLE_DISTILL=claude_cli` (default). Set to any built-in ID or leave blank
|
||||||
|
to fall through to the hardcoded default (`claude_cli`).
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Distillation Backends
|
## Code locations
|
||||||
|
|
||||||
Memory distillation runs on a schedule and uses the LLM for mid and long distill passes. By default uses the primary backend (`claude`). Override in `.env`:
|
| File | Responsibility |
|
||||||
|
|---|---|
|
||||||
```
|
| `cortex/llm_client.py` | `complete()` — routing, dispatch, fallback |
|
||||||
DISTILL_BACKEND_MID=local # saves API credits — Gemma handles summarization well
|
| `cortex/model_registry.py` | Per-user registry CRUD and resolution |
|
||||||
DISTILL_BACKEND_LONG= # empty = use primary (claude recommended for quality)
|
| `cortex/routers/local_llm.py` | Settings UI routes + `/api/models/role` AJAX |
|
||||||
```
|
| `cortex/routers/chat.py` | `_backend_label()`, `fallback_used` flag |
|
||||||
|
| `cortex/config.py` | `ROLE_*` env defaults, `DEFINED_ROLES`, `PRIMARY_BACKEND` |
|
||||||
---
|
|
||||||
|
|
||||||
## Current Local Models (scott_gaming, 8 GB VRAM)
|
|
||||||
|
|
||||||
| Model | Alias | Speed | Practical Context |
|
|
||||||
|---|---|---|---|
|
|
||||||
| Gemma 4 E4B | `agent-support-gemma-small` | ~25 t/s | **72k tokens** |
|
|
||||||
| Gemma 4 26B A4B (MoE) | `agent-support-gemma-medium` | ~9 t/s | **50k tokens** |
|
|
||||||
|
|
||||||
Both support OpenAI `tools` / `tool_choice` function calling — required for the local orchestrator.
|
|
||||||
|
|
||||||
Full Open WebUI API reference: [`docs/OPEN_WEBUI_API.md`](../docs/OPEN_WEBUI_API.md)
|
|
||||||
|
|||||||
Reference in New Issue
Block a user