feat: OpenAI-compatible orchestrator + backend auto-routing

- openai_orchestrator.py — new ReAct tool loop engine for any
  OpenAI-compatible endpoint (OpenRouter, Open WebUI, Ollama, LiteLLM);
  model handles both tool loop and final response, no Claude handoff needed
- tools/__init__.py — auto-derive OpenAI JSON Schema from existing Gemini
  FunctionDeclarations so tool definitions have a single source of truth
- routers/orchestrator.py — route to openai_orchestrator when model registry
  "orchestrator" role resolves to a local_openai type host
- routers/chat.py — pass role to _backend_label(); fix fallback_used logic
  (only meaningful for explicit backend overrides, not auto-routing)
- static/app.js — add null/"auto" to backend cycle; fetch local model hint
  without overriding the auto default on page load
- model_registry.py — _normalize() back-fills host_type on old registry files
- requirements.txt — add openai>=1.0.0
- ARCH__BACKENDS.md — document OpenAI-compat backend and routing logic

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Scott Idem
2026-04-08 19:18:18 -04:00
parent 8ba5247ef5
commit d9a322164a
9 changed files with 538 additions and 112 deletions

View File

@@ -122,13 +122,20 @@ def _empty() -> dict:
return {"version": 1, "hosts": [], "models": [], "roles": {}}
def _normalize(data: dict) -> dict:
"""Back-fill any missing fields introduced by schema additions."""
for h in data.get("hosts", []):
h.setdefault("host_type", "openwebui")
return data
def _load(username: str) -> dict:
path = _registry_path(username)
if path.exists():
try:
data = json.loads(path.read_text())
if isinstance(data, dict) and "version" in data:
return data
return _normalize(data)
except (json.JSONDecodeError, OSError):
logger.warning("model_registry.json for %s is unreadable — starting fresh", username)
return _empty()

View File

@@ -0,0 +1,196 @@
"""
OpenAI-compatible orchestrator engine.
Implements the same ReAct tool loop as orchestrator_engine.py but uses the
OpenAI tool calling format, which works with any OpenAI-compatible endpoint:
OpenRouter, LiteLLM, Open WebUI, Ollama (tool-capable models), etc.
The model both runs the tool loop AND writes the final user-facing response —
no separate handoff step needed when a single capable model handles everything.
Flow:
1. POST to {api_url}/chat/completions with tools + user message
2. If finish_reason == "tool_calls": execute tools, feed results back, repeat
3. If finish_reason == "stop": final assistant message is the user-facing response
Used when the "orchestrator" role in the model registry resolves to a local_openai
type model. The Gemini engine (orchestrator_engine.py) is used otherwise.
"""
import asyncio
import json
import logging
from openai import AsyncOpenAI
from config import settings
from orchestrator_engine import OrchestratorResult
from tools import OPENAI_TOOL_SCHEMAS, call_tool
logger = logging.getLogger(__name__)
# Appended to the persona system prompt so the model knows it has tools.
# Kept brief — capable models handle tool use without much coaching.
_TOOL_INSTRUCTION = (
"\n\nYou have access to tools. Use them when you need current information, "
"need to read files, or need to take actions on the user's behalf. "
"Respond naturally after gathering what you need."
)
async def run(
task: str,
system_prompt: str = "",
session_messages: list[dict] | None = None,
model_cfg: dict | None = None,
respond_with_final: bool = True,
) -> OrchestratorResult:
"""
Run a tool-enabled task using an OpenAI-compatible API.
Args:
task: The user's request (plain text)
system_prompt: Persona system prompt from context_loader (passed through)
session_messages: Recent conversation history for session continuity
model_cfg: Resolved model config from model_registry (local_openai type)
respond_with_final: If False, return just the tool-loop summary without a
full persona-voiced response (faster; for cron/background)
Returns:
OrchestratorResult — same shape as the Gemini engine for drop-in compatibility
"""
if not model_cfg:
raise RuntimeError("model_cfg is required for the OpenAI orchestrator")
api_url = model_cfg.get("api_url", "")
api_key = model_cfg.get("api_key", "") or "none"
model_name = model_cfg.get("model_name", "")
if not api_url or not model_name:
raise RuntimeError(
f"model_cfg missing api_url or model_name: {model_cfg.get('label', model_cfg)}"
)
client = AsyncOpenAI(base_url=api_url, api_key=api_key)
# System prompt: persona context + brief tool instruction
sys_content = (system_prompt or "") + _TOOL_INSTRUCTION
# Build messages: [system, ...recent_session, current_task]
messages: list[dict] = [{"role": "system", "content": sys_content}]
if session_messages:
messages.extend(session_messages[-6:]) # last 3 turns for context
messages.append({"role": "user", "content": task})
tool_call_log: list[dict] = []
final_response = ""
for round_num in range(settings.orchestrator_max_rounds):
logger.info("OpenAI orchestrator round %d / %d model=%s",
round_num + 1, settings.orchestrator_max_rounds, model_name)
response = await client.chat.completions.create(
model=model_name,
messages=messages,
tools=OPENAI_TOOL_SCHEMAS,
tool_choice="auto",
)
choice = response.choices[0]
msg = choice.message
# Append the assistant turn (MUST include tool_calls if present so the
# next request is valid — OpenAI requires the full history to be consistent)
assistant_msg: dict = {"role": "assistant"}
if msg.content:
assistant_msg["content"] = msg.content
if msg.tool_calls:
assistant_msg["tool_calls"] = [
{
"id": tc.id,
"type": "function",
"function": {
"name": tc.function.name,
"arguments": tc.function.arguments,
},
}
for tc in msg.tool_calls
]
messages.append(assistant_msg)
if choice.finish_reason == "tool_calls" and msg.tool_calls:
# Execute all tool calls in parallel, then feed results back
tool_tasks = [
_execute_tool(tc.function.name, tc.function.arguments)
for tc in msg.tool_calls
]
results = await asyncio.gather(*tool_tasks, return_exceptions=True)
for tc, result in zip(msg.tool_calls, results):
result_str = (
str(result)
if not isinstance(result, Exception)
else f"Tool error: {result}"
)
logger.info("Tool %s%d chars", tc.function.name, len(result_str))
try:
args_parsed = json.loads(tc.function.arguments)
except json.JSONDecodeError:
args_parsed = {"raw": tc.function.arguments}
tool_call_log.append({
"tool": tc.function.name,
"args": args_parsed,
"result": result_str,
})
# Tool result message — tools array must be re-sent on every request
messages.append({
"role": "tool",
"tool_call_id": tc.id,
"content": result_str,
})
else:
# finish_reason == "stop" (or no tool_calls) — model is done
final_response = msg.content or ""
logger.info(
"OpenAI orchestrator done after %d round(s). Tools used: %d",
round_num + 1, len(tool_call_log),
)
break
else:
# Hit the round limit
logger.warning("OpenAI orchestrator hit max rounds (%d)", settings.orchestrator_max_rounds)
final_response = (
f"Reached the tool iteration limit ({settings.orchestrator_max_rounds} rounds). "
"Here is what was gathered:\n\n"
+ "\n\n".join(
f"**{t['tool']}**: {t['result'][:500]}" for t in tool_call_log
)
)
model_label = model_cfg.get("label") or model_name
logger.info("OpenAI orchestrator complete — model=%s tools=%d", model_label, len(tool_call_log))
return OrchestratorResult(
response=final_response,
tool_calls=tool_call_log,
backend="local",
gemini_summary=final_response, # reused for UI display; same content in single-model mode
)
async def _execute_tool(name: str, arguments_json: str) -> str:
"""Parse tool arguments and execute, returning a string result."""
try:
args = json.loads(arguments_json)
except json.JSONDecodeError:
args = {}
try:
return await call_tool(name, args)
except Exception as e:
logger.warning("Tool %s failed: %s", name, e)
return f"Tool error: {e}"

View File

@@ -19,5 +19,8 @@ python-multipart>=0.0.9 # required by FastAPI for Form() data
# Async HTTP client — used for local OpenAI-compatible backend (Open WebUI / Ollama)
httpx>=0.27.0
# OpenAI-compatible client — tool calling for OpenRouter / LiteLLM / any OAI-compat host
openai>=1.0.0
# anthropic SDK not needed — using claude CLI subprocess for auth
# anthropic>=0.40.0

View File

@@ -18,14 +18,14 @@ import event_bus
router = APIRouter()
def _backend_label(backend: str, username: str) -> str:
def _backend_label(backend: str, username: str, role: str = "chat") -> str:
"""Human-readable label for the model that handled a request."""
if backend == "claude":
return "Claude"
if backend == "gemini":
return "Gemini"
if backend == "local":
cfg = model_registry.get_best_local_model(username)
cfg = model_registry.get_best_local_model(username, role)
if cfg:
return cfg.get("label") or cfg.get("model_name") or "Local"
return "Local"
@@ -113,14 +113,16 @@ async def _stream_chat(req: ChatRequest):
if not req.off_record:
log_turn(session_id, req.message, response_text)
requested = req.model or settings.primary_backend
# fallback_used only makes sense for explicit backend selections.
# In auto mode (req.model is None), just report what responded.
fallback_used = bool(req.model and actual_backend != req.model)
payload = {
"type": "response",
"response": response_text,
"session_id": session_id,
"backend": actual_backend,
"backend_label": _backend_label(actual_backend, user),
"fallback_used": actual_backend != requested,
"backend_label": _backend_label(actual_backend, user, role="chat"),
"fallback_used": fallback_used,
}
yield f"data: {json.dumps(payload)}\n\n"

View File

@@ -22,7 +22,9 @@ from auth_utils import get_user_gemini_key
from config import settings
from context_loader import load_context
from persona import set_context, validate as validate_persona
import model_registry
import orchestrator_engine
import openai_orchestrator
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/orchestrate", tags=["orchestrator"])
@@ -157,13 +159,25 @@ async def _run_job(job_id: str, req: OrchestrateRequest, user: str) -> None:
history = load_session(session_id)
session_messages = history or None
result = await orchestrator_engine.run(
task=req.task,
system_prompt=system_prompt,
session_messages=session_messages,
respond_with_claude=req.respond_with_claude,
gemini_api_key=get_user_gemini_key(user),
)
# Choose engine based on the orchestrator role in the model registry
orch_model = model_registry.get_model_for_role(user, "orchestrator")
if orch_model and orch_model.get("type") == "local_openai":
result = await openai_orchestrator.run(
task=req.task,
system_prompt=system_prompt,
session_messages=session_messages,
model_cfg=orch_model,
respond_with_final=req.respond_with_claude,
)
else:
result = await orchestrator_engine.run(
task=req.task,
system_prompt=system_prompt,
session_messages=session_messages,
respond_with_claude=req.respond_with_claude,
gemini_api_key=get_user_gemini_key(user),
)
# Save the turn to the session store so it survives a page refresh
history.append({"role": "user", "content": req.task})

View File

@@ -84,7 +84,7 @@
if (helpLink) helpLink.href = `/help?persona=${encodeURIComponent(CORTEX_PERSONA)}`;
let sessionId = null;
let primaryBackend = 'claude';
let primaryBackend = null; // null = auto / role-based routing
let activeController = null;
let currentHistory = []; // mirrors backend session [{role, content}, ...]
let talkThinkingDiv = null; // pending "thinking…" bubble for live Talk updates
@@ -340,23 +340,30 @@
}
// ── Backend toggle ───────────────────────────────────────────
// null = "auto" — uses role-based routing from model registry
// 'claude' / 'gemini' / 'local' = explicit override
fetch('/backend').then(r => r.json()).then(d => setBackendUI(d));
// On load only fetch local_model hint; don't override primaryBackend default (null)
fetch('/backend').then(r => r.json()).then(d => {
if (backendModelHint && d.local_model) {
// Pre-fill hint in case user is already in local mode
backendModelHint.textContent = d.local_model.label || d.local_model.model_name;
}
});
const BACKEND_CYCLE = ['claude', 'gemini', 'local'];
const BACKEND_CYCLE = [null, 'claude', 'gemini', 'local'];
const BACKEND_CLASS = { claude: '', gemini: 'mem-on', local: 'local-on' };
const backendModelHint = document.getElementById('backend-model-hint');
function setBackendUI(d) {
const backend = d.primary || d; // accept full response obj or bare string
function setBackendUI(backend, localModel) {
primaryBackend = backend;
backendToggle.textContent = backend;
const extra = BACKEND_CLASS[backend] || '';
backendToggle.textContent = backend === null ? 'auto' : backend;
const extra = backend === null ? '' : (BACKEND_CLASS[backend] || '');
backendToggle.className = 'ctx-btn' + (extra ? ' ' + extra : '');
if (backendModelHint) {
if (backend === 'local' && d.local_model) {
backendModelHint.textContent = d.local_model.label || d.local_model.model_name;
if (backend === 'local' && localModel) {
backendModelHint.textContent = localModel.label || localModel.model_name;
backendModelHint.style.display = '';
} else {
backendModelHint.textContent = '';
@@ -365,17 +372,26 @@
}
}
// Initialize to auto mode
setBackendUI(null, null);
backendToggle.addEventListener('click', async () => {
const idx = BACKEND_CYCLE.indexOf(primaryBackend);
const next = BACKEND_CYCLE[(idx + 1) % BACKEND_CYCLE.length];
const res = await fetch('/backend', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ primary: next }),
});
const d = await res.json();
setBackendUI(d);
addMessage('system', `Backend: ${d.primary} (fallback: ${d.fallback})`);
if (next === null) {
// Auto: role-based routing — no server call needed
setBackendUI(null, null);
addMessage('system', 'Backend: auto (role-based routing)');
} else {
const res = await fetch('/backend', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ primary: next }),
});
const d = await res.json();
setBackendUI(next, d.local_model);
addMessage('system', `Backend: ${next} (fallback: ${d.fallback})`);
}
});
// ── Sessions panel ───────────────────────────────────────────
@@ -917,42 +933,15 @@
if (activeController) activeController.abort();
});
async function sendMessage() {
const text = inputEl.value.trim();
if (!text || activeController) return;
inputEl.value = '';
syncHeight();
sendBtn.style.display = 'none';
stopBtn.style.display = 'flex';
headerEmoji.classList.add('processing');
activeController = new AbortController();
const userHistIdx = currentHistory.length;
currentHistory.push({ role: 'user', content: text });
const userMsgDiv = addMessage('user', text);
attachHistoryControls(userMsgDiv, userHistIdx);
scrollToBottom();
const thinkingDiv = addMessage('assistant thinking', '✨ thinking…');
// ── Chat fetch + SSE handler ─────────────────────────────────
// Extracted so the retry button can call it without re-adding the
// user message to the DOM or currentHistory.
async function _doSend(payload, thinkingDiv) {
try {
const res = await fetch('/chat', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
message: text,
session_id: sessionId,
tier: currentTier,
include_long: memLong,
include_mid: memMid,
include_short: memShort,
off_record: current_mode === 'otr',
model: primaryBackend,
user: CORTEX_USER,
persona: CORTEX_PERSONA,
}),
body: JSON.stringify(payload),
signal: activeController.signal,
});
@@ -1004,10 +993,77 @@
thinkingDiv.className = 'message system';
thinkingDiv.textContent = 'Stopped.';
} else {
// Show error + retry button
thinkingDiv.className = 'message error';
thinkingDiv.textContent = `Error: ${err.message}`;
thinkingDiv.innerHTML = '';
const errSpan = document.createElement('span');
errSpan.textContent = `Error: ${err.message}`;
thinkingDiv.appendChild(errSpan);
const retryBtn = document.createElement('button');
retryBtn.className = 'retry-btn';
retryBtn.textContent = '↺ Retry';
retryBtn.addEventListener('click', async () => {
// Roll back the failed user push, re-push, and try again
if (currentHistory.at(-1)?.role === 'user') currentHistory.pop();
currentHistory.push({ role: 'user', content: payload.message });
thinkingDiv.className = 'message assistant thinking';
thinkingDiv.textContent = '✨ thinking…';
activeController = new AbortController();
sendBtn.style.display = 'none';
stopBtn.style.display = 'flex';
headerEmoji.classList.add('processing');
await _doSend(payload, thinkingDiv);
activeController = null;
headerEmoji.classList.remove('processing');
sendBtn.style.display = 'block';
stopBtn.style.display = 'none';
inputEl.focus();
});
thinkingDiv.appendChild(retryBtn);
}
}
}
async function sendMessage() {
const text = inputEl.value.trim();
if (!text || activeController) return;
inputEl.value = '';
syncHeight();
sendBtn.style.display = 'none';
stopBtn.style.display = 'flex';
headerEmoji.classList.add('processing');
activeController = new AbortController();
const userHistIdx = currentHistory.length;
currentHistory.push({ role: 'user', content: text });
const userMsgDiv = addMessage('user', text);
attachHistoryControls(userMsgDiv, userHistIdx);
scrollToBottom();
const thinkingDiv = addMessage('assistant thinking', '✨ thinking…');
const payload = {
message: text,
session_id: sessionId,
tier: currentTier,
include_long: memLong,
include_mid: memMid,
include_short: memShort,
off_record: current_mode === 'otr',
model: primaryBackend,
user: CORTEX_USER,
persona: CORTEX_PERSONA,
};
await _doSend(payload, thinkingDiv);
activeController = null;
headerEmoji.classList.remove('processing');

View File

@@ -565,6 +565,26 @@
}
.model-tag.fallback { color: #f59e0b; }
/* Retry button — shown in error message bubbles */
.retry-btn {
display: inline-block;
margin-top: 0.6rem;
margin-left: 0.15rem;
padding: 0.25rem 0.7rem;
font-size: 0.78rem;
font-family: inherit;
background: transparent;
color: var(--error-text);
border: 1px solid var(--error-border);
border-radius: 4px;
cursor: pointer;
transition: background 0.15s, color 0.15s;
}
.retry-btn:hover {
background: var(--error-border);
color: #fff;
}
/* Note messages */
.message.note-private {
align-self: flex-end;

View File

@@ -551,3 +551,61 @@ async def call_tool(name: str, args: dict) -> str:
if fn is None:
return f"Unknown tool: {name}"
return await fn(**args)
# ---------------------------------------------------------------------------
# OpenAI JSON Schema format — auto-derived from the Gemini declarations above
# so there is a single source of truth for tool definitions.
# ---------------------------------------------------------------------------
_GEMINI_TYPE_TO_JSON = {
"OBJECT": "object",
"STRING": "string",
"INTEGER": "integer",
"NUMBER": "number",
"BOOLEAN": "boolean",
"ARRAY": "array",
}
def _schema_to_json(schema) -> dict:
"""Recursively convert a Gemini types.Schema to a JSON Schema dict."""
type_name = getattr(getattr(schema, "type", None), "name", "STRING")
result: dict = {"type": _GEMINI_TYPE_TO_JSON.get(type_name, "string")}
if getattr(schema, "description", None):
result["description"] = schema.description
props = getattr(schema, "properties", None) or {}
if result["type"] == "object":
result["properties"] = {k: _schema_to_json(v) for k, v in props.items()}
req = getattr(schema, "required", None)
if req:
result["required"] = list(req)
return result
def _build_openai_tools() -> list[dict]:
"""Convert TOOL_DECLARATIONS (Gemini format) to OpenAI tool schemas."""
out = []
for decl in TOOL_DECLARATIONS[0].function_declarations:
params = (
_schema_to_json(decl.parameters)
if decl.parameters
else {"type": "object", "properties": {}}
)
out.append({
"type": "function",
"function": {
"name": decl.name,
"description": decl.description or "",
"parameters": params,
},
})
return out
# OpenAI-format tool list — pass to client.chat.completions.create(tools=...)
OPENAI_TOOL_SCHEMAS: list[dict] = _build_openai_tools()