feat: token streaming for orchestrator final response

Switches the orchestrator's final response from a fire-and-wait model to a live SSE stream so text appears token-by-token as the model generates it.

- llm_client: complete() gains token_sink param; anthropic_api backend uses client.messages.stream(); local backend uses httpx SSE streaming; non-streaming backends (claude_cli, gemini_cli) emit the full text as one chunk
- orchestrator_engine + openai_orchestrator: token_sink threaded through run(), _run_from_contents(), _claude_handoff(), and _run_from_messages()
- routers/orchestrator: each job gets an asyncio.Queue; _on_progress and _token_sink write progress/token events to it; _finalize_job emits done, error handler emits error, confirmation gate emits confirm; new GET /orchestrate/{job_id}/stream SSE endpoint with 20s keepalive
- app.js: _doOrchestrate switches from 2s poll loop to EventSource; thinking bubble converts to a streaming message on first token; auto-scroll while streaming; confirm/error/done events handled; finalization unchanged

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-16 23:22:50 -04:00
parent c31eba111f
commit 9cb2b0d9a5
6 changed files with 293 additions and 63 deletions
--- a/cortex/llm_client.py
+++ b/cortex/llm_client.py
@@ -53,6 +53,7 @@ async def complete(
    slot: str | None = None,
    max_tokens: int = 2048,
    attachment: dict | None = None,
+    token_sink=None,  # async (str) -> None; if set, stream tokens as they arrive
 ) -> tuple[str, str]:
    """
    Returns (response_text, actual_backend_used).
@@ -98,7 +99,8 @@ async def complete(
    fallback = _FALLBACK.get(primary, "claude")

    try:
-        response = await _dispatch(primary, system_prompt, messages, resolved_cfg, attachment=attachment)
+        response = await _dispatch(primary, system_prompt, messages, resolved_cfg,
+                                   attachment=attachment, token_sink=token_sink)
        return response, primary
    except Exception as e:
        err_str = str(e)
@@ -109,7 +111,7 @@ async def complete(
            logger.error("%s failed (no fallback — model explicitly configured): %s", primary, e)
            raise
        logger.warning("%s failed (%s) — falling back to %s", primary, e, fallback)
-        response = await _dispatch(fallback, system_prompt, messages, None)
+        response = await _dispatch(fallback, system_prompt, messages, None, token_sink=token_sink)
        return response, fallback


@@ -119,14 +121,24 @@ async def _dispatch(
    messages: list[dict],
    model_cfg: dict | None,
    attachment: dict | None = None,
+    token_sink=None,
 ) -> str:
    if backend == "gemini":
-        return await _gemini(system_prompt, messages)
-    if backend == "local":
-        return await _local(system_prompt, messages, model_cfg, attachment=attachment)
-    if backend == "anthropic_api":
-        return await _anthropic_api(system_prompt, messages, model_cfg)
-    return await _claude(system_prompt, messages, model_cfg)
+        text = await _gemini(system_prompt, messages)
+    elif backend == "local":
+        if token_sink:
+            return await _local_streaming(token_sink, system_prompt, messages, model_cfg)
+        text = await _local(system_prompt, messages, model_cfg, attachment=attachment)
+    elif backend == "anthropic_api":
+        if token_sink:
+            return await _anthropic_api_streaming(token_sink, system_prompt, messages, model_cfg)
+        text = await _anthropic_api(system_prompt, messages, model_cfg)
+    else:
+        text = await _claude(system_prompt, messages, model_cfg)
+    # For non-streaming backends when token_sink is provided, emit the full text as one chunk.
+    if token_sink and text:
+        await token_sink(text)
+    return text


 def _fresh_claude_token() -> str | None:
@@ -302,6 +314,99 @@ async def _anthropic_api(system_prompt: str, messages: list[dict], model_cfg: di
    return text.strip()


+async def _anthropic_api_streaming(
+    token_sink, system_prompt: str, messages: list[dict], model_cfg: dict | None
+) -> str:
+    """Stream a completion from the Anthropic Messages API.
+
+    Every text delta is awaited into ``token_sink`` as soon as it arrives, and
+    the concatenated text (whitespace-stripped) is returned once the stream
+    ends. When the final message carries usage data, token counts are handed
+    to ``usage_tracker.record`` in a background task.
+
+    Args:
+        token_sink: async callable taking one ``str`` chunk.
+        system_prompt: optional system prompt; omitted from the request if empty.
+        messages: chat history; only ``role`` and ``content`` are forwarded.
+        model_cfg: per-model settings; ``api_key`` is required, ``model_name``
+            falls back to ``settings.default_model``.
+
+    Raises:
+        RuntimeError: the ``anthropic`` SDK is missing or no API key is set.
+    """
+    try:
+        import anthropic
+    except ImportError as exc:
+        raise RuntimeError("anthropic SDK not installed — run: pip install 'anthropic>=0.40.0'") from exc
+
+    cfg        = model_cfg or {}
+    api_key    = cfg.get("api_key", "")
+    model_name = cfg.get("model_name") or settings.default_model
+
+    if not api_key:
+        raise RuntimeError("No Anthropic API key — add one at /settings/models")
+
+    client = anthropic.AsyncAnthropic(api_key=api_key)
+    msgs   = [{"role": m["role"], "content": m["content"]} for m in messages]
+    kwargs: dict = {"model": model_name, "max_tokens": 4096, "messages": msgs}
+    if system_prompt:
+        kwargs["system"] = system_prompt
+
+    parts: list[str] = []
+    async with client.messages.stream(**kwargs) as stream:
+        async for chunk in stream.text_stream:
+            await token_sink(chunk)
+            parts.append(chunk)
+        # Fetch the accumulated message while the stream is still open; the
+        # context manager closes the underlying response on exit.
+        final_msg = await stream.get_final_message()
+
+    if final_msg.usage:
+        import usage_tracker
+        from persona import _user
+        # NOTE(review): fire-and-forget — the task reference is not retained,
+        # and the event loop only holds weak references to tasks.
+        asyncio.create_task(usage_tracker.record(
+            username=_user.get(),
+            backend="anthropic_api",
+            model_name=model_name,
+            prompt_tokens=final_msg.usage.input_tokens,
+            completion_tokens=final_msg.usage.output_tokens,
+        ))
+
+    return "".join(parts).strip()
+
+
+async def _local_streaming(
+    token_sink, system_prompt: str, messages: list[dict], model_cfg: dict | None
+) -> str:
+    """Stream a chat completion from a local OpenAI-compatible endpoint.
+
+    Posts a ``stream: true`` chat request, reads the server-sent-event lines,
+    and awaits each non-empty content delta into ``token_sink``. Returns the
+    concatenated text, whitespace-stripped.
+
+    Args:
+        token_sink: async callable taking one ``str`` chunk.
+        system_prompt: optional system prompt, sent as a leading system message.
+        messages: chat history; only ``role`` and ``content`` are forwarded.
+        model_cfg: per-model settings. ``api_url`` and ``model_name`` are
+            required; ``api_key`` adds a bearer header; ``host_type`` of
+            ``"openai"`` selects ``/chat/completions``, anything else
+            ``/api/chat/completions``.
+
+    Raises:
+        RuntimeError: ``api_url`` or ``model_name`` is not configured.
+        httpx.HTTPStatusError: the endpoint answered with an error status.
+
+    NOTE(review): unlike ``_local`` this takes no ``attachment``; ``_dispatch``
+    does not forward one on the streaming path.
+    """
+    import httpx
+    import json as _json
+
+    cfg = model_cfg or {}
+    api_url   = cfg.get("api_url", "")
+    api_key   = cfg.get("api_key", "")
+    model     = cfg.get("model_name", "")
+    host_type = cfg.get("host_type", "openwebui")
+
+    if not api_url:
+        raise RuntimeError("local_api_url not configured")
+    if not model:
+        raise RuntimeError("local_model not configured")
+
+    chat_path = "/chat/completions" if host_type == "openai" else "/api/chat/completions"
+    url       = api_url.rstrip("/") + chat_path
+    headers: dict[str, str] = {"Authorization": f"Bearer {api_key}"} if api_key else {}
+
+    msgs: list[dict] = []
+    if system_prompt:
+        msgs.append({"role": "system", "content": system_prompt})
+    msgs.extend({"role": m["role"], "content": m["content"]} for m in messages)
+
+    payload = {"model": model, "messages": msgs, "stream": True}
+    parts: list[str] = []
+
+    async with httpx.AsyncClient(timeout=settings.timeout_local) as client:
+        async with client.stream("POST", url, json=payload, headers=headers) as resp:
+            resp.raise_for_status()
+            async for line in resp.aiter_lines():
+                # SSE allows "data:" with or without a single following space.
+                if not line.startswith("data:"):
+                    continue
+                data_str = line[5:].strip()
+                if data_str == "[DONE]":
+                    break
+                # Only parsing is best-effort; keep the sink call outside the
+                # try so a failing consumer is not silently ignored.
+                try:
+                    chunk = _json.loads(data_str)
+                    delta = chunk["choices"][0]["delta"].get("content") or ""
+                except (ValueError, LookupError, TypeError, AttributeError):
+                    logger.debug("local stream: skipping unparseable chunk: %r", data_str[:200])
+                    continue
+                if isinstance(delta, str) and delta:
+                    await token_sink(delta)
+                    parts.append(delta)
+
+    return "".join(parts).strip()
+
+
 async def _gemini(system_prompt: str, messages: list[dict]) -> str:
    # Gemini CLI spawns MCP child processes that keep stdout pipes open after responding.
    # start_new_session=True puts the whole tree in its own process group so