feat: token streaming for orchestrator final response
Switches the orchestrator's final response from a fire-and-wait model to a
live SSE stream so text appears token-by-token as the model generates it.
- llm_client: complete() gains token_sink param; anthropic_api backend uses
client.messages.stream(); local backend uses httpx SSE streaming; non-streaming
backends (claude_cli, gemini_cli) emit the full text as one chunk
- orchestrator_engine + openai_orchestrator: token_sink threaded through run(),
_run_from_contents(), _claude_handoff(), and _run_from_messages()
- routers/orchestrator: each job gets an asyncio.Queue; _on_progress and
_token_sink write progress/token events to it; _finalize_job emits done,
error handler emits error, confirmation gate emits confirm; new GET
/orchestrate/{job_id}/stream SSE endpoint with 20s keepalive
- app.js: _doOrchestrate switches from 2s poll loop to EventSource; thinking
bubble converts to a streaming message on first token; auto-scroll while
streaming; confirm/error/done events handled; finalization unchanged
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -53,6 +53,7 @@ async def complete(
|
||||
slot: str | None = None,
|
||||
max_tokens: int = 2048,
|
||||
attachment: dict | None = None,
|
||||
token_sink=None, # async (str) -> None; if set, stream tokens as they arrive
|
||||
) -> tuple[str, str]:
|
||||
"""
|
||||
Returns (response_text, actual_backend_used).
|
||||
@@ -98,7 +99,8 @@ async def complete(
|
||||
fallback = _FALLBACK.get(primary, "claude")
|
||||
|
||||
try:
|
||||
response = await _dispatch(primary, system_prompt, messages, resolved_cfg, attachment=attachment)
|
||||
response = await _dispatch(primary, system_prompt, messages, resolved_cfg,
|
||||
attachment=attachment, token_sink=token_sink)
|
||||
return response, primary
|
||||
except Exception as e:
|
||||
err_str = str(e)
|
||||
@@ -109,7 +111,7 @@ async def complete(
|
||||
logger.error("%s failed (no fallback — model explicitly configured): %s", primary, e)
|
||||
raise
|
||||
logger.warning("%s failed (%s) — falling back to %s", primary, e, fallback)
|
||||
response = await _dispatch(fallback, system_prompt, messages, None)
|
||||
response = await _dispatch(fallback, system_prompt, messages, None, token_sink=token_sink)
|
||||
return response, fallback
|
||||
|
||||
|
||||
@@ -119,14 +121,24 @@ async def _dispatch(
|
||||
messages: list[dict],
|
||||
model_cfg: dict | None,
|
||||
attachment: dict | None = None,
|
||||
token_sink=None,
|
||||
) -> str:
|
||||
if backend == "gemini":
|
||||
return await _gemini(system_prompt, messages)
|
||||
if backend == "local":
|
||||
return await _local(system_prompt, messages, model_cfg, attachment=attachment)
|
||||
if backend == "anthropic_api":
|
||||
return await _anthropic_api(system_prompt, messages, model_cfg)
|
||||
return await _claude(system_prompt, messages, model_cfg)
|
||||
text = await _gemini(system_prompt, messages)
|
||||
elif backend == "local":
|
||||
if token_sink:
|
||||
return await _local_streaming(token_sink, system_prompt, messages, model_cfg)
|
||||
text = await _local(system_prompt, messages, model_cfg, attachment=attachment)
|
||||
elif backend == "anthropic_api":
|
||||
if token_sink:
|
||||
return await _anthropic_api_streaming(token_sink, system_prompt, messages, model_cfg)
|
||||
text = await _anthropic_api(system_prompt, messages, model_cfg)
|
||||
else:
|
||||
text = await _claude(system_prompt, messages, model_cfg)
|
||||
# For non-streaming backends when token_sink is provided, emit the full text as one chunk.
|
||||
if token_sink and text:
|
||||
await token_sink(text)
|
||||
return text
|
||||
|
||||
|
||||
def _fresh_claude_token() -> str | None:
|
||||
@@ -302,6 +314,99 @@ async def _anthropic_api(system_prompt: str, messages: list[dict], model_cfg: di
|
||||
return text.strip()
|
||||
|
||||
|
||||
async def _anthropic_api_streaming(
|
||||
token_sink, system_prompt: str, messages: list[dict], model_cfg: dict | None
|
||||
) -> str:
|
||||
try:
|
||||
import anthropic
|
||||
except ImportError:
|
||||
raise RuntimeError("anthropic SDK not installed — run: pip install 'anthropic>=0.40.0'")
|
||||
|
||||
cfg = model_cfg or {}
|
||||
api_key = cfg.get("api_key", "")
|
||||
model_name = cfg.get("model_name") or settings.default_model
|
||||
|
||||
if not api_key:
|
||||
raise RuntimeError("No Anthropic API key — add one at /settings/models")
|
||||
|
||||
client = anthropic.AsyncAnthropic(api_key=api_key)
|
||||
msgs = [{"role": m["role"], "content": m["content"]} for m in messages]
|
||||
kwargs: dict = {"model": model_name, "max_tokens": 4096, "messages": msgs}
|
||||
if system_prompt:
|
||||
kwargs["system"] = system_prompt
|
||||
|
||||
full_text = ""
|
||||
async with client.messages.stream(**kwargs) as stream:
|
||||
async for chunk in stream.text_stream:
|
||||
await token_sink(chunk)
|
||||
full_text += chunk
|
||||
|
||||
final_msg = await stream.get_final_message()
|
||||
if final_msg.usage:
|
||||
import usage_tracker
|
||||
from persona import _user
|
||||
asyncio.create_task(usage_tracker.record(
|
||||
username=_user.get(),
|
||||
backend="anthropic_api",
|
||||
model_name=model_name,
|
||||
prompt_tokens=final_msg.usage.input_tokens,
|
||||
completion_tokens=final_msg.usage.output_tokens,
|
||||
))
|
||||
|
||||
return full_text.strip()
|
||||
|
||||
|
||||
async def _local_streaming(
|
||||
token_sink, system_prompt: str, messages: list[dict], model_cfg: dict | None
|
||||
) -> str:
|
||||
import httpx
|
||||
import json as _json
|
||||
|
||||
cfg = model_cfg or {}
|
||||
api_url = cfg.get("api_url", "")
|
||||
api_key = cfg.get("api_key", "")
|
||||
model = cfg.get("model_name", "")
|
||||
host_type = cfg.get("host_type", "openwebui")
|
||||
|
||||
if not api_url:
|
||||
raise RuntimeError("local_api_url not configured")
|
||||
if not model:
|
||||
raise RuntimeError("local_model not configured")
|
||||
|
||||
chat_path = "/chat/completions" if host_type == "openai" else "/api/chat/completions"
|
||||
url = api_url.rstrip("/") + chat_path
|
||||
headers: dict[str, str] = {"Authorization": f"Bearer {api_key}"} if api_key else {}
|
||||
|
||||
msgs: list[dict] = []
|
||||
if system_prompt:
|
||||
msgs.append({"role": "system", "content": system_prompt})
|
||||
for m in messages:
|
||||
msgs.append({"role": m["role"], "content": m["content"]})
|
||||
|
||||
payload = {"model": model, "messages": msgs, "stream": True}
|
||||
full_text = ""
|
||||
|
||||
async with httpx.AsyncClient(timeout=settings.timeout_local) as client:
|
||||
async with client.stream("POST", url, json=payload, headers=headers) as resp:
|
||||
resp.raise_for_status()
|
||||
async for line in resp.aiter_lines():
|
||||
if not line or not line.startswith("data: "):
|
||||
continue
|
||||
data_str = line[6:].strip()
|
||||
if data_str == "[DONE]":
|
||||
break
|
||||
try:
|
||||
chunk = _json.loads(data_str)
|
||||
delta = (chunk["choices"][0]["delta"].get("content") or "")
|
||||
if delta:
|
||||
await token_sink(delta)
|
||||
full_text += delta
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return full_text.strip()
|
||||
|
||||
|
||||
async def _gemini(system_prompt: str, messages: list[dict]) -> str:
|
||||
# Gemini CLI spawns MCP child processes that keep stdout pipes open after responding.
|
||||
# start_new_session=True puts the whole tree in its own process group so
|
||||
|
||||
Reference in New Issue
Block a user