From 67f5db70a37847b5d013d2ad5dbff9ce1df9294b Mon Sep 17 00:00:00 2001 From: Scott Idem Date: Wed, 17 Jun 2026 21:32:54 -0400 Subject: [PATCH] =?UTF-8?q?feat:=20janitor=20role=20=E2=80=94=20session=20?= =?UTF-8?q?checkpoint=20compaction?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New cortex/janitor.py runs before each orchestrator dispatch. When a session exceeds 20 user turns or ~12K estimated tokens, the oldest half is summarized by the janitor role model and replaced with a compact checkpoint message. Fail-safe: always returns original history if the model call fails. Config: JANITOR_TURN_THRESHOLD, JANITOR_TOKEN_THRESHOLD in .env. Assign Gemma E4B or Haiku 4.5 to the janitor role for effectively-free compaction. Co-Authored-By: Claude Sonnet 4.6 --- cortex/config.py | 11 +++- cortex/janitor.py | 117 +++++++++++++++++++++++++++++++++ cortex/routers/orchestrator.py | 4 +- documentation/TODO__Agents.md | 55 ++++++---------- 4 files changed, 149 insertions(+), 38 deletions(-) create mode 100644 cortex/janitor.py diff --git a/cortex/config.py b/cortex/config.py index dba6e90..4b591a3 100644 --- a/cortex/config.py +++ b/cortex/config.py @@ -71,13 +71,20 @@ class Settings(BaseSettings): role_chat: str = "claude_cli" role_orchestrator: str = "gemini_api" role_distill: str = "claude_cli" + role_janitor: str = "claude_cli" # assign a cheap/fast model: Haiku 4.5, local Gemma E4B role_coder: str = "claude_cli" role_research: str = "gemini_api" # Comma-separated list of standard roles shown in the model settings UI. # Add custom roles here to extend the UI without code changes. - # Example: DEFINED_ROLES=chat,orchestrator,distill,coder,research,medical - defined_roles: str = "chat,orchestrator,distill,coder,research" + # Example: DEFINED_ROLES=chat,orchestrator,distill,janitor,coder,research,medical + defined_roles: str = "chat,orchestrator,distill,janitor,coder,research" + + # Session checkpoint compaction ("janitor") thresholds. + # Compaction fires when EITHER threshold is exceeded. + # Override in .env: JANITOR_TURN_THRESHOLD=15 JANITOR_TOKEN_THRESHOLD=8000 + janitor_turn_threshold: int = 20 # user turns (each turn = 1 user + 1 assistant message) + janitor_token_threshold: int = 12000 # estimated tokens (chars / 4 heuristic) # Memory tier token budgets — soft caps used during distillation # Override in .env: MEMORY_BUDGET_LONG=4000 etc. diff --git a/cortex/janitor.py b/cortex/janitor.py new file mode 100644 index 0000000..75f379d --- /dev/null +++ b/cortex/janitor.py @@ -0,0 +1,117 @@ +""" +Session checkpoint compaction ("janitor"). + +Called before each orchestrator run. When a session exceeds the configured turn +or token threshold, the oldest half of the history is summarized by the janitor +role model and replaced with a compact checkpoint message. This keeps the token +count passed to the orchestrator lean while preserving a faithful record of what +happened earlier in the session. + +The janitor role should be assigned a cheap, fast model — a small local model +(Gemma E4B) or a lightweight cloud model (Haiku 4.5). It has no tools and the +task is simple enough that quality matters less than speed and cost. + +Thresholds (configurable in .env): + JANITOR_TURN_THRESHOLD — compact after N user turns (default: 20) + JANITOR_TOKEN_THRESHOLD — compact after ~N estimated tokens (default: 12000) +""" + +import logging + +from config import settings + +logger = logging.getLogger(__name__) + +_SYSTEM = "You are a concise summarizer. Write only the summary — no preamble, no labels." + +_PROMPT_TMPL = """\ +Summarize the conversation below in 3–8 sentences. Capture what was discussed, \ +any decisions or conclusions reached, and key specifics (names, values, file paths, etc.). \ +Write only the summary paragraph. + +CONVERSATION: +{conversation}""" + + +def _format_messages(messages: list[dict]) -> str: + lines = [] + for m in messages: + role = m.get("role", "unknown").upper() + content = (m.get("content") or "").strip() + if not content: + continue + # Cap individual messages so the prompt stays manageable for small models + if len(content) > 600: + content = content[:600] + "…" + lines.append(f"[{role}]: {content}") + return "\n".join(lines) + + +async def maybe_checkpoint(session_id: str) -> list[dict]: + """ + Load the session, compact if thresholds are exceeded, and return the + message list to use for the upcoming orchestrator run. + + Always returns a list — returns the original (unchanged) list if: + - the session does not exist yet + - thresholds are not met + - the janitor model call fails (fail-safe: never discard history) + """ + from session_store import load, save + + messages = load(session_id) + if not messages: + return [] + + turn_count = sum(1 for m in messages if m["role"] == "user") + estimated_tokens = sum(len(m.get("content") or "") for m in messages) // 4 + + if (turn_count < settings.janitor_turn_threshold + and estimated_tokens < settings.janitor_token_threshold): + return messages + + # Walk back to a clean turn boundary so we never split mid-exchange. + # midpoint lands on an "assistant" message boundary. + midpoint = len(messages) // 2 + while midpoint > 0 and messages[midpoint - 1].get("role") != "assistant": + midpoint -= 1 + + if midpoint < 4: + # Too short to compact meaningfully — threshold likely set very low + return messages + + old_messages = messages[:midpoint] + recent_messages = messages[midpoint:] + + conversation_text = _format_messages(old_messages) + summary_prompt = _PROMPT_TMPL.format(conversation=conversation_text) + + try: + from llm_client import complete as llm_complete + summary, backend = await llm_complete( + system_prompt=_SYSTEM, + messages=[{"role": "user", "content": summary_prompt}], + role="janitor", + ) + + checkpoint_msg = { + "role": "assistant", + "content": ( + f"[Session checkpoint — {len(old_messages)} messages summarized " + f"via {backend}]\n\n{summary.strip()}" + ), + } + + compacted = [checkpoint_msg] + recent_messages + save(session_id, compacted) + + logger.info( + "Janitor: session=%s compacted %d→%d messages (turns=%d ~%d tokens) via %s", + session_id, len(messages), len(compacted), turn_count, estimated_tokens, backend, + ) + return compacted + + except Exception as exc: + # Fail-safe: never lose history because the janitor model is unavailable + logger.warning("Janitor skipped for session %s: %s", session_id, exc) + return messages diff --git a/cortex/routers/orchestrator.py b/cortex/routers/orchestrator.py index 2ba26d6..3446a0b 100644 --- a/cortex/routers/orchestrator.py +++ b/cortex/routers/orchestrator.py @@ -257,6 +257,7 @@ async def _run_job(job_id: str, req: OrchestrateRequest, user: str) -> None: try: from session_store import load as load_session, save as save_session, generate_session_id + from janitor import maybe_checkpoint as janitor_checkpoint tier = req.tier or settings.default_tier role_cfg = model_registry.get_role_config(user, req.chat_role) @@ -272,7 +273,8 @@ async def _run_job(job_id: str, req: OrchestrateRequest, user: str) -> None: ) session_id = req.session_id or generate_session_id() - history = load_session(session_id) + # Compact old session turns before dispatching — no-op on new sessions or short ones. + history = await janitor_checkpoint(session_id) if req.session_id else load_session(session_id) session_messages = history or None orch_model = model_registry.get_model_for_role(user, "orchestrator") diff --git a/documentation/TODO__Agents.md b/documentation/TODO__Agents.md index 495ed39..c8a5c3d 100644 --- a/documentation/TODO__Agents.md +++ b/documentation/TODO__Agents.md @@ -44,7 +44,7 @@ automatically. Remaining work is quality/reliability parity, not ground-up desig - [x] Retry logic on transient API errors (connection timeout, 429, 503) — 2026-05-09 - `_chat_with_retry()` helper in `openai_orchestrator.py`; 3 attempts, exponential backoff (1s, 2s) - Retries on `APIConnectionError` and `APIStatusError` with status 429/500/502/503/504 -- [ ] Test end-to-end with Gemma 4 E4B and 26B A4B on scott_gaming +- [x] Test end-to-end with Gemma 4 E4B and 26B A4B on scott_gaming — 2026-06-17 - [ ] Review `ARCH__FUTURE.md` agent architecture ideas before finalising design - Reference: `docs/OPEN_WEBUI_API.md`, `documentation/ARCH__FUTURE.md` §1 @@ -211,43 +211,28 @@ Upload an image or document inline and have it flow into context. - [x] Text/code files read as UTF-8, injected as fenced code block in message - [x] Thumbnail/filename shown above sent message in UI -### [Intelligence] Session checkpoint compaction — "janitor" role -Proactive in-session context pruning using a cheap/fast model to keep expensive -model costs down as sessions grow. Not continuous per-token — checkpoint-triggered. +### [Intelligence] Session checkpoint compaction — "janitor" role ✅ — 2026-06-17 +Proactive in-session context pruning using a cheap/fast model. Fires before each +orchestrator run; compacts oldest half of history when either threshold is exceeded. -**Design:** -- New `janitor` role in the model registry (alongside `chat`, `orchestrator`, `distill`) - - Assign a cheap/fast model: Haiku 4.5, local Gemma E4B, or similar - - Falls back to the `distill` role model if `janitor` is not configured -- Trigger condition (either/or): session exceeds N turns (e.g. 20) OR estimated token - count exceeds a threshold (e.g. 12K tokens of history) -- On trigger: call janitor model with the oldest half of session history; ask it to - write a compact "what we've established so far" summary block (3–8 sentences) -- Replace the compacted turns with a single synthetic `assistant` message: - `[Session checkpoint — {N} turns summarized]: {summary}` -- The remaining recent turns stay untouched — only the stale prefix is replaced -- Token estimate: count chars / 4 as a cheap heuristic; no exact tokenizer needed +- [x] **`cortex/janitor.py`** — `maybe_checkpoint(session_id)` — loads session, + checks `janitor_turn_threshold` (default 20) and `janitor_token_threshold` + (default 12000 estimated tokens); finds a clean turn boundary; calls janitor + role model with the oldest half; replaces compacted messages with a single + `[Session checkpoint — N messages summarized via {backend}]` assistant message; + fail-safe returns original messages if model call fails — 2026-06-17 +- [x] **`cortex/config.py`** — `janitor_turn_threshold`, `janitor_token_threshold`, + `role_janitor` settings; `janitor` added to `defined_roles` — 2026-06-17 +- [x] **`cortex/routers/orchestrator.py`** — calls `janitor_checkpoint(session_id)` + before dispatching to either orchestrator engine; no-op on new sessions — 2026-06-17 +- [x] **`model_registry.py`** — `janitor` already in `REQUIRED_ROLES`, + `ROLE_DEFAULT_TOOLS` (no tools), and `_ROLE_LAST_RESORT` from earlier session -**Files to change:** -- `model_registry.py` — add `janitor` to `ROLE_DEFAULT_TOOLS` (empty list — no tools) - and to the roles UI in `settings/models` -- `session_store.py` — add `maybe_checkpoint(session_id)` that checks turn count / - estimated tokens and calls the janitor model if threshold is exceeded -- `openai_orchestrator.py` — call `maybe_checkpoint()` at the start of each run, - before building the active tool list and context -- `orchestrator_engine.py` — same, before building the Gemini context -- Settings UI — expose janitor turn/token thresholds as configurable values - (default: 20 turns or 12K history tokens) +**To configure:** assign Gemma E4B (local, free) or Haiku 4.5 to the `janitor` role +in Settings → Model Registry. Thresholds overridable in `.env`: +`JANITOR_TURN_THRESHOLD=15 JANITOR_TOKEN_THRESHOLD=8000` -**Economics:** -- Haiku 4.5: ~$0.80/1M input — compacting 10K tokens costs ~$0.008 -- Saves 8–12K tokens on every subsequent Sonnet/Opus call in that session -- Break-even after 1–2 expensive model calls post-checkpoint -- Local janitor (Gemma E4B) = effectively free; ideal default when available - -**Not needed yet** — most sessions are short enough that existing `_compact_messages()` -heuristic handles the worst cases. Priority rises with dev-agent pipeline work where -aider tool results can be very large. +**Deferred:** Settings UI sliders for thresholds (low value — .env is sufficient) ### [UX] Token streaming for orchestrator final response ✅ — 2026-06-16 Text appears token-by-token while the model is generating, instead of waiting for the