feat: janitor role — session checkpoint compaction

New cortex/janitor.py runs before each orchestrator dispatch on an existing session. When a session reaches 20 user turns or ~12K estimated tokens, roughly the oldest half (cut at a turn boundary) is summarized by the janitor role model and replaced with a compact checkpoint message. Fail-safe: always returns the original history if the model call fails. Config: JANITOR_TURN_THRESHOLD, JANITOR_TOKEN_THRESHOLD in .env. Assign Gemma E4B or Haiku 4.5 to the janitor role for effectively-free compaction.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-17 21:32:54 -04:00
parent 32585804dd
commit 67f5db70a3
4 changed files with 149 additions and 38 deletions
--- a/cortex/config.py
+++ b/cortex/config.py
@@ -71,13 +71,20 @@ class Settings(BaseSettings):
    role_chat: str = "claude_cli"
    role_orchestrator: str = "gemini_api"
    role_distill: str = "claude_cli"
+    role_janitor: str = "claude_cli"   # assign a cheap/fast model: Haiku 4.5, local Gemma E4B
    role_coder: str = "claude_cli"
    role_research: str = "gemini_api"

    # Comma-separated list of standard roles shown in the model settings UI.
    # Add custom roles here to extend the UI without code changes.
-    # Example: DEFINED_ROLES=chat,orchestrator,distill,coder,research,medical
-    defined_roles: str = "chat,orchestrator,distill,coder,research"
+    # Example: DEFINED_ROLES=chat,orchestrator,distill,janitor,coder,research,medical
+    defined_roles: str = "chat,orchestrator,distill,janitor,coder,research"
+
+    # Session checkpoint compaction ("janitor") thresholds.
+    # Compaction fires when EITHER threshold is reached or exceeded.
+    # Override in .env: JANITOR_TURN_THRESHOLD=15  JANITOR_TOKEN_THRESHOLD=8000
+    janitor_turn_threshold: int = 20    # user turns (each turn = 1 user + 1 assistant message)
+    janitor_token_threshold: int = 12000  # estimated tokens (chars / 4 heuristic)

    # Memory tier token budgets — soft caps used during distillation
    # Override in .env: MEMORY_BUDGET_LONG=4000 etc.
--- a/cortex/janitor.py
+++ b/cortex/janitor.py
@@ -0,0 +1,117 @@
+"""
+Session checkpoint compaction ("janitor").
+
+Called before each orchestrator run that continues an existing session. When the
+session reaches the configured turn or token threshold, roughly the oldest half
+of the history (cut at a turn boundary) is summarized by the janitor role model
+and replaced with a single checkpoint message. This keeps the token count passed
+to the orchestrator lean while retaining a summary of the earlier conversation.
+
+The janitor role should be assigned a cheap, fast model — a small local model
+(Gemma E4B) or a lightweight cloud model (Haiku 4.5). It has no tools and the
+task is simple enough that quality matters less than speed and cost.
+
+Thresholds (configurable in .env):
+  JANITOR_TURN_THRESHOLD  — compact at N or more user turns  (default: 20)
+  JANITOR_TOKEN_THRESHOLD — compact at ~N or more estimated tokens (default: 12000)
+"""
+
+import logging
+
+from config import settings
+
+logger = logging.getLogger(__name__)
+
+_SYSTEM = "You are a concise summarizer. Write only the summary — no preamble, no labels."
+
+_PROMPT_TMPL = """\
+Summarize the conversation below in 3–8 sentences. Capture what was discussed, \
+any decisions or conclusions reached, and key specifics (names, values, file paths, etc.). \
+Write only the summary paragraph.
+
+CONVERSATION:
+{conversation}"""
+
+
+def _format_messages(messages: list[dict]) -> str:
+    lines = []
+    for m in messages:
+        role = m.get("role", "unknown").upper()
+        content = (m.get("content") or "").strip()
+        if not content:
+            continue
+        # Cap individual messages so the prompt stays manageable for small models
+        if len(content) > 600:
+            content = content[:600] + "…"
+        lines.append(f"[{role}]: {content}")
+    return "\n".join(lines)
+
+
+async def maybe_checkpoint(session_id: str) -> list[dict]:
+    """
+    Load the session, compact if thresholds are exceeded, and return the
+    message list to use for the upcoming orchestrator run.
+
+    Always returns a list: empty if the session has no stored messages, and
+    the loaded history unchanged if:
+      - neither threshold is met, or the older half is under 4 messages
+      - the janitor model call or the save fails (fail-safe: never discard history)
+    """
+    from session_store import load, save
+
+    messages = load(session_id)
+    if not messages:
+        return []
+
+    turn_count = sum(1 for m in messages if m["role"] == "user")
+    estimated_tokens = sum(len(m.get("content") or "") for m in messages) // 4
+
+    if (turn_count < settings.janitor_turn_threshold
+            and estimated_tokens < settings.janitor_token_threshold):
+        return messages
+
+    # Walk back to a clean turn boundary so we never split mid-exchange.
+    # After the loop the older slice ends on an "assistant" message (or midpoint is 0).
+    midpoint = len(messages) // 2
+    while midpoint > 0 and messages[midpoint - 1].get("role") != "assistant":
+        midpoint -= 1
+
+    if midpoint < 4:
+        # Too short to compact meaningfully — threshold likely set very low
+        return messages
+
+    old_messages = messages[:midpoint]
+    recent_messages = messages[midpoint:]
+
+    conversation_text = _format_messages(old_messages)
+    summary_prompt = _PROMPT_TMPL.format(conversation=conversation_text)
+
+    try:
+        from llm_client import complete as llm_complete
+        summary, backend = await llm_complete(
+            system_prompt=_SYSTEM,
+            messages=[{"role": "user", "content": summary_prompt}],
+            role="janitor",
+        )
+
+        checkpoint_msg = {
+            "role": "assistant",
+            "content": (
+                f"[Session checkpoint — {len(old_messages)} messages summarized "
+                f"via {backend}]\n\n{summary.strip()}"
+            ),
+        }
+
+        compacted = [checkpoint_msg] + recent_messages
+        save(session_id, compacted)
+
+        logger.info(
+            "Janitor: session=%s compacted %d→%d messages (turns=%d ~%d tokens) via %s",
+            session_id, len(messages), len(compacted), turn_count, estimated_tokens, backend,
+        )
+        return compacted
+
+    except Exception as exc:
+        # Fail-safe: never lose history because the janitor model is unavailable
+        logger.warning("Janitor skipped for session %s: %s", session_id, exc)
+        return messages
--- a/cortex/routers/orchestrator.py
+++ b/cortex/routers/orchestrator.py
@@ -257,6 +257,7 @@ async def _run_job(job_id: str, req: OrchestrateRequest, user: str) -> None:

    try:
        from session_store import load as load_session, save as save_session, generate_session_id
+        from janitor import maybe_checkpoint as janitor_checkpoint

        tier = req.tier or settings.default_tier
        role_cfg = model_registry.get_role_config(user, req.chat_role)
@@ -272,7 +273,8 @@ async def _run_job(job_id: str, req: OrchestrateRequest, user: str) -> None:
        )

        session_id = req.session_id or generate_session_id()
-        history = load_session(session_id)
+        # Compact old session turns before dispatching — no-op on new sessions or short ones.
+        history = await janitor_checkpoint(session_id) if req.session_id else load_session(session_id)
        session_messages = history or None

        orch_model = model_registry.get_model_for_role(user, "orchestrator")