feat: distill safeguards — rolling backups + sanity checks

Before any memory file is overwritten, _rotate_backup() rotates two rolling backups: MEMORY_*.bak1.md (most recent) and MEMORY_*.bak2.md (older). _sanity_check() now also guards against size anomalies: the new content must be between 40% and 250% of the old file size — anything outside that range is treated as truncation or runaway output, and the write is aborted. Existing checks (min length, refusal phrases) still apply. Backup files are exposed in the Files panel (added to the ALLOWED set) so they can be reviewed and manually restored if needed. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-05 18:54:27 -04:00
parent 0ffcd57c95
commit 508fb638ad
2 changed files with 110 additions and 8 deletions
--- a/cortex/memory_distiller.py
+++ b/cortex/memory_distiller.py
@@ -1,9 +1,17 @@
 """
-Inara tiered memory distillation.
+Tiered memory distillation.

  distill_short()  — roll recent session logs → MEMORY_SHORT.md  (no LLM)
  distill_mid()    — summarize MEMORY_SHORT   → MEMORY_MID.md    (LLM)
  distill_long()   — integrate MEMORY_MID     → MEMORY_LONG.md   (LLM)
+
+Before any file is overwritten, two rolling backups are kept:
+  MEMORY_*.bak1.md — most recent backup  (created just before last write)
+  MEMORY_*.bak2.md — backup before that
+
+LLM responses are sanity-checked before writing. If the response looks like
+a refusal, is too short, or differs implausibly in size from the existing
+file, the distill is aborted and the original file is left untouched.
 """
 import logging
 from datetime import datetime
@@ -16,6 +24,25 @@ logger = logging.getLogger(__name__)
 # Rough chars-per-token estimate for budget enforcement
 _CHARS_PER_TOKEN = 4

+# Lowercase prefixes of the response's first line that indicate the LLM refused or misunderstood the task
+_REFUSAL_PREFIXES = (
+    "i'm sorry",
+    "i am sorry",
+    "i can't",
+    "i cannot",
+    "i'm unable",
+    "i am unable",
+    "as an ai",
+    "as a language model",
+    "i don't have access",
+    "i do not have access",
+    "i'm not able",
+    "i am not able",
+)
+
+# Minimum characters for a valid mid/long distill response
+_MIN_RESPONSE_CHARS = 80
+

 def _budget_chars(tokens: int) -> int:
    return tokens * _CHARS_PER_TOKEN
@@ -25,7 +52,62 @@ def _read(path: Path) -> str:
    return path.read_text() if path.exists() else ""


-def distill_short(username: str | None = None, persona: str | None = None) -> dict:
+def _rotate_backup(path: Path, n: int = 2) -> None:
+    """Rotate up to n rolling backups of path before a write.
+
+    MEMORY_LONG.md → MEMORY_LONG.bak1.md (most recent), MEMORY_LONG.bak2.md (older)
+    """
+    if not path.exists():
+        return
+    # Shift older backups down, oldest first, so nothing is overwritten before it is copied: bak(n-1) → bak(n), …, bak1 → bak2
+    for i in range(n, 1, -1):
+        older = path.parent / f"{path.stem}.bak{i}.md"
+        newer = path.parent / f"{path.stem}.bak{i - 1}.md"
+        if newer.exists():
+            older.write_text(newer.read_text())
+    # Current file → bak1
+    bak1 = path.parent / f"{path.stem}.bak1.md"
+    bak1.write_text(path.read_text())
+
+
+def _sanity_check(response_text: str, context: str, existing_content: str = "") -> str | None:
+    """Return an error string if the LLM response looks invalid, else None.
+
+    Checks:
+    - Minimum absolute length
+    - Refusal / AI preamble phrases
+    - Size shrinkage: new content must be at least 40% of the old (catches truncation)
+    - Size explosion: new content must not exceed 250% of the old (catches runaway output)
+      (Both bounds apply only when the existing content is at least 4 × _MIN_RESPONSE_CHARS characters long.)
+    """
+    stripped = response_text.strip()
+    if len(stripped) < _MIN_RESPONSE_CHARS:
+        return f"{context}: response too short ({len(stripped)} chars) — not writing"
+
+    first_line = stripped.lower().splitlines()[0]
+    if any(first_line.startswith(p) for p in _REFUSAL_PREFIXES):
+        return f"{context}: response looks like a refusal — not writing"
+
+    if existing_content:
+        old_len = len(existing_content.strip())
+        new_len = len(stripped)
+        if old_len >= _MIN_RESPONSE_CHARS * 4:   # only compare when old file has real content
+            ratio = new_len / old_len
+            if ratio < 0.40:
+                return (
+                    f"{context}: new content is only {ratio:.0%} of the old "
+                    f"({new_len} vs {old_len} chars) — looks truncated, not writing"
+                )
+            if ratio > 2.50:
+                return (
+                    f"{context}: new content is {ratio:.0%} of the old "
+                    f"({new_len} vs {old_len} chars) — looks like runaway output, not writing"
+                )
+
+    return None
+
+
+def distill_short(username: str, persona: str) -> dict:
    """
    Roll the most recent session log files into MEMORY_SHORT.md.
    No LLM involved — pure aggregation with budget truncation.
@@ -64,8 +146,9 @@ def distill_short(username: str | None = None, persona: str | None = None) -> di
    )

    out_path = inara_dir / "MEMORY_SHORT.md"
+    _rotate_backup(out_path)
    out_path.write_text(header + body)
-    logger.info("distill_short: wrote %d chars from %d files", len(header) + len(body), len(parts))
+    logger.info("distill_short [%s/%s]: wrote %d chars from %d files", username, persona, len(header) + len(body), len(parts))

    return {
        "files_included": len(parts),
@@ -77,7 +160,7 @@ def distill_short(username: str | None = None, persona: str | None = None) -> di
 async def distill_mid(username: str, persona: str) -> dict:
    """
    Ask the LLM to summarize MEMORY_SHORT.md → MEMORY_MID.md.
-    Uses DISTILL_BACKEND_MID if set (e.g. "local"), otherwise primary_backend.
+    Backs up the current MEMORY_MID.md before overwriting.
    """
    from llm_client import complete
    from persona import set_context
@@ -87,6 +170,7 @@ async def distill_mid(username: str, persona: str) -> dict:

    inara_dir = _persona_path(u, p)
    short_content = _read(inara_dir / "MEMORY_SHORT.md")
+    existing_mid = _read(inara_dir / "MEMORY_MID.md")

    if not short_content.strip() or "Not yet populated" in short_content:
        return {"error": "MEMORY_SHORT.md is empty — run distill/short first"}
@@ -110,14 +194,20 @@ async def distill_mid(username: str, persona: str) -> dict:
        role="distill",
    )

+    err = _sanity_check(response_text, "distill_mid", existing_mid)
+    if err:
+        logger.warning(err)
+        return {"error": err}
+
    now = datetime.now().strftime("%Y-%m-%d %H:%M")
    header = (
        f"# MEMORY_MID.md — Mid-Term Memory Digest\n\n"
        f"*Auto-distilled: {now} via {backend}.*\n\n---\n\n"
    )
    out_path = inara_dir / "MEMORY_MID.md"
+    _rotate_backup(out_path)
    out_path.write_text(header + response_text)
-    logger.info("distill_mid: wrote %d chars via %s", len(header) + len(response_text), backend)
+    logger.info("distill_mid [%s/%s]: wrote %d chars via %s", u, p, len(header) + len(response_text), backend)

    return {
        "username": u,
@@ -130,7 +220,7 @@ async def distill_mid(username: str, persona: str) -> dict:
 async def distill_long(username: str, persona: str) -> dict:
    """
    Ask the LLM to integrate MEMORY_MID.md into MEMORY_LONG.md.
-    Uses DISTILL_BACKEND_LONG if set, otherwise primary_backend.
+    Backs up the current MEMORY_LONG.md before overwriting.
    """
    from llm_client import complete
    from persona import set_context
@@ -167,6 +257,11 @@ async def distill_long(username: str, persona: str) -> dict:
        role="distill",
    )

+    err = _sanity_check(response_text, "distill_long", long_content)
+    if err:
+        logger.warning(err)
+        return {"error": err}
+
    # Ensure the file has the right header if the LLM dropped it
    now = datetime.now().strftime("%Y-%m-%d %H:%M")
    if not response_text.lstrip().startswith("# MEMORY_LONG"):
@@ -177,8 +272,9 @@ async def distill_long(username: str, persona: str) -> dict:
        )

    out_path = inara_dir / "MEMORY_LONG.md"
+    _rotate_backup(out_path)
    out_path.write_text(response_text)
-    logger.info("distill_long: wrote %d chars via %s", len(response_text), backend)
+    logger.info("distill_long [%s/%s]: wrote %d chars via %s", u, p, len(response_text), backend)

    return {
        "username": u,
--- a/cortex/routers/files.py
+++ b/cortex/routers/files.py
@@ -16,10 +16,16 @@ ALLOWED = {
    "USER.md",
    "PROTOCOLS.md",
    "CONTEXT_TIERS.md",
-    "MEMORY.md",        # legacy — kept for reference
+    "MEMORY.md",          # legacy — kept for reference
    "MEMORY_LONG.md",
    "MEMORY_MID.md",
    "MEMORY_SHORT.md",
+    "MEMORY_LONG.bak1.md",
+    "MEMORY_LONG.bak2.md",
+    "MEMORY_MID.bak1.md",
+    "MEMORY_MID.bak2.md",
+    "MEMORY_SHORT.bak1.md",
+    "MEMORY_SHORT.bak2.md",
    "HELP.md",
 }