feat: distill safeguards — rolling backups + sanity checks

Before any memory file is overwritten, _rotate_backup() keeps 2 rolling
backups: MEMORY_*.bak1.md (most recent) and MEMORY_*.bak2.md (older).

_sanity_check() now also guards against size anomalies: the new content
must be between 40% and 250% of the old file size — anything outside that
range looks like truncation or runaway output and aborts the write.
Existing checks (min length, refusal phrases) still apply.

Backup files are exposed in the Files panel (added to the ALLOWED set) so
they can be reviewed and manually restored if needed.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Scott Idem
2026-05-05 18:54:27 -04:00
parent 0ffcd57c95
commit 508fb638ad
2 changed files with 110 additions and 8 deletions

View File

@@ -1,9 +1,17 @@
"""
Inara tiered memory distillation.
Tiered memory distillation.
distill_short() — roll recent session logs → MEMORY_SHORT.md (no LLM)
distill_mid() — summarize MEMORY_SHORT → MEMORY_MID.md (LLM)
distill_long() — integrate MEMORY_MID → MEMORY_LONG.md (LLM)
Before any file is overwritten, two rolling backups are kept:
MEMORY_*.bak1.md — most recent backup (created just before last write)
MEMORY_*.bak2.md — backup before that
LLM responses are sanity-checked before writing. If the response looks like
a refusal, is too short, or is implausibly sized relative to the existing
file (under 40% or over 250% of its length), the distill is aborted and the
original file is left untouched.
"""
import logging
from datetime import datetime
@@ -16,6 +24,25 @@ logger = logging.getLogger(__name__)
# Rough chars-per-token estimate for budget enforcement.
# _budget_chars() multiplies a token budget by this to get a character budget.
_CHARS_PER_TOKEN = 4
# Phrases that indicate the LLM refused or misunderstood the task.
# _sanity_check() tests them as prefixes of the lowercased first line of the
# stripped response, so every entry must be written in lowercase.
_REFUSAL_PREFIXES = (
"i'm sorry",
"i am sorry",
"i can't",
"i cannot",
"i'm unable",
"i am unable",
"as an ai",
"as a language model",
"i don't have access",
"i do not have access",
"i'm not able",
"i am not able",
)
# Minimum characters (after stripping whitespace) for a valid mid/long distill
# response. _sanity_check() also uses 4x this value as the smallest existing
# file size for which the old-vs-new size comparison is performed.
_MIN_RESPONSE_CHARS = 80
def _budget_chars(tokens: int) -> int:
    """Convert a token budget into an approximate character budget."""
    chars = _CHARS_PER_TOKEN * tokens
    return chars
@@ -25,7 +52,62 @@ def _read(path: Path) -> str:
return path.read_text() if path.exists() else ""
def distill_short(username: str | None = None, persona: str | None = None) -> dict:
def _rotate_backup(path: Path, n: int = 2) -> None:
    """Keep up to ``n`` rolling backups of ``path`` before it is overwritten.

    Backups are written next to ``path`` and named ``<stem>.bak<i>.md``, where
    ``bak1`` is always the most recent copy:

        MEMORY_LONG.md → MEMORY_LONG.bak1.md (most recent), MEMORY_LONG.bak2.md (older)

    Args:
        path: The file that is about to be overwritten.
        n: How many backup generations to keep. Values below 1 disable
            backups entirely.

    Does nothing when ``path`` does not exist yet (there is nothing to
    protect). Files are copied as raw bytes so that each backup is an exact
    replica of what was on disk, independent of the platform's default text
    encoding and newline translation.
    """
    if n < 1 or not path.exists():
        return

    def _slot(i: int) -> Path:
        # Location of the i-th backup generation (1 = most recent).
        return path.parent / f"{path.stem}.bak{i}.md"

    # Shift existing backups one generation older, starting with the oldest
    # slot so no backup is overwritten before it has been copied onward:
    # bak(n-1) → bak(n), …, bak1 → bak2.
    for i in range(n, 1, -1):
        newer = _slot(i - 1)
        if newer.exists():
            _slot(i).write_bytes(newer.read_bytes())

    # Snapshot the current file as the most recent backup.
    _slot(1).write_bytes(path.read_bytes())
def _sanity_check(
    response_text: str,
    context: str,
    existing_content: str = "",
    *,
    min_ratio: float = 0.40,
    max_ratio: float = 2.50,
) -> str | None:
    """Return an error string if the LLM response looks invalid, else None.

    Checks, in order:
    - Minimum absolute length (``_MIN_RESPONSE_CHARS``, after stripping).
    - Refusal / AI preamble phrases at the start of the first line
      (``_REFUSAL_PREFIXES``, case-insensitive; typographic apostrophes are
      treated like straight ones so "I’m sorry" is caught too).
    - Size shrinkage: new content must be at least ``min_ratio`` of the old
      (catches truncation).
    - Size explosion: new content must not exceed ``max_ratio`` of the old
      (catches runaway output).
      (Both bounds only apply when an existing file is present and reasonably
      sized, i.e. at least 4x ``_MIN_RESPONSE_CHARS`` after stripping.)

    Args:
        response_text: Raw text returned by the LLM.
        context: Label prefixed to the error message (e.g. "distill_mid").
        existing_content: Current content of the file about to be replaced;
            empty when there is no previous version.
        min_ratio: Smallest accepted new/old length ratio.
        max_ratio: Largest accepted new/old length ratio.

    Returns:
        A human-readable reason for rejecting the response, or None when the
        response passes every check.
    """
    stripped = response_text.strip()
    new_len = len(stripped)
    if new_len < _MIN_RESPONSE_CHARS:
        return f"{context}: response too short ({new_len} chars) — not writing"

    # Guarded lookup: `stripped` can only be empty here if the minimum length
    # is configured as 0, but indexing [0] would then raise IndexError.
    lines = stripped.lower().splitlines()
    first_line = lines[0] if lines else ""
    # LLMs frequently emit curly apostrophes; the prefixes use straight ones.
    first_line = first_line.replace("\u2019", "'").replace("\u2018", "'")
    if first_line.startswith(tuple(_REFUSAL_PREFIXES)):
        return f"{context}: response looks like a refusal — not writing"

    old_len = len(existing_content.strip())
    # Only compare sizes when the old file has real content; a tiny or empty
    # previous file makes the ratio meaningless (and must not divide by zero).
    if old_len > 0 and old_len >= _MIN_RESPONSE_CHARS * 4:
        ratio = new_len / old_len
        if ratio < min_ratio:
            return (
                f"{context}: new content is only {ratio:.0%} of the old "
                f"({new_len} vs {old_len} chars) — looks truncated, not writing"
            )
        if ratio > max_ratio:
            return (
                f"{context}: new content is {ratio:.0%} of the old "
                f"({new_len} vs {old_len} chars) — looks like runaway output, not writing"
            )
    return None
def distill_short(username: str, persona: str) -> dict:
"""
Roll the most recent session log files into MEMORY_SHORT.md.
No LLM involved — pure aggregation with budget truncation.
@@ -64,8 +146,9 @@ def distill_short(username: str | None = None, persona: str | None = None) -> di
)
out_path = inara_dir / "MEMORY_SHORT.md"
_rotate_backup(out_path)
out_path.write_text(header + body)
logger.info("distill_short: wrote %d chars from %d files", len(header) + len(body), len(parts))
logger.info("distill_short [%s/%s]: wrote %d chars from %d files", username, persona, len(header) + len(body), len(parts))
return {
"files_included": len(parts),
@@ -77,7 +160,7 @@ def distill_short(username: str | None = None, persona: str | None = None) -> di
async def distill_mid(username: str, persona: str) -> dict:
"""
Ask the LLM to summarize MEMORY_SHORT.md → MEMORY_MID.md.
Uses DISTILL_BACKEND_MID if set (e.g. "local"), otherwise primary_backend.
Backs up the current MEMORY_MID.md before overwriting.
"""
from llm_client import complete
from persona import set_context
@@ -87,6 +170,7 @@ async def distill_mid(username: str, persona: str) -> dict:
inara_dir = _persona_path(u, p)
short_content = _read(inara_dir / "MEMORY_SHORT.md")
existing_mid = _read(inara_dir / "MEMORY_MID.md")
if not short_content.strip() or "Not yet populated" in short_content:
return {"error": "MEMORY_SHORT.md is empty — run distill/short first"}
@@ -110,14 +194,20 @@ async def distill_mid(username: str, persona: str) -> dict:
role="distill",
)
err = _sanity_check(response_text, "distill_mid", existing_mid)
if err:
logger.warning(err)
return {"error": err}
now = datetime.now().strftime("%Y-%m-%d %H:%M")
header = (
f"# MEMORY_MID.md — Mid-Term Memory Digest\n\n"
f"*Auto-distilled: {now} via {backend}.*\n\n---\n\n"
)
out_path = inara_dir / "MEMORY_MID.md"
_rotate_backup(out_path)
out_path.write_text(header + response_text)
logger.info("distill_mid: wrote %d chars via %s", len(header) + len(response_text), backend)
logger.info("distill_mid [%s/%s]: wrote %d chars via %s", u, p, len(header) + len(response_text), backend)
return {
"username": u,
@@ -130,7 +220,7 @@ async def distill_mid(username: str, persona: str) -> dict:
async def distill_long(username: str, persona: str) -> dict:
"""
Ask the LLM to integrate MEMORY_MID.md into MEMORY_LONG.md.
Uses DISTILL_BACKEND_LONG if set, otherwise primary_backend.
Backs up the current MEMORY_LONG.md before overwriting.
"""
from llm_client import complete
from persona import set_context
@@ -167,6 +257,11 @@ async def distill_long(username: str, persona: str) -> dict:
role="distill",
)
err = _sanity_check(response_text, "distill_long", long_content)
if err:
logger.warning(err)
return {"error": err}
# Ensure the file has the right header if the LLM dropped it
now = datetime.now().strftime("%Y-%m-%d %H:%M")
if not response_text.lstrip().startswith("# MEMORY_LONG"):
@@ -177,8 +272,9 @@ async def distill_long(username: str, persona: str) -> dict:
)
out_path = inara_dir / "MEMORY_LONG.md"
_rotate_backup(out_path)
out_path.write_text(response_text)
logger.info("distill_long: wrote %d chars via %s", len(response_text), backend)
logger.info("distill_long [%s/%s]: wrote %d chars via %s", u, p, len(response_text), backend)
return {
"username": u,

View File

@@ -16,10 +16,16 @@ ALLOWED = {
"USER.md",
"PROTOCOLS.md",
"CONTEXT_TIERS.md",
"MEMORY.md", # legacy — kept for reference
"MEMORY.md", # legacy — kept for reference
"MEMORY_LONG.md",
"MEMORY_MID.md",
"MEMORY_SHORT.md",
"MEMORY_LONG.bak1.md",
"MEMORY_LONG.bak2.md",
"MEMORY_MID.bak1.md",
"MEMORY_MID.bak2.md",
"MEMORY_SHORT.bak1.md",
"MEMORY_SHORT.bak2.md",
"HELP.md",
}