# Cortex-Inara/cortex/agent_manager.py

"""
Agent lifecycle manager — registry for background spawn_agent and aider_run tasks.

Tracks running and recently completed agents in-process. On completion, fires
notification.notify() if notify=True (same channel used by reminders and cron jobs).

Records are kept for 24 hours after completion, then pruned on next registration.
"""

import asyncio
import logging
import uuid
from dataclasses import dataclass, field
from datetime import datetime, timedelta

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Age (measured from a record's `finished` time) after which a non-running
# record is dropped by _prune_locked().
_PRUNE_AFTER = timedelta(hours=24)
# Maximum number of result characters finish() keeps on a record.
_RESULT_PREVIEW_CHARS = 500
# Maximum number of task-description characters register() keeps on a record.
_TASK_PREVIEW_CHARS = 200


@dataclass
class AgentRecord:
    """In-memory record describing one background agent tracked by this module.

    register() creates a record with status "running"; finish() and
    cancel_agent() later set the final status and the `finished` timestamp.
    """

    agent_id: str           # unique ID; register() generates it as a UUID4 string
    level: int              # 1 = persona, 2 = specialized sub-agent, 3 = support agent
    role: str               # e.g. "coder", "research", "chat"
    task: str               # first _TASK_PREVIEW_CHARS of the task
    status: str             # running / done / failed / cancelled / timeout
    started: datetime       # creation time (naive local time from datetime.now())
    user: str               # owner; cancel_agent() and list_agents() match on it
    parent_id: str | None = None          # agent_id of the spawner (lineage tracking)
    finished: datetime | None = None      # set by finish() / cancel_agent(); None while running
    result: str | None = None             # first _RESULT_PREVIEW_CHARS on completion
    notify: bool = False                  # push notification on completion
    # asyncio.Task executing the agent, attached via set_task_ref() so that
    # cancel_agent() can cancel it. Left out of the generated repr.
    _task_ref: "asyncio.Task | None" = field(default=None, repr=False)


# Module-level registry — in-process only, not persisted across restarts.
# Maps agent_id -> AgentRecord.
_agents: dict[str, AgentRecord] = {}
# Held by register(), finish() and cancel_agent() while they modify the registry.
# set_task_ref(), get() and list_agents() access _agents without taking it.
_lock = asyncio.Lock()


async def register(
    user: str,
    role: str,
    task: str,
    level: int = 2,
    parent_id: str | None = None,
    notify: bool = False,
) -> AgentRecord:
    """Add a new agent in the "running" state to the registry and return its record.

    Args:
        user: Owner of the agent; stored on the record.
        role: Role label, e.g. "coder" or "research".
        task: Task description; only its first _TASK_PREVIEW_CHARS characters are stored.
        level: Agent tier stored on the record (defaults to 2).
        parent_id: agent_id of the spawning agent, if there is one.
        notify: Stored on the record; finish() reads it to decide whether to notify.

    Returns:
        The newly created AgentRecord, whose agent_id is a fresh UUID4 string.
    """
    new_id = str(uuid.uuid4())
    record = AgentRecord(
        agent_id=new_id,
        user=user,
        role=role,
        level=level,
        parent_id=parent_id,
        notify=notify,
        task=task[:_TASK_PREVIEW_CHARS],
        status="running",
        started=datetime.now(),
    )

    async with _lock:
        # Every registration is also the moment expired records get swept out.
        _prune_locked()
        _agents[new_id] = record

    logger.info(
        "agent_manager: registered %s role=%s level=%d user=%s task=%.60s",
        new_id[:8], role, level, user, task,
    )
    return record


def set_task_ref(agent_id: str, task_ref: "asyncio.Task") -> None:
    """Attach the asyncio.Task running an agent to its record for later cancellation.

    Meant to be called straight after asyncio.create_task(), before the event
    loop gets a chance to yield. An agent_id that is not in the registry is
    silently ignored.
    """
    record = _agents.get(agent_id)
    if not record:
        return
    record._task_ref = task_ref


async def finish(agent_id: str, result: str, status: str = "done") -> None:
    """Mark an agent complete, store the result, and notify the user if requested.

    A record that cancel_agent() has already marked "cancelled" stays cancelled.
    The agent's task can still reach this function with a different status
    (it may complete or fail before the cancellation request is delivered);
    such a late call must neither flip the record back to done/failed nor
    push a notification for an agent the user cancelled.

    Args:
        agent_id: ID returned by register(). Unknown IDs are ignored.
        result: Agent output. Only the first _RESULT_PREVIEW_CHARS characters
            are kept; None or "" is stored as "".
        status: Final status to record (default "done").

    Notification failures (including a failed import of the notification
    module) are logged as warnings and never raised to the caller.
    """
    async with _lock:
        rec = _agents.get(agent_id)
        if rec is None:
            return
        if rec.status == "cancelled":
            # cancel_agent() got here first — keep the user's cancellation
            # authoritative regardless of what the task reports afterwards.
            status = "cancelled"
        rec.status = status
        rec.finished = datetime.now()
        rec.result = (result or "")[:_RESULT_PREVIEW_CHARS]

    logger.info("agent_manager: finished %s status=%s", agent_id[:8], status)

    # `status` is the effective status here, so a cancelled record never notifies.
    if not rec.notify or status == "cancelled":
        return

    try:
        # Imported lazily and inside the try so that a missing or broken
        # notification module degrades to a logged warning.
        from notification import notify as _notify
        elapsed = int((rec.finished - rec.started).total_seconds())
        emoji = "✅" if status == "done" else "⚠️"
        preview = (rec.result or "(no output)")[:200]
        msg = f"{emoji} Agent done [{rec.role}, {elapsed}s]: {preview}"
        await _notify(rec.user, msg)
    except Exception as e:
        # Best-effort: completion bookkeeping above has already succeeded.
        logger.warning("agent_manager: notification failed for %s: %s", agent_id[:8], e)


async def cancel_agent(agent_id: str, user: str) -> str:
    """Cancel a running background agent on behalf of ``user``.

    The record is marked "cancelled" under the registry lock; the stored
    asyncio.Task (if any, and if not already done) is then cancelled after
    the lock has been released.

    Returns:
        A human-readable message: not found, access denied (the record
        belongs to a different user), already in a non-running state, or
        cancelled.
    """
    short_id = agent_id[:8]

    async with _lock:
        record = _agents.get(agent_id)
        if not record:
            return f"No agent found: {agent_id}"
        if record.user != user:
            return "Access denied."
        if record.status != "running":
            return f"Agent {short_id}… is already {record.status}."
        # Flip the record first, then remember the task handle for use
        # once the lock is no longer held.
        record.status = "cancelled"
        record.finished = datetime.now()
        handle = record._task_ref

    needs_cancel = bool(handle) and not handle.done()
    if needs_cancel:
        handle.cancel()

    logger.info("agent_manager: cancelled %s by user=%s", short_id, user)
    return f"Agent {short_id}… cancelled."


def get(agent_id: str) -> AgentRecord | None:
    """Return the registry entry for ``agent_id``, or None if it is not registered."""
    try:
        return _agents[agent_id]
    except KeyError:
        return None


def list_agents(user: str, status: str | None = None, limit: int = 10) -> list[AgentRecord]:
    """Return up to ``limit`` of a user's agents, most recently started first.

    Args:
        user: Only records whose ``user`` equals this value are returned.
        status: When truthy, additionally require an exact status match;
            None or "" disables the status filter.
        limit: Upper bound applied by slicing the sorted list.

    The registry lock is not taken. This function contains no ``await``, so
    no other coroutine on the same event loop can modify the registry while
    it runs.
    NOTE(review): nothing here guards against calls from a different thread —
    confirm callers stay on the event-loop thread.
    """
    newest_first = sorted(
        (
            rec
            for rec in _agents.values()
            if rec.user == user and (not status or rec.status == status)
        ),
        key=lambda rec: rec.started,
        reverse=True,
    )
    return newest_first[:limit]


def _prune_locked() -> None:
    """Drop finished records whose ``finished`` time is more than _PRUNE_AFTER ago.

    Records still "running", or without a ``finished`` timestamp, are never
    removed. The caller must already hold _lock.
    """
    threshold = datetime.now() - _PRUNE_AFTER

    # Collect first, delete afterwards: the dict must not change size while
    # it is being iterated.
    expired = []
    for agent_id, rec in _agents.items():
        if rec.status == "running":
            continue
        if rec.finished and rec.finished < threshold:
            expired.append(agent_id)

    if not expired:
        return

    for agent_id in expired:
        del _agents[agent_id]
    logger.debug("agent_manager: pruned %d stale records", len(expired))