diff --git a/documentation/TODO__Agents.md b/documentation/TODO__Agents.md index 15441cdb..cf667a33 100644 --- a/documentation/TODO__Agents.md +++ b/documentation/TODO__Agents.md @@ -157,7 +157,7 @@ below. The TTL + `verify_in_flight` guards are the current mitigation. --- ### [API] GET/POST retry hardening — differentiate timeout aborts vs intentional aborts -**Status:** 🚧 Planned follow-up (2026-05-21) +**Status:** ✅ Completed (2026-05-21) Recent API helper fixes restored retry/backoff for transient network `TypeError` failures. Current remaining gap: timeout-triggered aborts are treated the same as intentional/user @@ -165,13 +165,15 @@ aborts, so retries are skipped in both `api_get_object.ts` and `api_post_object. **Decision (for now):** Keep the global default timeout at **20s**. -**What needs to be implemented:** -- Separate abort reasons in GET/POST helpers: +**Implemented:** +- GET/POST now explicitly distinguish abort class in helper code: - **Intentional abort** (navigation/unmount/caller cancel): fail fast, no retry - - **Timeout abort** (helper's own timer): eligible for retry/backoff (same class as transient network) -- Add explicit timeout classification in code (not just `AbortError` name check), so the retry - loop can make a deterministic decision. -- Keep existing capped backoff behavior (`2s -> 4s -> 6s -> 8s`) for retryable timeout/network failures. + - **Timeout abort** (helper timer): retryable via existing retry loop +- Timeout classification added with per-attempt timeout flag (not `AbortError` name-only logic). +- Backoff behavior retained for retryable failures (`2s -> 4s -> 6s -> 8s`, cap 8s). +- Validation done: + - `npx svelte-check` clean + - API Playwright tests updated/fixed and passing (`v3_api_security.modern`, `v3_api_nested_crud`) **Timeout policy improvement (class-based):** - Keep **20s default** as baseline. @@ -195,6 +197,51 @@ aborts, so retries are skipped in both `api_get_object.ts` and `api_post_object. 
--- +### [API] PATCH/DELETE retry hardening — parity with GET/POST +**Status:** 🚧 In progress (PATCH first, then DELETE) + +`api_patch_object.ts` and `api_delete_object.ts` already have retry loops, +but do not yet have GET/POST parity for abort classification and backoff policy. + +**Plan (sequenced):** +- **Step 1 (now): PATCH parity** + - Add timeout-vs-intentional abort separation. + - Retry only the transient timeout/network failure class. + - Keep fail-fast behavior for 400/401/403/422. + - Add capped backoff (`2s -> 4s -> 6s -> 8s`). +- **Step 2 (after PATCH validation): DELETE parity** + - Apply the same classification and backoff strategy. + - Preserve existing delete semantics for client/auth failures. + +**Mutation safety note:** +- A PATCH/DELETE that times out may or may not have been committed server-side. The current policy is conservative: + retries target only the obvious transient failure class (timeout/network), while caller aborts remain + fail-fast to avoid duplicate side effects during navigation/unmount flows. + +**Acceptance criteria:** +- PATCH and DELETE timeout-aborts retry under capped backoff. +- Caller/navigation aborts do not retry. +- No regression for 400/401/403/422 fail-fast behavior. +- `npx svelte-check` is clean and API-focused Playwright tests remain green. + +--- + +### [Testing] V3 API performance probe (basic stress rounds) +**Status:** ✅ Completed baseline harness (2026-05-21) + +Implemented a gated Playwright probe for quick repeated list-query timing against live V3 endpoints. + +**Files:** +- `tests/v3_api_latency_probe.test.ts` +- `tests/README.md` (run/tuning docs) + +**Current capabilities:** +- Measures rounds for event sessions, journal entries, and user lists. +- Writes per-run JSON + Markdown reports to `tests/results/`. +- Supports optional anomaly thresholds for error-rate / p95 / empty-row detection. 
+ +--- + ### [Launcher/VLC] Linux playback — fullscreen + pause-on-end not working **Status:** Mac ✅ working perfectly; Linux 🚧 deferred for later investigation **Date discovered:** 2026-05-20 diff --git a/src/lib/ae_api/api_patch_object.ts b/src/lib/ae_api/api_patch_object.ts index e2a62a42..d4c5f5ee 100644 --- a/src/lib/ae_api/api_patch_object.ts +++ b/src/lib/ae_api/api_patch_object.ts @@ -153,9 +153,15 @@ export const patch_object = async function patch_object({ } for (let attempt = 1; attempt <= retry_count; attempt++) { + // Keep timeout handle at attempt scope so catch can always clear it. + let timeoutId: ReturnType<typeof setTimeout> | null = null; try { const controller = new AbortController(); - const timeoutId = setTimeout(() => { + // AbortError alone is ambiguous. Track whether the helper timeout + // fired so we can retry timeout-aborts but fail fast on caller abort. + let did_timeout_abort = false; + timeoutId = setTimeout(() => { + did_timeout_abort = true; console.error( `API PATCH request timed out after ${timeout}ms.` ); @@ -173,12 +179,52 @@ export const patch_object = async function patch_object({ url.toString(), fetchOptions ).catch(function (error: any) { + // Keep noisy abort/network conditions out of high-level logs. + if ( + error?.name === 'AbortError' || + error?.name === 'TypeError' || + error?.message?.includes('aborted') + ) { + if (log_lvl > 1) { + console.log( + 'API PATCH: Request aborted or browser-terminated.', + error + ); + } + return error; + } + console.log( 'API PATCH Object *fetch* request was aborted or failed in an unexpected way.', error ); + return error; }); - clearTimeout(timeoutId); + if (timeoutId) clearTimeout(timeoutId); + + // Error object was returned from fetch catch block; decide retry class. + if ( + response instanceof Error || + (response && + (response.name === 'AbortError' || + response.name === 'TypeError')) + ) { + if (response.name === 'AbortError') { + // Retry only helper-timeout aborts. 
Caller/navigation aborts + // should fail fast to avoid duplicate mutation side-effects. + if (did_timeout_abort) { + throw new Error( + `Timeout abort (attempt ${attempt}/${retry_count}) after ${timeout}ms` + ); + } + return false; + } + + // Transient browser/network failure class. + throw new Error( + `Network error (attempt ${attempt}): ${response.message}` + ); + } if (!response) { throw new Error( @@ -292,6 +338,8 @@ export const patch_object = async function patch_object({ ? json.data : json; } catch (error) { + // Ensure per-attempt timeout is always cleared on failure. + if (timeoutId) clearTimeout(timeoutId); console.error(`API PATCH error on attempt ${attempt}:`, error); if (attempt === retry_count) { @@ -299,9 +347,12 @@ export const patch_object = async function patch_object({ return false; } - if (log_lvl) { - console.log(`Retrying... (${attempt}/${retry_count})`); - } + // Backoff before retrying. Caps at 8s to match GET/POST policy. + const delay_ms = Math.min(2000 * attempt, 8000); + console.log( + `API PATCH: Retrying in ${delay_ms}ms... (attempt ${attempt}/${retry_count})` + ); + await new Promise((resolve) => setTimeout(resolve, delay_ms)); } } };