api: harden patch retry classification and backoff
This commit is contained in:
@@ -157,7 +157,7 @@ below. The TTL + `verify_in_flight` guards are the current mitigation.
|
||||
---
|
||||
|
||||
### [API] GET/POST retry hardening — differentiate timeout aborts vs intentional aborts
|
||||
**Status:** 🚧 Planned follow-up (2026-05-21)
|
||||
**Status:** ✅ Completed (2026-05-21)
|
||||
|
||||
Recent API helper fixes restored retry/backoff for transient network `TypeError` failures.
|
||||
Current remaining gap: timeout-triggered aborts are treated the same as intentional/user
|
||||
@@ -165,13 +165,15 @@ aborts, so retries are skipped in both `api_get_object.ts` and `api_post_object.
|
||||
|
||||
**Decision (for now):** Keep the global default timeout at **20s**.
|
||||
|
||||
**What needs to be implemented:**
|
||||
- Separate abort reasons in GET/POST helpers:
|
||||
**Implemented:**
|
||||
- GET/POST now explicitly distinguish abort class in helper code:
|
||||
- **Intentional abort** (navigation/unmount/caller cancel): fail fast, no retry
|
||||
- **Timeout abort** (helper's own timer): eligible for retry/backoff (same class as transient network)
|
||||
- Add explicit timeout classification in code (not just `AbortError` name check), so the retry
|
||||
loop can make a deterministic decision.
|
||||
- Keep existing capped backoff behavior (`2s -> 4s -> 6s -> 8s`) for retryable timeout/network failures.
|
||||
- **Timeout abort** (helper timer): retryable via existing retry loop
|
||||
- Timeout classification added with per-attempt timeout flag (not `AbortError` name-only logic).
|
||||
- Backoff behavior retained for retryable failures (`2s -> 4s -> 6s -> 8s`, cap 8s).
|
||||
- Validation done:
|
||||
- `npx svelte-check` clean
|
||||
- API Playwright tests updated/fixed and passing (`v3_api_security.modern`, `v3_api_nested_crud`)
|
||||
|
||||
**Timeout policy improvement (class-based):**
|
||||
- Keep **20s default** as baseline.
|
||||
@@ -195,6 +197,51 @@ aborts, so retries are skipped in both `api_get_object.ts` and `api_post_object.
|
||||
|
||||
---
|
||||
|
||||
### [API] PATCH/DELETE retry hardening — parity with GET/POST
|
||||
**Status:** 🚧 In progress (PATCH first, then DELETE)
|
||||
|
||||
Both `api_patch_object.ts` and `api_delete_object.ts` already have retry loops,
|
||||
but they do not yet have GET/POST parity for abort classification and backoff policy.
|
||||
|
||||
**Plan (sequenced):**
|
||||
- **Step 1 (now): PATCH parity**
|
||||
- Add timeout-vs-intentional abort separation.
|
||||
- Retry only the transient timeout/network failure class.
|
||||
- Keep fail-fast behavior for 400/401/403/422.
|
||||
- Add capped backoff (`2s -> 4s -> 6s -> 8s`).
|
||||
- **Step 2 (after PATCH validation): DELETE parity**
|
||||
- Apply same classification and backoff strategy.
|
||||
- Preserve existing delete semantics for client/auth failures.
|
||||
|
||||
**Mutation safety note:**
|
||||
- PATCH/DELETE can have an ambiguous commit state on timeout (the server may have applied the change before the client aborted). Current policy is conservative:
|
||||
retries target only the obvious transient failure class (timeout/network), while caller aborts remain
|
||||
fail-fast to avoid duplicate side effects during navigation/unmount flows.
|
||||
|
||||
**Acceptance criteria:**
|
||||
- PATCH and DELETE timeout aborts are retried with capped backoff.
|
||||
- Caller/navigation aborts do not retry.
|
||||
- No regression for 400/401/403/422 fail-fast behavior.
|
||||
- `npx svelte-check` clean, API-focused Playwright tests remain green.
|
||||
|
||||
---
|
||||
|
||||
### [Testing] V3 API performance probe (basic stress rounds)
|
||||
**Status:** ✅ Completed baseline harness (2026-05-21)
|
||||
|
||||
Implemented a gated Playwright probe for quick repeated list-query timing against live V3 endpoints.
|
||||
|
||||
**Files:**
|
||||
- `tests/v3_api_latency_probe.test.ts`
|
||||
- `tests/README.md` (run/tuning docs)
|
||||
|
||||
**Current capabilities:**
|
||||
- Measures rounds for event sessions, journal entries, and user lists.
|
||||
- Writes per-run JSON + Markdown reports to `tests/results/`.
|
||||
- Optional anomaly thresholds for error-rate / p95 / empty-row detection.
|
||||
|
||||
---
|
||||
|
||||
### [Launcher/VLC] Linux playback — fullscreen + pause-on-end not working
|
||||
**Status:** Mac ✅ working perfectly; Linux 🚧 deferred for later investigation
|
||||
**Date discovered:** 2026-05-20
|
||||
|
||||
@@ -153,9 +153,15 @@ export const patch_object = async function patch_object({
|
||||
}
|
||||
|
||||
for (let attempt = 1; attempt <= retry_count; attempt++) {
|
||||
// Keep timeout handle at attempt scope so catch can always clear it.
|
||||
let timeoutId: ReturnType<typeof setTimeout> | null = null;
|
||||
try {
|
||||
const controller = new AbortController();
|
||||
const timeoutId = setTimeout(() => {
|
||||
// AbortError alone is ambiguous. Track whether the helper timeout
|
||||
// fired so we can retry timeout-aborts but fail fast on caller abort.
|
||||
let did_timeout_abort = false;
|
||||
timeoutId = setTimeout(() => {
|
||||
did_timeout_abort = true;
|
||||
console.error(
|
||||
`API PATCH request timed out after ${timeout}ms.`
|
||||
);
|
||||
@@ -173,12 +179,52 @@ export const patch_object = async function patch_object({
|
||||
url.toString(),
|
||||
fetchOptions
|
||||
).catch(function (error: any) {
|
||||
// Keep noisy abort/network conditions out of high-level logs.
|
||||
if (
|
||||
error?.name === 'AbortError' ||
|
||||
error?.name === 'TypeError' ||
|
||||
error?.message?.includes('aborted')
|
||||
) {
|
||||
if (log_lvl > 1) {
|
||||
console.log(
|
||||
'API PATCH: Request aborted or browser-terminated.',
|
||||
error
|
||||
);
|
||||
}
|
||||
return error;
|
||||
}
|
||||
|
||||
console.log(
|
||||
'API PATCH Object *fetch* request was aborted or failed in an unexpected way.',
|
||||
error
|
||||
);
|
||||
return error;
|
||||
});
|
||||
clearTimeout(timeoutId);
|
||||
if (timeoutId) clearTimeout(timeoutId);
|
||||
|
||||
// Error object was returned from fetch catch block; decide retry class.
|
||||
if (
|
||||
response instanceof Error ||
|
||||
(response &&
|
||||
(response.name === 'AbortError' ||
|
||||
response.name === 'TypeError'))
|
||||
) {
|
||||
if (response.name === 'AbortError') {
|
||||
// Retry only helper-timeout aborts. Caller/navigation aborts
|
||||
// should fail fast to avoid duplicate mutation side-effects.
|
||||
if (did_timeout_abort) {
|
||||
throw new Error(
|
||||
`Timeout abort (attempt ${attempt}/${retry_count}) after ${timeout}ms`
|
||||
);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Transient browser/network failure class.
|
||||
throw new Error(
|
||||
`Network error (attempt ${attempt}): ${response.message}`
|
||||
);
|
||||
}
|
||||
|
||||
if (!response) {
|
||||
throw new Error(
|
||||
@@ -292,6 +338,8 @@ export const patch_object = async function patch_object({
|
||||
? json.data
|
||||
: json;
|
||||
} catch (error) {
|
||||
// Ensure per-attempt timeout is always cleared on failure.
|
||||
if (timeoutId) clearTimeout(timeoutId);
|
||||
console.error(`API PATCH error on attempt ${attempt}:`, error);
|
||||
|
||||
if (attempt === retry_count) {
|
||||
@@ -299,9 +347,12 @@ export const patch_object = async function patch_object({
|
||||
return false;
|
||||
}
|
||||
|
||||
if (log_lvl) {
|
||||
console.log(`Retrying... (${attempt}/${retry_count})`);
|
||||
}
|
||||
// Backoff before retrying. Caps at 8s to match GET/POST policy.
|
||||
const delay_ms = Math.min(2000 * attempt, 8000);
|
||||
console.log(
|
||||
`API PATCH: Retrying in ${delay_ms}ms... (attempt ${attempt}/${retry_count})`
|
||||
);
|
||||
await new Promise<void>((resolve) => setTimeout(resolve, delay_ms));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user