From a56213569a5a6121ae079051c3de6708dcc7ffe6 Mon Sep 17 00:00:00 2001
From: Scott Idem <stidem@gmail.com>
Date: Fri, 17 Apr 2026 18:55:28 -0400
Subject: [PATCH] docs: expand .env.default comments for API and DB tuning
 settings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Updated AE_API_GUNICORN_WORKERS default from 2 → 4 based on stress
testing (nearly 2x throughput improvement confirmed). Added detailed
comments to Gunicorn, DB pool, and connection tuning settings explaining
what each parameter does, how they interact, and capacity planning math.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .env.default | 55 ++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 45 insertions(+), 10 deletions(-)

diff --git a/.env.default b/.env.default
index 2ea4d63..1ff0207 100644
--- a/.env.default
+++ b/.env.default
@@ -79,11 +79,25 @@ AE_DB_USERNAME=aether_dev
 AE_DB_PASSWORD=XXXX
 
 # Connection Tuning
+# Seconds to wait when establishing a new connection before giving up.
+# Lower values fail fast on DB outage rather than hanging requests.
 AE_DB_CONNECTION_TIMEOUT=7
+
+# Seconds before a pooled connection is recycled (closed and reopened).
+# Prevents "MySQL server has gone away" errors from MariaDB's wait_timeout.
+# Must be less than MariaDB's wait_timeout (default 28800s / 8 hours).
+# 900s (15 min) is a safe conservative value for active workloads.
 AE_DB_POOL_RECYCLE=900
-# Pool size per API replica. Total max DB connections = AE_API_REPLICAS × (AE_DB_POOL_SIZE + AE_DB_POOL_MAX_OVERFLOW)
-# With defaults (10+20) and 3 replicas = 90 max connections. MARIADB_MAX_CONNECTIONS must be higher.
+
+# Connections held open per API replica at idle (the "warm" pool).
+# Each replica maintains this many persistent connections to MariaDB.
 AE_DB_POOL_SIZE=10
+
+# Additional connections a replica can open beyond AE_DB_POOL_SIZE under burst load.
+# These are created on demand and closed when the burst subsides.
+# Max connections per replica = AE_DB_POOL_SIZE + AE_DB_POOL_MAX_OVERFLOW.
+# Total max DB connections across all replicas = AE_API_REPLICAS × (AE_DB_POOL_SIZE + AE_DB_POOL_MAX_OVERFLOW).
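+# Example: 3 replicas × (10 + 20) = 90 max connections. MARIADB_MAX_CONNECTIONS must exceed this. NOTE: if each Gunicorn worker process creates its own pool (typical for SQLAlchemy; confirm in the app's engine setup), multiply by AE_API_GUNICORN_WORKERS.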
 AE_DB_POOL_MAX_OVERFLOW=20
 
 # ------------------------------------------------------------------------------
@@ -96,23 +110,44 @@ AE_REDIS_PORT=6379
 # ------------------------------------------------------------------------------
 # API SETTINGS (FastAPI)
 # ------------------------------------------------------------------------------
-# Number of API container instances to run (Docker Compose scaling)
+
+# Number of API container instances (Docker Compose replica scaling).
+# Each replica is an independent container with its own Gunicorn process and
+# connection pool. Total max DB connections = AE_API_REPLICAS × (AE_DB_POOL_SIZE + AE_DB_POOL_MAX_OVERFLOW).
+# Increase for horizontal scaling across CPU cores. On a single-node Linode,
+# 2-4 replicas is typical; more replicas won't help if the DB is the bottleneck.
 AE_API_REPLICAS=3
 
-# Gunicorn / Uvicorn Tuning
-# AE_API_GUNICORN_TIMEOUT: worker timeout in seconds. Default in gunicorn_conf.py
-# is 120s. Raise for endpoints that run long ffmpeg operations (clip_video, etc.)
-# The dev .env typically sets this to 900 to accommodate 5-15 min video jobs.
+# --- Gunicorn / Uvicorn Tuning ---
+
+# Internal port Gunicorn listens on inside the container. Nginx proxies to this.
+# Each replica uses this same port within its own network namespace.
 AE_API_GUNICORN_PORT=5065
+
+# Worker timeout in seconds. A request that takes longer than this causes Gunicorn
+# to kill and restart the worker. Default in gunicorn_conf.py is 120s.
+# Raise for endpoints that run long ffmpeg operations (clip_video, convert_file, etc.).
+# Dev typically uses 900s to accommodate 5-15 min video jobs.
 AE_API_GUNICORN_TIMEOUT=900
-AE_API_GUNICORN_WORKERS=2
+
+# Uvicorn worker processes per replica. Each worker handles requests independently
+# using async I/O, but SQLAlchemy DB calls are synchronous and block the worker.
+# More workers = more parallel DB queries. Recommended: 2-4 per replica.
+# Total parallel DB query capacity ≈ AE_API_REPLICAS × AE_API_GUNICORN_WORKERS.
+# Stress testing at 4 workers/replica yielded ~2x throughput vs 2 workers (14 req/s vs 7.5 req/s).
+# Rule of thumb: (2 × CPU cores) + 1 workers in total per host (shared across all replicas on that host), but DB throughput caps before CPU becomes the limit.
+AE_API_GUNICORN_WORKERS=4
+
+# Threads per Gunicorn worker. Gunicorn documents this setting as affecting only the gthread
+# worker type, so it likely has no effect with Uvicorn workers. Leave at 1 unless explicitly benchmarked otherwise.
 AE_API_GUNICORN_THREADS=1
 
 # Security & CORS
-# JWT_KEY should be a 22+ character secret string
+# AE_API_JWT_KEY should be a 22+ character secret string. Rotate if compromised.
 AE_API_JWT_KEY=XXXX
 
-# Regex for allowed CORS origins
+# Regex for allowed CORS origins. Requests from non-matching origins are blocked.
+# Extend the pattern if adding new domains or local dev ports.
 AE_API_ORIGINS_REGEX="(https://.*\.oneskyit\.com)|(http://.*\.oneskyit\.com)|(http://.*.localhost)|(http://.*.localhost:5173)"
 
 # ------------------------------------------------------------------------------