feat(event_file): add atomic DELETE and hosted_file orphan scan

DELETE /v3/action/event_file/{id} — removes hosted_file_link, optionally cleans up physical file + hosted_file record if orphaned (rm_orphan=True default), then deletes the event_file row. Closes the gap left by the V3 CRUD migration which silently dropped hosted_file cleanup. GET /v3/action/hosted_file/orphan_scan — returns hosted_file rows with no hosted_file_link entries (DB orphans, paginated), plus optional disk scan for physical files with no DB record. Needed for admin cleanup of the backlog accumulated during the broken-delete period. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-18 18:09:20 -04:00
parent 2b1044eebb
commit 88f7609b63
2 changed files with 110 additions and 2 deletions
--- a/app/routers/api_v3_actions_hosted_file.py
+++ b/app/routers/api_v3_actions_hosted_file.py
@@ -370,6 +370,71 @@ async def check_hosted_file_obj_w_hash_action(
    return mk_resp(data=False, status_code=404, response=response, status_message="No record found for this hash.")


+@router.get('/orphan_scan', response_model=Resp_Body_Base)
+async def orphan_scan_action(
+        include_disk_orphans: bool = Query(False),
+        limit: int = Query(500, ge=1, le=5000),
+        offset: int = Query(0, ge=0),
+        account: AccountContext = Depends(get_account_context),
+        ):
+    """
+    Admin: find hosted_file records with no hosted_file_link entries (DB orphans),
+    and optionally physical files on disk with no hosted_file DB record (disk orphans).
+    Use limit/offset to page through large backlogs.
+    """
+    db_orphan_sql = """
+        SELECT hf.id, hf.id_random, hf.filename, hf.hash_sha256,
+               hf.subdirectory_path, hf.size, hf.content_type, hf.created_on
+        FROM hosted_file hf
+        LEFT JOIN hosted_file_link hfl ON hfl.hosted_file_id = hf.id
+        WHERE hfl.id IS NULL
+          AND (hf.hide IS NULL OR hf.hide != 1)
+        ORDER BY hf.created_on ASC
+        LIMIT :limit OFFSET :offset
+    """
+    raw = sql_select(sql=db_orphan_sql, data={'limit': limit, 'offset': offset}, as_list=True) or []
+
+    db_orphans = []
+    for row in raw:
+        db_orphans.append({
+            'hosted_file_id': row.get('id_random') or get_id_random(row['id'], 'hosted_file'),
+            'filename': row.get('filename'),
+            'hash_sha256': row.get('hash_sha256'),
+            'subdirectory_path': row.get('subdirectory_path'),
+            'size': row.get('size'),
+            'content_type': row.get('content_type'),
+            'created_on': str(row.get('created_on', '')),
+        })
+
+    result = {
+        'db_orphans': db_orphans,
+        'db_orphan_count': len(db_orphans),
+        'disk_orphans': [],
+        'disk_orphan_count': 0,
+    }
+
+    if include_disk_orphans:
+        hosted_files_root = settings.FILES_PATH['hosted_files_root']
+        all_db_hashes_raw = sql_select(sql="SELECT hash_sha256 FROM hosted_file", as_list=True) or []
+        all_db_hashes = {r['hash_sha256'] for r in all_db_hashes_raw}
+
+        disk_orphans = []
+        for dirpath, _, filenames in os.walk(hosted_files_root):
+            for fname in filenames:
+                if fname.endswith('.file'):
+                    sha256 = fname[:-5]
+                    if sha256 not in all_db_hashes:
+                        disk_orphans.append({
+                            'hash_sha256': sha256,
+                            'path': os.path.join(dirpath, fname),
+                        })
+
+        result['disk_orphans'] = disk_orphans
+        result['disk_orphan_count'] = len(disk_orphans)
+
+    return mk_resp(data=result)
+
+
@router.get('/{hosted_file_id}/links', response_model=Resp_Body_Base)
 async def get_file_links_action(
        hosted_file_id: str = Path(min_length=11, max_length=22),