Files
OSIT-AE-API-FastAPI/app/routers/api_v3_actions_hosted_file.py
Scott Idem b9742cfcd8 feat(routers): migrate hosted_file hash lookup to V3 actions
Ported the legacy '/hosted_file/hash/{hash}' endpoint to the V3 actions router.
The new endpoint '/v3/action/hosted_file/hash/{hosted_file_hash}' supports:
- ID Vision: returns random string IDs instead of internal integers
- Local Check: verifies physical file existence on disk (check_for_local=True)
- Deduplication: enables frontend to check for existing files before upload

Also added PROJECT document for AE Hosted Files migration tracking.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-25 13:05:09 -04:00

574 lines
24 KiB
Python

from fastapi import APIRouter, Depends, File, Form, Header, HTTPException, Path, Query, Response, status, UploadFile
from fastapi.responses import FileResponse, StreamingResponse
import aiofiles
import mimetypes
import os
import pathlib
from typing import Dict, List, Optional, Set, Union
import asyncio
import logging
from urllib.parse import quote
log = logging.getLogger(__name__)
from app.config import settings
from app.db_sql import redis_lookup_id_random, sql_select, sql_update, sql_delete, get_id_random
from app.methods.hosted_file_methods import (
create_hosted_file_obj, load_hosted_file_obj, save_file,
create_hosted_file_link, delete_hosted_file_link, get_hosted_file_link_rec_list,
lookup_file_hash, check_for_hosted_file_hash_file
)
from app.methods.lib_media import convert_file_method
from app.methods.lib_media import clip_video_method
from app.lib_general_v3 import (
AccountContext, get_account_context, get_account_context_optional,
SerializationParams, DelayParams
)
from app.models.hosted_file_models import Hosted_File_Base
from app.models.response_models import Resp_Body_Base, mk_resp
"""
Aether API V3 - Hosted File Action Router
------------------------------------------
Handles specialized binary operations like uploads, downloads, and complex deletions.
These routes complement the standard CRUD metadata routes.
"""
# Router instance that the @router.<method>(...) decorators in this module register their endpoints on.
router = APIRouter()
# --- Helpers ---
def validate_file_extension(filename: Optional[str], allowed_extensions: Optional[List[str]]):
    """
    Backup check for file extensions.

    Args:
        filename: Client-supplied file name. May be None or have no extension.
        allowed_extensions: Accepted extensions, with or without a leading
            dot, compared case-insensitively. A falsy value disables the check.

    Returns:
        True when the check is disabled or the extension is allowed.

    Raises:
        HTTPException: 400 when the name has no extension or its extension is
            not listed in ``allowed_extensions``.
    """
    if not allowed_extensions:
        return True

    # rpartition() reports an empty separator when the name contains no dot,
    # so a bare name such as "png" is not mistaken for a ".png" file, and a
    # missing filename is treated as "no extension" instead of crashing.
    _, dot, suffix = (filename or '').rpartition('.')
    ext = suffix.lower() if dot else ''

    allowed = {e.lower().strip('.') for e in allowed_extensions}
    if not ext or ext not in allowed:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"File extension '.{ext}' is not allowed. Allowed: {', '.join(allowed_extensions)}"
        )
    return True
async def file_streamer(path: str, start: int, end: int):
    """
    Asynchronously yield the bytes of ``path`` from offset ``start`` up to,
    but not including, offset ``end``.

    Data is read in blocks of at most 8192 bytes. Iteration stops once the
    file position reaches ``end`` or a read returns no data.
    """
    block_size = 8192  # 8KB
    async with aiofiles.open(path, mode='rb') as handle:
        await handle.seek(start)
        position = await handle.tell()
        while position < end:
            # Never read past `end`, even when a full block is available.
            payload = await handle.read(min(block_size, end - position))
            if not payload:
                break
            yield payload
            position = await handle.tell()
# --- Routes ---
@router.post('/upload', response_model=Resp_Body_Base)
async def upload_files_action(
    file_list: List[UploadFile] = File(...),
    account_id: str = Form(..., min_length=11, max_length=22),
    link_to_type: str = Form(...),
    link_to_id: str = Form(..., min_length=11, max_length=22),
    allowed_extensions: Optional[List[str]] = Query(None),
    account: AccountContext = Depends(get_account_context),
    delay: DelayParams = Depends(),
):
    """
    V3 Enhanced Upload Action.
    - Handles multiple files.
    - Resolves IDs to integers.
    - Deduplicates via Hash lookup.
    - Returns clean Vision IDs.

    Args:
        file_list: Uploaded files (multipart).
        account_id: Random-string account ID that will own the files.
        link_to_type: Table name of the parent object the files are linked to.
        link_to_id: Random-string ID of the parent object.
        allowed_extensions: Optional whitelist checked against every file name
            before anything is written.
        account: Authenticated caller context.
        delay: Optional artificial delay.

    Returns:
        ``mk_resp`` payload whose data is a list with one dict per file that
        was stored and synchronised. Files that fail to save or to sync with
        the database are logged and left out of the list.

    Raises:
        HTTPException: 400 when ``account_id`` or ``link_to_id`` cannot be
            resolved, or when any file name fails the extension check.

    NOTE(review): ``account_id`` comes from the form and is not compared with
    ``account`` here - confirm that ownership is enforced elsewhere.
    """
    if delay.sleep_time_s > 0:
        await asyncio.sleep(delay.sleep_time_s)

    # 1. Resolve Parent IDs
    account_id_random = account_id
    account_id_int = redis_lookup_id_random(record_id_random=account_id, table_name='account')
    if not account_id_int:
        raise HTTPException(status_code=400, detail="Invalid account_id.")

    link_to_id_random = link_to_id
    link_to_id_int = redis_lookup_id_random(record_id_random=link_to_id, table_name=link_to_type)
    if not link_to_id_int:
        raise HTTPException(status_code=400, detail=f"Invalid link_to_id for type {link_to_type}.")

    # 2. Extension Validation
    # Every name is checked before the first save, so a rejected file can no
    # longer abort the request after earlier files were stored and linked.
    for file_obj in file_list:
        validate_file_extension(file_obj.filename, allowed_extensions)

    hosted_file_list = []
    for file_obj in file_list:
        # 3. Physical Save
        file_info = await save_file(
            file = file_obj,
            account_id = account_id_int,
            account_id_random = account_id_random,
            link_to_type = link_to_type,
            link_to_id = link_to_id_int,
            link_to_id_random = link_to_id_random,
            check_allowed_extension = False,  # Handled by validate_file_extension above
        )
        if not file_info.get('saved'):
            log.error(f"Failed to save file: {file_obj.filename}")
            continue

        hosted_file_id_int = None
        hosted_file_dict = {}

        # 4. Database Synchronization (Deduplication)
        log.info(f"Syncing DB record for hash: {file_info['hash_sha256']}")
        if existing_rec := sql_select(
            table_name = 'hosted_file',
            field_name = 'hash_sha256',
            field_value = file_info['hash_sha256'],
        ):
            # Use existing record
            hosted_file_id_int = existing_rec.get('id')

            # Migration check: Update subdirectory if missing or mismatched
            if file_info.get('subdirectory_path') and existing_rec.get('subdirectory_path') != file_info['subdirectory_path']:
                log.info(f"Updating subdirectory_path for existing record {hosted_file_id_int} to {file_info['subdirectory_path']}")
                sql_update(
                    table_name = 'hosted_file',
                    data = {'id': hosted_file_id_int, 'subdirectory_path': file_info['subdirectory_path']}
                )

            # Reload to get the latest DB state (including updated path)
            hosted_file_dict = load_hosted_file_obj(hosted_file_id=hosted_file_id_int, model_as_dict=True)
        else:
            # Create new record
            file_info['account_id'] = account_id_int
            file_info['account_id_random'] = account_id_random
            new_hosted_file_obj = Hosted_File_Base(**file_info)
            if res_new_id := create_hosted_file_obj(hosted_file_obj_new=new_hosted_file_obj):
                hosted_file_id_int = res_new_id
                hosted_file_dict = load_hosted_file_obj(hosted_file_id=hosted_file_id_int, model_as_dict=True)
            else:
                log.error("Database insertion failed for hosted_file.")
                continue

        # A failed reload would otherwise surface as a TypeError (HTTP 500)
        # on the item assignments below; skip the file like other failures.
        if not isinstance(hosted_file_dict, dict):
            log.error(f"Could not load hosted_file record {hosted_file_id_int} for file: {file_obj.filename}")
            continue

        # 5. Relational Linking
        if hosted_file_id_int:
            create_hosted_file_link(
                account_id = account_id_int,
                hosted_file_id = hosted_file_id_int,
                link_to_type = link_to_type,
                link_to_id = link_to_id_int
            )

        # 6. Response Preparation (Vision IDs)
        # Add metadata flags
        hosted_file_dict['already_exists'] = file_info.get('already_exists')
        hosted_file_dict['saved'] = file_info.get('saved')
        hosted_file_dict['copy_timer'] = file_info.get('copy_timer')

        # Ensure ID is a random string for the frontend
        if not isinstance(hosted_file_dict.get('id'), str):
            rid = get_id_random(hosted_file_id_int, table_name='hosted_file')
            hosted_file_dict['id'] = rid
            hosted_file_dict['hosted_file_id'] = rid

        hosted_file_list.append(hosted_file_dict)

    return mk_resp(data=hosted_file_list, status_message=f"Successfully processed {len(hosted_file_list)} files.")
def _parse_range_header(range_header: str, file_size: int) -> tuple:
    """
    Translate a single-range ``Range`` header value into inclusive
    ``(start, end)`` byte offsets for a file of ``file_size`` bytes.

    Accepts "bytes=A-B", "bytes=A-" and the suffix form "bytes=-N" (last N
    bytes). ``end`` is clamped to the last byte of the file.

    Raises:
        ValueError: when the value is malformed, lists several ranges, or
            cannot be satisfied for ``file_size``.
    """
    spec = range_header.replace('bytes=', '').strip()
    first, _, last = spec.partition('-')
    first, last = first.strip(), last.strip()

    if first:
        start = int(first)
        end = int(last) if last else file_size - 1
    else:
        # Suffix form "-N": the final N bytes of the file.
        suffix_length = int(last)
        if suffix_length <= 0:
            raise ValueError("Suffix range must be positive.")
        start = max(file_size - suffix_length, 0)
        end = file_size - 1

    if start >= file_size:
        raise ValueError("Range starts beyond the end of the file.")
    end = min(end, file_size - 1)
    if end < start:
        # Would otherwise produce a negative Content-Length.
        raise ValueError("Range end precedes range start.")
    return start, end


@router.get('/{hosted_file_id}/download')
async def download_file_action(
    response: Response,
    hosted_file_id: str = Path(min_length=11, max_length=22),
    filename: Optional[str] = Query(None, min_length=4, max_length=255),
    key: Optional[str] = Query(None),  # Simplified unauthenticated access (Account ID)
    site_key: Optional[str] = Query(None),  # Bypass API Key/JWT if valid site key provided
    range: Optional[str] = Header(None),
    account: AccountContext = Depends(get_account_context_optional),
    delay: DelayParams = Depends(),
):
    """
    Enhanced download/streaming logic.
    Supports byte-range seeking, delay simulation, and site_key/key bypass.

    Args:
        hosted_file_id: Random-string ID of a hosted_file, or of an
            event_file / archive_content record that points at one.
        filename: Optional download name overriding the stored file name.
        key: Random-string account ID accepted as simplified authorization.
        site_key: Site access key accepted as authorization.
        range: Value of the HTTP ``Range`` header, if any.
        account: Caller context; ``auth_method == 'guest'`` means unauthenticated.
        delay: Optional artificial delay.

    Returns:
        A 206 ``StreamingResponse`` for range requests, otherwise a
        ``FileResponse`` with the whole file.

    Raises:
        HTTPException: 403 when no authorization method succeeds, 404 when the
            record or the physical file is missing, 416 for an invalid or
            unsatisfiable range.
    """
    if delay.sleep_time_s > 0:
        await asyncio.sleep(delay.sleep_time_s)

    # 1. Auth Bypass Logic (site_key and simplified key)
    # Each method is tried in priority order until one succeeds, so an invalid
    # `key` no longer prevents a valid `site_key` from being honoured.
    # Priority A: Standard Auth (JWT or API Key)
    is_authorized = account.auth_method != 'guest'

    # Priority B: Simplified Access Pattern (?key=ANY_VALID_ACCOUNT_ID)
    if not is_authorized and key:
        # For now, to unblock the frontend, any valid account_id_random is sufficient.
        # Ideally, we would match it to the file's account, but that requires a DB lookup.
        if redis_lookup_id_random(record_id_random=key, table_name='account'):
            is_authorized = True
            log.info(f"Auth Bypass: Download authorized via simplified account key.")

    # Priority C: Site Key (?site_key=SITE_ACCESS_KEY)
    if not is_authorized and site_key:
        # FIX: site table uses 'access_key', not 'auth_key'
        sql = "SELECT id FROM site WHERE access_key = :key AND enable = true LIMIT 1"
        if sql_select(sql=sql, data={'key': site_key}):
            is_authorized = True
            log.info(f"Auth Bypass: Download authorized via site_key.")

    if not is_authorized:
        raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail="Authentication required or invalid access key.")

    # 2. Resolve File Record
    # ID Vision: Attempt to resolve the ID.
    # REMINDER: If adding a new specialized 'container' object (like event_person_profile),
    # ensure the lookup logic is mirrored here to allow direct downloads via container ID.
    # If not found in hosted_file, check if it's an event_file or archive_content ID that we can resolve.
    resolved_id = redis_lookup_id_random(record_id_random=hosted_file_id, table_name='hosted_file')
    if not resolved_id:
        log.info(f"ID {hosted_file_id} not found in hosted_file. Checking container tables...")

        # A. Check event_file
        if ef_id := redis_lookup_id_random(record_id_random=hosted_file_id, table_name='event_file'):
            if ef_rec := sql_select(sql="SELECT hosted_file_id FROM event_file WHERE id = :id", data={'id': ef_id}):
                resolved_id = ef_rec.get('hosted_file_id')
                log.info(f"Resolved event_file {hosted_file_id} to hosted_file {resolved_id}")

        # B. Check archive_content
        if not resolved_id:
            if ac_id := redis_lookup_id_random(record_id_random=hosted_file_id, table_name='archive_content'):
                if ac_rec := sql_select(sql="SELECT hosted_file_id FROM archive_content WHERE id = :id", data={'id': ac_id}):
                    resolved_id = ac_rec.get('hosted_file_id')
                    log.info(f"Resolved archive_content {hosted_file_id} to hosted_file {resolved_id}")

    if not resolved_id:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Hosted file record not found.")

    hosted_file_obj = load_hosted_file_obj(hosted_file_id=resolved_id)
    if not hosted_file_obj:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Hosted file data could not be loaded.")

    # 3. Path Resolution
    hosted_files_path = settings.FILES_PATH['hosted_files_root']
    hash_filename = f"{hosted_file_obj.hash_sha256}.file"
    # os.path.join() ignores an empty component, so records without a
    # subdirectory resolve directly under the storage root.
    full_file_path = os.path.join(hosted_files_path, hosted_file_obj.subdirectory_path or '', hash_filename)

    if not os.path.exists(full_file_path):
        log.error(f"File not found on disk: {full_file_path}")
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Physical file not found on server.")

    # 4. Streaming / Download logic
    target_filename = filename or hosted_file_obj.filename
    media_type = mimetypes.guess_type(target_filename)[0] or 'application/octet-stream'

    if range:
        file_size = os.stat(full_file_path).st_size
        try:
            start, end = _parse_range_header(range, file_size)
        except ValueError:
            # A 416 response reports the current length of the resource.
            raise HTTPException(
                status_code=status.HTTP_416_REQUESTED_RANGE_NOT_SATISFIABLE,
                headers={'Content-Range': f'bytes */{file_size}'},
            )
        content_length = end - start + 1

        # ID Vision: Properly encode filename for headers to avoid UnicodeEncodeError (latin-1)
        # 1. Standard filename (Sanitized for legacy clients - latin-1 safe).
        #    Quotes, backslashes and control characters are dropped as well so
        #    the quoted-string parameter cannot be terminated early.
        ascii_filename = target_filename.encode('ascii', errors='ignore').decode('ascii')
        safe_filename = ''.join(ch for ch in ascii_filename if ch.isprintable() and ch not in '"\\')
        # 2. filename* (UTF-8 encoded for modern clients)
        encoded_filename = quote(target_filename)

        return StreamingResponse(
            file_streamer(full_file_path, start, end + 1),
            media_type = media_type,
            status_code = status.HTTP_206_PARTIAL_CONTENT,
            headers = {
                'Accept-Ranges': 'bytes',
                'Content-Range': f'bytes {start}-{end}/{file_size}',
                'Content-Length': str(content_length),
                'Content-Disposition': f'attachment; filename="{safe_filename}"; filename*=utf-8\'\'{encoded_filename}'
            }
        )

    return FileResponse(full_file_path, filename=target_filename, media_type=media_type)
@router.get('/hash/{sha256}/download')
async def download_file_by_hash_action(
    response: Response,
    sha256: str = Path(min_length=64, max_length=64, regex='^[a-f0-9]{64}$'),
    filename: Optional[str] = Query(None, min_length=4, max_length=255),
    account: AccountContext = Depends(get_account_context_optional),
    delay: DelayParams = Depends(),
):
    """
    Serve a stored file addressed directly by its SHA-256 hash.

    The on-disk location is derived from the hash alone (no database lookup):
    first ``<root>/<first two hex chars>/<hash>.file``, then the legacy
    ``<root>/<hash>.file``. Callers whose ``auth_method`` is ``'guest'`` are
    rejected.

    Raises:
        HTTPException: 403 for guest callers, 404 when neither candidate path
            exists.
    """
    if delay.sleep_time_s > 0:
        await asyncio.sleep(delay.sleep_time_s)

    # 1. Mandatory Auth Check - anything other than a guest context is accepted.
    if account.auth_method == 'guest':
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Valid API Key required for hash-based downloads."
        )

    # 2. Path Resolution (Deterministic) - sharded location first, legacy root second.
    storage_root = settings.FILES_PATH['hosted_files_root']
    stored_name = f"{sha256}.file"
    candidate_paths = (
        os.path.join(storage_root, sha256[0:2], stored_name),
        os.path.join(storage_root, stored_name),
    )
    full_file_path = next((path for path in candidate_paths if os.path.exists(path)), None)
    if full_file_path is None:
        log.error(f"Hash-based file not found: {sha256}")
        raise HTTPException(status_code=404, detail="File not found on server.")

    # 3. Serve File - fall back to a generic name built from the hash prefix.
    target_filename = filename if filename else f"file_{sha256[:8]}.bin"
    guessed_type, _ = mimetypes.guess_type(target_filename)
    media_type = guessed_type or 'application/octet-stream'
    return FileResponse(full_file_path, filename=target_filename, media_type=media_type)
@router.get('/hash/{hosted_file_hash}', response_model=Resp_Body_Base)
async def check_hosted_file_obj_w_hash_action(
    response: Response,
    hosted_file_hash: str = Path(min_length=64, max_length=64),
    check_for_local: Optional[bool] = Query(True),
    account: AccountContext = Depends(get_account_context_optional),
    delay: DelayParams = Depends(),
):
    """
    Deduplication check: find the hosted_file record that matches a hash.

    When ``check_for_local`` is truthy and the stored file is found by
    ``check_for_hosted_file_hash_file``, the returned model is annotated with
    ``hosted_file_found_check`` and ``hosted_file_size_check``.

    Returns:
        ``mk_resp`` with the record model, or ``data=False`` and status 404
        when no record matches or the record cannot be loaded.
    """
    if delay.sleep_time_s > 0:
        await asyncio.sleep(delay.sleep_time_s)

    record_id = lookup_file_hash(file_hash=hosted_file_hash)
    if not record_id:
        return mk_resp(data=False, status_code=404, response=response, status_message="No record found for this hash.")

    record = load_hosted_file_obj(hosted_file_id=record_id, model_as_dict=False)
    if not record:
        return mk_resp(data=False, status_code=404, response=response, status_message="Record found but data could not be loaded.")

    if check_for_local:
        # Read the attribute from the model itself; it may be excluded from dict output.
        sub_dir = getattr(record, 'subdirectory_path', '') or ''
        disk_info = check_for_hosted_file_hash_file(file_hash=hosted_file_hash, sub_dir=sub_dir)
        if disk_info:
            record.hosted_file_found_check = True
            record.hosted_file_size_check = disk_info['file_size']

    # mk_resp will handle model->dict conversion with proper ID Vision mapping
    return mk_resp(data=record)
@router.delete('/{hosted_file_id}', response_model=Resp_Body_Base)
async def delete_file_action(
    hosted_file_id: str = Path(min_length=11, max_length=22),
    link_to_type: Optional[str] = Query(None),
    link_to_id: Optional[str] = Query(None),
    method: str = Query('hide', regex='^(hide|disable|delete)$'),
    rm_orphan: bool = Query(False),
    fake_delete: bool = Query(False),  # Testing mode
    account: AccountContext = Depends(get_account_context),
    delay: DelayParams = Depends(),
):
    """
    Intelligent relational deletion.
    - Removes specified link.
    - Counts remaining links.
    - Optionally cleans up orphans.
    - Supports fake_delete for testing.

    Args:
        hosted_file_id: Random-string ID of the hosted_file record.
        link_to_type: Table name of the linked parent object. A link is only
            removed when both ``link_to_type`` and ``link_to_id`` are given.
        link_to_id: Random-string ID of the linked parent object.
        method: What to do with an orphaned record when ``rm_orphan`` is set:
            'hide', 'disable', or 'delete' (record plus file on disk).
        rm_orphan: Apply ``method`` when no links remain.
        fake_delete: Report what exists without modifying anything.
        account: Authenticated caller context.
        delay: Optional artificial delay.

    Returns:
        ``mk_resp`` payload describing what was found or removed.

    Raises:
        HTTPException: 404 when the file record, its metadata, or the linked
            object cannot be resolved.

    NOTE(review): the file's owning account is not compared with ``account``
    here - confirm that ownership is enforced elsewhere.
    """
    if delay.sleep_time_s > 0:
        await asyncio.sleep(delay.sleep_time_s)

    # 1. Resolve IDs
    file_id_int = redis_lookup_id_random(record_id_random=hosted_file_id, table_name='hosted_file')
    if not file_id_int:
        raise HTTPException(status_code=404, detail="Hosted file record not found.")

    link_id_int = None
    if link_to_type and link_to_id:
        link_id_int = redis_lookup_id_random(record_id_random=link_to_id, table_name=link_to_type)
        if not link_id_int:
            raise HTTPException(status_code=404, detail=f"Linked object {link_to_id} not found.")

    # 2. Verify State (Existence Checks)
    hosted_file_obj = load_hosted_file_obj(hosted_file_id=file_id_int)
    if not hosted_file_obj:
        raise HTTPException(status_code=404, detail="File metadata not found.")

    # Path check
    hosted_files_path = settings.FILES_PATH['hosted_files_root']
    file_path = os.path.join(hosted_files_path, hosted_file_obj.subdirectory_path or '', f"{hosted_file_obj.hash_sha256}.file")
    file_exists_on_disk = os.path.exists(file_path)

    # Link check
    # `or []` keeps iteration safe if the helper reports "no links" with a
    # falsy value instead of an empty list.
    links = get_hosted_file_link_rec_list(hosted_file_id=file_id_int) or []
    link_found = any(
        link_rec.get('link_to_type') == link_to_type and link_rec.get('link_to_id') == link_id_int
        for link_rec in links
    )

    if fake_delete:
        log.info(f"Fake Delete active. Verifying existence...")
        return mk_resp(data={
            "hosted_file_exists": True,
            "file_on_disk": file_exists_on_disk,
            "link_exists": link_found,
            "fake_delete": True
        }, status_message="Fake delete successful. No data was modified.")

    # 3. Execution Phase
    # A. Remove the Link
    if link_id_int:
        delete_hosted_file_link(
            account_id = account.account_id,
            hosted_file_id = file_id_int,
            link_to_type = link_to_type,
            link_to_id = link_id_int
        )
        log.info(f"Deleted link between file {file_id_int} and {link_to_type}:{link_id_int}")

    # B. Orphan Check & Physical Cleanup
    # Same falsy-safe normalisation: the orphan case is exactly the one in
    # which the helper has no rows to return.
    remaining_links = get_hosted_file_link_rec_list(hosted_file_id=file_id_int) or []
    is_orphan = not remaining_links
    physical_removed = False
    record_removed = False

    if rm_orphan and is_orphan:
        log.info(f"File {file_id_int} is an orphan. Cleaning up...")

        # Method Handling
        if method == 'delete':
            # Hard delete: Record + Disk
            if file_exists_on_disk:
                try:
                    pathlib.Path(file_path).unlink()
                    physical_removed = True
                except OSError as e:
                    log.error(f"Error unlinking file {file_path}: {e}")
                    physical_removed = False

            # The record is removed even when the unlink above failed.
            sql_delete(table_name='hosted_file', record_id=file_id_int)
            record_removed = True
        elif method == 'hide':
            sql_update(table_name='hosted_file', data={'id': file_id_int, 'hide': True})
        elif method == 'disable':
            sql_update(table_name='hosted_file', data={'id': file_id_int, 'enable': False})

    return mk_resp(data={
        "link_removed": link_found if link_id_int else False,
        "is_orphan": is_orphan,
        "physical_removed": physical_removed,
        "record_removed": record_removed,
        "method": method
    }, status_message="Deletion process complete.")
# ### BEGIN ### API V3 Hosted File Action ### convert_file() ###
@router.get('/{hosted_file_id}/convert_file', response_model=Resp_Body_Base)
async def convert_file(
    hosted_file_id: str = Path(min_length=11, max_length=22),
    link_to_type: str = Query(...),
    link_to_id: str = Query(...),
    filename_no_ext: str = Query('automated_hosted_file_conversion'),
    to_type: str = Query('webp'),
    account: AccountContext = Depends(get_account_context),
):
    """
    Convert a hosted file to another format (default target: webp).

    The parent object given by ``link_to_type`` / ``link_to_id`` is resolved to
    its internal ID, then the work is delegated to ``convert_file_method``
    together with the caller's account identifiers.

    Returns:
        ``mk_resp`` with the conversion result, or ``data=None`` with status
        code 400 when the conversion yields nothing.

    Raises:
        HTTPException: 404 when the linked parent object cannot be resolved.
    """
    parent_id_int = redis_lookup_id_random(record_id_random=link_to_id, table_name=link_to_type)
    if not parent_id_int:
        raise HTTPException(status_code=404, detail=f"Linked object not found: {link_to_type}:{link_to_id}")

    conversion = await convert_file_method(
        hosted_file_id=hosted_file_id,
        link_to_type=link_to_type,
        link_to_id=parent_id_int,
        account_id=account.account_id,
        account_id_random=account.account_id_random,
        filename_no_ext=filename_no_ext,
        to_type=to_type
    )
    if not conversion:
        return mk_resp(data=None, status_code=400, status_message="Conversion failed.")
    return mk_resp(data=conversion)
# ### END ### API V3 Hosted File Action ### convert_file() ###
# Strong references to in-flight background clip tasks. The event loop keeps
# only weak references to tasks, so a task nobody references can be
# garbage-collected before it finishes.
_background_clip_tasks = set()


@router.get('/{hosted_file_id}/clip_video', response_model=Resp_Body_Base)
async def clip_video(
    hosted_file_id: str = Path(min_length=11, max_length=22),
    link_to_type: str = Query(...),
    link_to_id: str = Query(...),
    start_time: str = Query(..., min_length=8, max_length=8),
    end_time: str = Query(..., min_length=8, max_length=8),
    filename_no_ext: str = Query('automated_hosted_file_clip_video'),
    reencode: bool = Query(False),
    scale_down: bool = Query(False),
    background: bool = Query(False),
    account: AccountContext = Depends(get_account_context),
):
    """
    Clip a segment from a hosted video and save as a new hosted_file record.
    Supports optional background scheduling returning `202 Accepted` when `background=true`.

    Args:
        hosted_file_id: Random-string ID of the source video.
        link_to_type: Table name of the parent object for the new clip.
        link_to_id: Random-string ID of the parent object.
        start_time: Clip start, exactly 8 characters (presumably HH:MM:SS - confirm).
        end_time: Clip end, exactly 8 characters (presumably HH:MM:SS - confirm).
        filename_no_ext: Base name for the generated file.
        reencode: Passed through to ``clip_video_method``.
        scale_down: Passed through to ``clip_video_method``.
        background: Schedule the clip and return immediately instead of waiting.
        account: Authenticated caller context.

    Returns:
        ``mk_resp`` with the clip result, ``{'task': 'scheduled'}`` with status
        code 202 when run in the background, or ``data=None`` with status code
        400 when the clip fails.

    Raises:
        HTTPException: 404 when the linked parent object cannot be resolved.
    """
    lid_int = redis_lookup_id_random(record_id_random=link_to_id, table_name=link_to_type)
    if not lid_int:
        raise HTTPException(status_code=404, detail=f"Linked object not found: {link_to_type}:{link_to_id}")

    async def _run_clip():
        # Errors are logged and reported as None so that a background task
        # never raises an unretrieved exception.
        try:
            return await clip_video_method(
                hosted_file_id=hosted_file_id,
                start_time=start_time,
                end_time=end_time,
                account_id=account.account_id,
                account_id_random=account.account_id_random,
                link_to_type=link_to_type,
                link_to_id=lid_int,
                filename_no_ext=filename_no_ext,
                reencode=reencode,
                scale_down=scale_down,
            )
        except Exception:
            log.exception('Background clip task failed')
            return None

    if background:
        # Schedule and return 202 Accepted. Hold a reference until the task
        # completes, then let the done-callback release it.
        task = asyncio.create_task(_run_clip())
        _background_clip_tasks.add(task)
        task.add_done_callback(_background_clip_tasks.discard)
        return mk_resp(data={'task': 'scheduled'}, status_code=202, status_message='Clip scheduled (background)')

    result = await _run_clip()
    if result:
        return mk_resp(data=result)
    return mk_resp(data=None, status_code=400, status_message="Clip failed.")
# ### END ### API V3 Hosted File Action ### clip_video() ###