Add output cleanup policy

This commit is contained in:
Alfredo Di Stasio
2026-04-27 14:17:44 +02:00
parent 93cebeb002
commit b8069d6771
9 changed files with 227 additions and 4 deletions

View File

@@ -1,6 +1,7 @@
import json
import uuid
from dataclasses import asdict, dataclass
from datetime import datetime, timedelta, timezone
from pathlib import Path
from app.services.exporter import ExportResult, write_export
@@ -14,6 +15,12 @@ class ResultMetadata:
mimetype: str
def _result_paths(output_dir: Path, result_id: str) -> tuple[Path, Path]:
"""Build the sidecar metadata and output file search pattern for a result id."""
metadata_path = output_dir / f"{result_id}.json"
return metadata_path, output_dir / f"{result_id}"
def persist_result(
output_dir: Path,
records: list[dict[str, str]],
@@ -50,8 +57,53 @@ def persist_result(
def load_result_metadata(output_dir: Path, result_id: str) -> dict[str, str] | None:
    """Load sidecar metadata for a generated file.

    Args:
        output_dir: Directory holding generated files and their sidecars.
        result_id: Identifier of the generated result set.

    Returns:
        The parsed JSON sidecar payload, or ``None`` when no sidecar exists.
    """
    # The direct `output_dir / f"{result_id}.json"` assignment was dead code —
    # immediately overwritten by the helper call below — so it is dropped.
    metadata_path, _base_path = _result_paths(output_dir, result_id)
    if not metadata_path.exists():
        return None
    return json.loads(metadata_path.read_text(encoding="utf-8"))
def delete_result_files(output_dir: Path, result_id: str) -> None:
    """Delete a generated output file and its metadata sidecar if they still exist.

    The output file's extension is not known, so every ``<result_id>.*`` entry in
    *output_dir* is removed; the ``.json`` sidecar is deleted last so metadata
    never outlives a partially-removed result set.
    """
    sidecar = output_dir / f"{result_id}.json"
    for candidate in output_dir.glob(f"{result_id}.*"):
        # Skip the sidecar here; it is removed explicitly below.
        if candidate.name != sidecar.name:
            candidate.unlink(missing_ok=True)
    sidecar.unlink(missing_ok=True)
def cleanup_expired_outputs(output_dir: Path, retention_hours: int) -> int:
    """Delete generated output sets older than the configured retention window.

    Scans every ``*.json`` sidecar in *output_dir*, and removes the result set
    (output file + sidecar) when its newest modification time is older than
    ``retention_hours`` hours before now (UTC).

    Args:
        output_dir: Directory holding generated files and their sidecars.
        retention_hours: Age threshold in hours.

    Returns:
        The number of result sets deleted.
    """
    cutoff = datetime.now(timezone.utc) - timedelta(hours=retention_hours)
    deleted_results = 0
    for metadata_path in output_dir.glob("*.json"):
        try:
            payload = json.loads(metadata_path.read_text(encoding="utf-8"))
        except (OSError, json.JSONDecodeError):
            payload = {}
        # A sidecar may hold valid JSON that is not an object (e.g. a list or
        # string); `.get()` on it would raise AttributeError and abort the whole
        # sweep. Treat it like unreadable metadata instead.
        if not isinstance(payload, dict):
            payload = {}
        # Fall back to the filename stem when the sidecar lacks a result_id.
        result_id = payload.get("result_id") or metadata_path.stem
        file_path = Path(payload["file_path"]) if "file_path" in payload else None
        newest_mtime = _newest_mtime(metadata_path, file_path)
        # Keep the set when nothing exists to time-stamp it or it is still fresh.
        if newest_mtime is None or newest_mtime >= cutoff:
            continue
        delete_result_files(output_dir=output_dir, result_id=result_id)
        deleted_results += 1
    return deleted_results
def _newest_mtime(metadata_path: Path, file_path: Path | None) -> datetime | None:
"""Return the newest modification time across the metadata and output file."""
mtimes: list[datetime] = []
if metadata_path.exists():
mtimes.append(datetime.fromtimestamp(metadata_path.stat().st_mtime, tz=timezone.utc))
if file_path is not None and file_path.exists():
mtimes.append(datetime.fromtimestamp(file_path.stat().st_mtime, tz=timezone.utc))
if not mtimes:
return None
return max(mtimes)