Merge branch 'feature/output-cleanup-policy' into develop

Alfredo Di Stasio
2026-04-27 14:18:03 +02:00
9 changed files with 227 additions and 4 deletions

View File

@@ -91,6 +91,9 @@ Compose settings are stored in `env`. Update that file to change values such as:
- `SECRET_KEY`
- `MAX_UPLOAD_SIZE_MB`
- `OUTPUT_DIRECTORY`
- `OUTPUT_RETENTION_HOURS`
- `CLEANUP_ON_STARTUP`
- `CLEANUP_AFTER_DOWNLOAD`
### Run the test suite in a container
@@ -126,6 +129,7 @@ curl -X POST http://127.0.0.1:5000/convert \
## Notes
- Temporary output files are written to `instance/outputs`
- Generated files are cleaned up according to the configured output retention policy
- The application does not require a database
- Gunicorn is used as the production WSGI server
- Parsing and export writing are streamed to reduce memory usage on large uploads
@@ -133,3 +137,6 @@ curl -X POST http://127.0.0.1:5000/convert \
- Default upload limit is 100 MiB
- Set `MAX_UPLOAD_SIZE_MB` to configure the upload limit in megabytes
- `MAX_CONTENT_LENGTH` is also supported as a lower-level byte-based override
- `OUTPUT_RETENTION_HOURS` controls how long generated output files are kept
- `CLEANUP_ON_STARTUP=true` removes expired generated files when the app starts
- `CLEANUP_AFTER_DOWNLOAD=true` deletes a result only after the response finishes sending
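
These settings flow straight into Flask config, but note that `Config` reads the environment at import time, so they must be set before the app package is first imported. A minimal sketch in a fresh interpreter (not part of the commit):

import os

# Set the environment *before* the first import of the app package:
# Config evaluates os.environ in its class body.
os.environ["OUTPUT_RETENTION_HOURS"] = "48"
os.environ["CLEANUP_ON_STARTUP"] = "false"

from app import create_app

app = create_app()
assert app.config["OUTPUT_RETENTION_HOURS"] == 48
assert app.config["CLEANUP_ON_STARTUP"] is False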

View File

@@ -5,6 +5,7 @@ from werkzeug.exceptions import RequestEntityTooLarge
from app.config import Config
from app.routes import main_blueprint
from app.services.storage import cleanup_expired_outputs
def _format_size_limit(size_limit_bytes: int) -> str:
@@ -27,6 +28,12 @@ def create_app(config_class: type[Config] = Config) -> Flask:
app.config["OUTPUT_DIRECTORY"] = output_dir app.config["OUTPUT_DIRECTORY"] = output_dir
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)
if app.config.get("CLEANUP_ON_STARTUP", False):
cleanup_expired_outputs(
output_dir=output_dir,
retention_hours=app.config.get("OUTPUT_RETENTION_HOURS", 24),
)
app.register_blueprint(main_blueprint) app.register_blueprint(main_blueprint)
@app.errorhandler(RequestEntityTooLarge) @app.errorhandler(RequestEntityTooLarge)
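
Since the sweep is wired into the factory, enabling it is just a config flip; a minimal sketch, following the `Config` subclassing pattern this commit's test suite uses:

from app import create_app
from app.config import Config

class EagerCleanupConfig(Config):
    CLEANUP_ON_STARTUP = True
    OUTPUT_RETENTION_HOURS = 1  # aggressively short window, for illustration only

# Any result set older than one hour under OUTPUT_DIRECTORY is deleted
# before the first request is served.
app = create_app(EagerCleanupConfig)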

View File

@@ -2,6 +2,14 @@ import os
from pathlib import Path


def _get_bool_setting(name: str, default: bool) -> bool:
    """Parse conventional boolean environment values."""
    value = os.environ.get(name)
    if value is None:
        return default
    return value.strip().lower() in {"1", "true", "yes", "on"}


def _get_max_content_length() -> int:
    """Resolve the upload size limit from environment settings."""
    upload_limit_mb = os.environ.get("MAX_UPLOAD_SIZE_MB")
@@ -25,3 +33,6 @@ class Config:
    OUTPUT_DIRECTORY = Path(
        os.environ.get("OUTPUT_DIRECTORY", Path("instance") / "outputs")
    )
    OUTPUT_RETENTION_HOURS = int(os.environ.get("OUTPUT_RETENTION_HOURS", 24))
    CLEANUP_ON_STARTUP = _get_bool_setting("CLEANUP_ON_STARTUP", True)
    CLEANUP_AFTER_DOWNLOAD = _get_bool_setting("CLEANUP_AFTER_DOWNLOAD", False)
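
One subtlety of `_get_bool_setting`: a set-but-unrecognised value parses to False regardless of the default. A small illustrative script (not part of the commit; the private helper is imported here only to demonstrate the rule):

import os
from app.config import _get_bool_setting

os.environ["CLEANUP_ON_STARTUP"] = " True "
assert _get_bool_setting("CLEANUP_ON_STARTUP", False) is True   # case and whitespace tolerated

os.environ["CLEANUP_AFTER_DOWNLOAD"] = "off"
assert _get_bool_setting("CLEANUP_AFTER_DOWNLOAD", True) is False  # unrecognised values are False, not the default

os.environ.pop("CLEANUP_ON_STARTUP")
assert _get_bool_setting("CLEANUP_ON_STARTUP", True) is True    # unset falls back to the default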

View File

@@ -12,12 +12,13 @@ from flask import (
    url_for,
)
from werkzeug.datastructures import FileStorage
from werkzeug.wsgi import ClosingIterator
from app.constants import MODES, OUTPUT_FORMATS, SORTABLE_FIELDS, SORT_ORDERS
from app.services.conversion import convert_uploaded_log
from app.services.parser import LogParseError
from app.services.processing import ProcessingError, ProcessingOptions
from app.services.storage import delete_result_files, load_result_metadata

main_blueprint = Blueprint("main", __name__)
@@ -139,10 +140,17 @@ def download(result_id: str):
flash("Requested output file could not be found.", "danger") flash("Requested output file could not be found.", "danger")
return redirect(url_for("main.index")) return redirect(url_for("main.index"))
return send_file( response = send_file(
Path(metadata["file_path"]), Path(metadata["file_path"]),
as_attachment=True, as_attachment=True,
download_name=metadata["download_name"], download_name=metadata["download_name"],
mimetype=metadata["mimetype"], mimetype=metadata["mimetype"],
max_age=0, max_age=0,
) )
if current_app.config.get("CLEANUP_AFTER_DOWNLOAD", False):
output_dir = current_app.config["OUTPUT_DIRECTORY"]
response.response = ClosingIterator(
response.response,
[lambda: delete_result_files(output_dir=output_dir, result_id=result_id)],
)
return response
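
The deferred deletion leans on werkzeug's ClosingIterator, which runs its callbacks when the WSGI server closes the response iterable, i.e. only after the last byte has been handed off. A minimal standalone sketch of that mechanism (names here are illustrative, not from the commit):

from werkzeug.wsgi import ClosingIterator

def body():
    yield b"chunk-1"
    yield b"chunk-2"

def after_send():
    print("file can be deleted now: the response has been fully consumed")

wrapped = ClosingIterator(body(), [after_send])
for chunk in wrapped:   # the WSGI server iterates the response body...
    print(chunk)
wrapped.close()         # ...then calls close(), which fires the callbacks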

View File

@@ -1,6 +1,7 @@
import json
import uuid
from dataclasses import asdict, dataclass
from datetime import datetime, timedelta, timezone
from pathlib import Path

from app.services.exporter import ExportResult, write_export
@@ -14,6 +15,12 @@ class ResultMetadata:
    mimetype: str


def _result_paths(output_dir: Path, result_id: str) -> tuple[Path, Path]:
    """Build the sidecar metadata and output file search pattern for a result id."""
    metadata_path = output_dir / f"{result_id}.json"
    return metadata_path, output_dir / f"{result_id}"


def persist_result(
    output_dir: Path,
    records: list[dict[str, str]],
@@ -50,8 +57,53 @@ def persist_result(
def load_result_metadata(output_dir: Path, result_id: str) -> dict[str, str] | None:
    """Load sidecar metadata for a generated file."""
    metadata_path, _base_path = _result_paths(output_dir, result_id)
    if not metadata_path.exists():
        return None
    return json.loads(metadata_path.read_text(encoding="utf-8"))
def delete_result_files(output_dir: Path, result_id: str) -> None:
    """Delete a generated output file and its metadata sidecar if they still exist."""
    metadata_path, base_path = _result_paths(output_dir, result_id)
    for output_file in output_dir.glob(f"{base_path.name}.*"):
        if output_file.name == metadata_path.name:
            continue
        output_file.unlink(missing_ok=True)
    metadata_path.unlink(missing_ok=True)


def cleanup_expired_outputs(output_dir: Path, retention_hours: int) -> int:
    """Delete generated output sets older than the configured retention window."""
    cutoff = datetime.now(timezone.utc) - timedelta(hours=retention_hours)
    deleted_results = 0
    for metadata_path in output_dir.glob("*.json"):
        try:
            payload = json.loads(metadata_path.read_text(encoding="utf-8"))
        except (OSError, json.JSONDecodeError):
            payload = {}
        result_id = payload.get("result_id") or metadata_path.stem
        file_path = Path(payload["file_path"]) if "file_path" in payload else None
        newest_mtime = _newest_mtime(metadata_path, file_path)
        if newest_mtime is None or newest_mtime >= cutoff:
            continue
        delete_result_files(output_dir=output_dir, result_id=result_id)
        deleted_results += 1
    return deleted_results


def _newest_mtime(metadata_path: Path, file_path: Path | None) -> datetime | None:
    """Return the newest modification time across the metadata and output file."""
    mtimes: list[datetime] = []
    if metadata_path.exists():
        mtimes.append(datetime.fromtimestamp(metadata_path.stat().st_mtime, tz=timezone.utc))
    if file_path is not None and file_path.exists():
        mtimes.append(datetime.fromtimestamp(file_path.stat().st_mtime, tz=timezone.utc))
    if not mtimes:
        return None
    return max(mtimes)
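
Together these helpers form a small retention API; a hedged usage sketch (the result id below is made up):

from pathlib import Path

from app.services.storage import cleanup_expired_outputs, delete_result_files

output_dir = Path("instance") / "outputs"

# Drop one result set explicitly: the output file(s) plus the .json sidecar.
delete_result_files(output_dir=output_dir, result_id="hypothetical-id")

# Sweep every result set older than the retention window; returns the count removed.
removed = cleanup_expired_outputs(output_dir=output_dir, retention_hours=24)
print(f"pruned {removed} expired result set(s)")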

env
View File

@@ -1,3 +1,6 @@
SECRET_KEY=change-me
MAX_UPLOAD_SIZE_MB=120
OUTPUT_DIRECTORY=/app/instance/outputs
OUTPUT_RETENTION_HOURS=24
CLEANUP_ON_STARTUP=true
CLEANUP_AFTER_DOWNLOAD=false

View File

@@ -12,6 +12,9 @@ class TestConfig:
    MAX_CONTENT_LENGTH = 100 * 1024 * 1024
    PREVIEW_RECORD_LIMIT = 5
    OUTPUT_DIRECTORY = "test-outputs"
    OUTPUT_RETENTION_HOURS = 24
    CLEANUP_ON_STARTUP = False
    CLEANUP_AFTER_DOWNLOAD = False


@pytest.fixture()

View File

@@ -1,4 +1,6 @@
import io
import json
import os
from pathlib import Path

from app import create_app
@@ -132,6 +134,95 @@ def test_download_route_returns_generated_file(client):
    download_response.close()
def test_download_route_can_cleanup_files_after_download(tmp_path):
    class CleanupAfterDownloadConfig:
        TESTING = True
        SECRET_KEY = "test-secret"
        MAX_CONTENT_LENGTH = 100 * 1024 * 1024
        PREVIEW_RECORD_LIMIT = 5
        OUTPUT_DIRECTORY = tmp_path / "download-cleanup-outputs"
        OUTPUT_RETENTION_HOURS = 24
        CLEANUP_ON_STARTUP = False
        CLEANUP_AFTER_DOWNLOAD = True

    app = create_app(CleanupAfterDownloadConfig)
    client = app.test_client()
    log_file = io.BytesIO(SAMPLE_LOG.encode("utf-8"))
    convert_response = client.post(
        "/convert",
        data={
            "mode": "vendor",
            "output_format": "csv",
            "sort_by": "datetime",
            "order": "asc",
            "policy_cs": "",
            "policy_ci": "",
            "severity_cs": "",
            "severity_ci": "",
            "log_file": (log_file, "sample.log"),
        },
        content_type="multipart/form-data",
    )
    log_file.close()
    html = convert_response.data.decode("utf-8")
    marker = "/download/"
    start = html.index(marker) + len(marker)
    end = html.index('"', start)
    result_id = html[start:end]
    metadata_path = Path(app.config["OUTPUT_DIRECTORY"]) / f"{result_id}.json"
    download_response = client.get(f"/download/{result_id}")
    download_response.close()
    convert_response.close()
    assert not metadata_path.exists()
def test_cleanup_on_startup_removes_expired_outputs(tmp_path):
    output_dir = tmp_path / "startup-cleanup-outputs"
    output_dir.mkdir(parents=True)
    result_id = "expired-result"
    file_path = output_dir / f"{result_id}.csv"
    metadata_path = output_dir / f"{result_id}.json"
    file_path.write_text("header\nvalue\n", encoding="utf-8")
    metadata_path.write_text(
        json.dumps(
            {
                "result_id": result_id,
                "file_path": str(file_path),
                "download_name": "waf-report.csv",
                "mimetype": "text/csv; charset=utf-8",
            }
        ),
        encoding="utf-8",
    )
    old_timestamp = 946684800  # 2000-01-01 UTC, far outside any retention window
    os.utime(file_path, (old_timestamp, old_timestamp))
    os.utime(metadata_path, (old_timestamp, old_timestamp))

    class StartupCleanupConfig:
        TESTING = True
        SECRET_KEY = "test-secret"
        MAX_CONTENT_LENGTH = 100 * 1024 * 1024
        PREVIEW_RECORD_LIMIT = 5
        OUTPUT_DIRECTORY = output_dir
        OUTPUT_RETENTION_HOURS = 1
        CLEANUP_ON_STARTUP = True
        CLEANUP_AFTER_DOWNLOAD = False

    create_app(StartupCleanupConfig)
    assert not file_path.exists()
    assert not metadata_path.exists()
def test_default_upload_limit_is_100_mib(app):
    assert app.config["MAX_CONTENT_LENGTH"] == 100 * 1024 * 1024

View File

@@ -1,6 +1,8 @@
import json
import os
from pathlib import Path

from app.services.storage import cleanup_expired_outputs, delete_result_files, persist_result
def test_persist_result_writes_csv_and_collects_preview(tmp_path: Path):
@@ -32,3 +34,42 @@ def test_persist_result_writes_csv_and_collects_preview(tmp_path: Path):
assert "v015xxxxdate,time,policy,severity_level" in written assert "v015xxxxdate,time,policy,severity_level" in written
assert "2024-05-01,10:00:00,Prod Policy,high" in written assert "2024-05-01,10:00:00,Prod Policy,high" in written
assert export_result.preview(1).count("\n") == 1 assert export_result.preview(1).count("\n") == 1
def test_delete_result_files_removes_output_and_metadata(tmp_path: Path):
    result_id = "delete-me"
    output_file = tmp_path / f"{result_id}.txt"
    metadata_file = tmp_path / f"{result_id}.json"
    output_file.write_text("content", encoding="utf-8")
    metadata_file.write_text("{}", encoding="utf-8")
    delete_result_files(output_dir=tmp_path, result_id=result_id)
    assert not output_file.exists()
    assert not metadata_file.exists()


def test_cleanup_expired_outputs_removes_only_old_results(tmp_path: Path):
    old_result_id = "old-result"
    new_result_id = "new-result"
    old_output = tmp_path / f"{old_result_id}.csv"
    old_metadata = tmp_path / f"{old_result_id}.json"
    new_output = tmp_path / f"{new_result_id}.csv"
    new_metadata = tmp_path / f"{new_result_id}.json"
    old_output.write_text("old", encoding="utf-8")
    new_output.write_text("new", encoding="utf-8")
    old_metadata.write_text(json.dumps({"result_id": old_result_id, "file_path": str(old_output)}), encoding="utf-8")
    new_metadata.write_text(json.dumps({"result_id": new_result_id, "file_path": str(new_output)}), encoding="utf-8")
    old_timestamp = 946684800
    os.utime(old_output, (old_timestamp, old_timestamp))
    os.utime(old_metadata, (old_timestamp, old_timestamp))
    deleted_results = cleanup_expired_outputs(output_dir=tmp_path, retention_hours=1)
    assert deleted_results == 1
    assert not old_output.exists()
    assert not old_metadata.exists()
    assert new_output.exists()
    assert new_metadata.exists()