diff --git a/README.md b/README.md
index e56b9e4..c409748 100644
--- a/README.md
+++ b/README.md
@@ -91,6 +91,9 @@ Compose settings are stored in `env`. Update that file to change values such as:
 - `SECRET_KEY`
 - `MAX_UPLOAD_SIZE_MB`
 - `OUTPUT_DIRECTORY`
+- `OUTPUT_RETENTION_HOURS`
+- `CLEANUP_ON_STARTUP`
+- `CLEANUP_AFTER_DOWNLOAD`
 
 ### Run the test suite in a container
 
@@ -126,6 +129,7 @@ curl -X POST http://127.0.0.1:5000/convert \
 ## Notes
 
 - Temporary output files are written to `instance/outputs`
+- Generated files are cleaned up according to the configured output retention policy
 - The application does not require a database
 - Gunicorn is used as the production WSGI server
 - Parsing and export writing are streamed to reduce memory usage on large uploads
@@ -133,3 +137,6 @@ curl -X POST http://127.0.0.1:5000/convert \
 - Default upload limit is 100 MiB
 - Set `MAX_UPLOAD_SIZE_MB` to configure the upload limit in megabytes
 - `MAX_CONTENT_LENGTH` is also supported as a lower-level byte-based override
+- `OUTPUT_RETENTION_HOURS` controls how long generated output files are kept
+- `CLEANUP_ON_STARTUP=true` removes expired generated files when the app starts
+- `CLEANUP_AFTER_DOWNLOAD=true` deletes a result only after the response finishes sending
diff --git a/app/__init__.py b/app/__init__.py
index 0399324..9ce12cb 100644
--- a/app/__init__.py
+++ b/app/__init__.py
@@ -5,6 +5,7 @@ from werkzeug.exceptions import RequestEntityTooLarge
 
 from app.config import Config
 from app.routes import main_blueprint
+from app.services.storage import cleanup_expired_outputs
 
 
 def _format_size_limit(size_limit_bytes: int) -> str:
@@ -27,6 +28,12 @@ def create_app(config_class: type[Config] = Config) -> Flask:
     app.config["OUTPUT_DIRECTORY"] = output_dir
     output_dir.mkdir(parents=True, exist_ok=True)
 
+    if app.config.get("CLEANUP_ON_STARTUP", False):
+        cleanup_expired_outputs(
+            output_dir=output_dir,
+            retention_hours=app.config.get("OUTPUT_RETENTION_HOURS", 24),
+        )
+
     app.register_blueprint(main_blueprint)
 
     @app.errorhandler(RequestEntityTooLarge)
diff --git a/app/config.py b/app/config.py
index 814f092..6d84372 100644
--- a/app/config.py
+++ b/app/config.py
@@ -2,6 +2,14 @@ import os
 from pathlib import Path
 
 
+def _get_bool_setting(name: str, default: bool) -> bool:
+    """Parse conventional boolean environment values."""
+    value = os.environ.get(name)
+    if value is None:
+        return default
+    return value.strip().lower() in {"1", "true", "yes", "on"}
+
+
 def _get_max_content_length() -> int:
     """Resolve the upload size limit from environment settings."""
     upload_limit_mb = os.environ.get("MAX_UPLOAD_SIZE_MB")
@@ -25,3 +33,6 @@ class Config:
     OUTPUT_DIRECTORY = Path(
         os.environ.get("OUTPUT_DIRECTORY", Path("instance") / "outputs")
     )
+    OUTPUT_RETENTION_HOURS = int(os.environ.get("OUTPUT_RETENTION_HOURS", 24))
+    CLEANUP_ON_STARTUP = _get_bool_setting("CLEANUP_ON_STARTUP", True)
+    CLEANUP_AFTER_DOWNLOAD = _get_bool_setting("CLEANUP_AFTER_DOWNLOAD", False)
diff --git a/app/routes.py b/app/routes.py
index a79b245..33b4378 100644
--- a/app/routes.py
+++ b/app/routes.py
@@ -12,12 +12,13 @@ from flask import (
     url_for,
 )
 from werkzeug.datastructures import FileStorage
+from werkzeug.wsgi import ClosingIterator
 
 from app.constants import MODES, OUTPUT_FORMATS, SORTABLE_FIELDS, SORT_ORDERS
 from app.services.conversion import convert_uploaded_log
 from app.services.parser import LogParseError
 from app.services.processing import ProcessingError, ProcessingOptions
-from app.services.storage import load_result_metadata
+from app.services.storage import delete_result_files, load_result_metadata
 
 main_blueprint = Blueprint("main", __name__)
 
@@ -139,10 +140,19 @@ def download(result_id: str):
         flash("Requested output file could not be found.", "danger")
         return redirect(url_for("main.index"))
 
-    return send_file(
+    response = send_file(
         Path(metadata["file_path"]),
         as_attachment=True,
         download_name=metadata["download_name"],
         mimetype=metadata["mimetype"],
         max_age=0,
     )
+    if current_app.config.get("CLEANUP_AFTER_DOWNLOAD", False):
+        output_dir = current_app.config["OUTPUT_DIRECTORY"]
+        # Defer deletion until the WSGI server has finished iterating the
+        # response body, so the file is never removed mid-transfer.
+        response.response = ClosingIterator(
+            response.response,
+            [lambda: delete_result_files(output_dir=output_dir, result_id=result_id)],
+        )
+    return response
diff --git a/app/services/storage.py b/app/services/storage.py
index b2dde1b..7b0ed95 100644
--- a/app/services/storage.py
+++ b/app/services/storage.py
@@ -1,6 +1,7 @@
 import json
 import uuid
 from dataclasses import asdict, dataclass
+from datetime import datetime, timedelta, timezone
 from pathlib import Path
 
 from app.services.exporter import ExportResult, write_export
@@ -14,6 +15,12 @@ class ResultMetadata:
     mimetype: str
 
 
+def _result_paths(output_dir: Path, result_id: str) -> tuple[Path, Path]:
+    """Return the metadata sidecar path and the extension-less base path for a result id."""
+    metadata_path = output_dir / f"{result_id}.json"
+    return metadata_path, output_dir / f"{result_id}"
+
+
 def persist_result(
     output_dir: Path,
     records: list[dict[str, str]],
@@ -50,8 +57,53 @@ def persist_result(
 
 def load_result_metadata(output_dir: Path, result_id: str) -> dict[str, str] | None:
     """Load sidecar metadata for a generated file."""
-    metadata_path = output_dir / f"{result_id}.json"
+    metadata_path, _base_path = _result_paths(output_dir, result_id)
 
     if not metadata_path.exists():
         return None
 
     return json.loads(metadata_path.read_text(encoding="utf-8"))
+
+
+def delete_result_files(output_dir: Path, result_id: str) -> None:
+    """Delete a generated output file and its metadata sidecar if they still exist."""
+    metadata_path, base_path = _result_paths(output_dir, result_id)
+    for output_file in output_dir.glob(f"{base_path.name}.*"):
+        if output_file.name == metadata_path.name:
+            continue
+        output_file.unlink(missing_ok=True)
+    metadata_path.unlink(missing_ok=True)
+
+
+def cleanup_expired_outputs(output_dir: Path, retention_hours: int) -> int:
+    """Delete generated output sets older than the configured retention window."""
+    cutoff = datetime.now(timezone.utc) - timedelta(hours=retention_hours)
+    deleted_results = 0
+
+    for metadata_path in output_dir.glob("*.json"):
+        try:
+            payload = json.loads(metadata_path.read_text(encoding="utf-8"))
+        except (OSError, json.JSONDecodeError):
+            payload = {}
+
+        result_id = payload.get("result_id") or metadata_path.stem
+        file_path = Path(payload["file_path"]) if "file_path" in payload else None
+        newest_mtime = _newest_mtime(metadata_path, file_path)
+        if newest_mtime is None or newest_mtime >= cutoff:
+            continue
+
+        delete_result_files(output_dir=output_dir, result_id=result_id)
+        deleted_results += 1
+
+    return deleted_results
+
+
+def _newest_mtime(metadata_path: Path, file_path: Path | None) -> datetime | None:
+    """Return the newest modification time across the metadata and output file."""
+    mtimes: list[datetime] = []
+    if metadata_path.exists():
+        mtimes.append(datetime.fromtimestamp(metadata_path.stat().st_mtime, tz=timezone.utc))
+    if file_path is not None and file_path.exists():
+        mtimes.append(datetime.fromtimestamp(file_path.stat().st_mtime, tz=timezone.utc))
+    if not mtimes:
+        return None
+    return max(mtimes)
diff --git a/env b/env
index ddf01ce..6511618 100644
--- a/env
+++ b/env
@@ -1,3 +1,6 @@
 SECRET_KEY=change-me
 MAX_UPLOAD_SIZE_MB=120
 OUTPUT_DIRECTORY=/app/instance/outputs
+OUTPUT_RETENTION_HOURS=24
+CLEANUP_ON_STARTUP=true
+CLEANUP_AFTER_DOWNLOAD=false
diff --git a/tests/conftest.py b/tests/conftest.py
index 59c868c..bb48f41 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -12,6 +12,9 @@ class TestConfig:
     MAX_CONTENT_LENGTH = 100 * 1024 * 1024
     PREVIEW_RECORD_LIMIT = 5
     OUTPUT_DIRECTORY = "test-outputs"
+    OUTPUT_RETENTION_HOURS = 24
+    CLEANUP_ON_STARTUP = False
+    CLEANUP_AFTER_DOWNLOAD = False
 
 
 @pytest.fixture()
diff --git a/tests/test_app.py b/tests/test_app.py
index 6351c21..e8462b7 100644
--- a/tests/test_app.py
+++ b/tests/test_app.py
@@ -1,4 +1,7 @@
 import io
+import json
+import os
+from pathlib import Path
 
 from app import create_app
 
@@ -132,6 +135,90 @@ def test_download_route_returns_generated_file(client):
     download_response.close()
 
 
+def test_download_route_can_cleanup_files_after_download(tmp_path):
+    class CleanupAfterDownloadConfig:
+        TESTING = True
+        SECRET_KEY = "test-secret"
+        MAX_CONTENT_LENGTH = 100 * 1024 * 1024
+        PREVIEW_RECORD_LIMIT = 5
+        OUTPUT_DIRECTORY = tmp_path / "download-cleanup-outputs"
+        OUTPUT_RETENTION_HOURS = 24
+        CLEANUP_ON_STARTUP = False
+        CLEANUP_AFTER_DOWNLOAD = True
+
+    app = create_app(CleanupAfterDownloadConfig)
+    client = app.test_client()
+    log_file = io.BytesIO(SAMPLE_LOG.encode("utf-8"))
+
+    convert_response = client.post(
+        "/convert",
+        data={
+            "mode": "vendor",
+            "output_format": "csv",
+            "sort_by": "datetime",
+            "order": "asc",
+            "policy_cs": "",
+            "policy_ci": "",
+            "severity_cs": "",
+            "severity_ci": "",
+            "log_file": (log_file, "sample.log"),
+        },
+        content_type="multipart/form-data",
+    )
+    log_file.close()
+
+    html = convert_response.data.decode("utf-8")
+    marker = "/download/"
+    start = html.index(marker) + len(marker)
+    end = html.index('"', start)
+    result_id = html[start:end]
+    metadata_path = Path(app.config["OUTPUT_DIRECTORY"]) / f"{result_id}.json"
+
+    download_response = client.get(f"/download/{result_id}")
+    download_response.close()
+    convert_response.close()
+
+    assert not metadata_path.exists()
+
+
+def test_cleanup_on_startup_removes_expired_outputs(tmp_path):
+    output_dir = tmp_path / "startup-cleanup-outputs"
+    output_dir.mkdir(parents=True)
+    result_id = "expired-result"
+    file_path = output_dir / f"{result_id}.csv"
+    metadata_path = output_dir / f"{result_id}.json"
+    file_path.write_text("header\nvalue\n", encoding="utf-8")
+    metadata_path.write_text(
+        json.dumps(
+            {
+                "result_id": result_id,
+                "file_path": str(file_path),
+                "download_name": "waf-report.csv",
+                "mimetype": "text/csv; charset=utf-8",
+            }
+        ),
+        encoding="utf-8",
+    )
+    old_timestamp = 946684800
+    os.utime(file_path, (old_timestamp, old_timestamp))
+    os.utime(metadata_path, (old_timestamp, old_timestamp))
+
+    class StartupCleanupConfig:
+        TESTING = True
+        SECRET_KEY = "test-secret"
+        MAX_CONTENT_LENGTH = 100 * 1024 * 1024
+        PREVIEW_RECORD_LIMIT = 5
+        OUTPUT_DIRECTORY = output_dir
+        OUTPUT_RETENTION_HOURS = 1
+        CLEANUP_ON_STARTUP = True
+        CLEANUP_AFTER_DOWNLOAD = False
+
+    create_app(StartupCleanupConfig)
+
+    assert not file_path.exists()
+    assert not metadata_path.exists()
+
+
 def test_default_upload_limit_is_100_mib(app):
     assert app.config["MAX_CONTENT_LENGTH"] == 100 * 1024 * 1024
 
diff --git a/tests/test_storage.py b/tests/test_storage.py
index 003eb3b..853d177 100644
--- a/tests/test_storage.py
+++ b/tests/test_storage.py
@@ -1,6 +1,8 @@
+import json
+import os
 from pathlib import Path
 
-from app.services.storage import persist_result
+from app.services.storage import cleanup_expired_outputs, delete_result_files, persist_result
 
 
 def test_persist_result_writes_csv_and_collects_preview(tmp_path: Path):
@@ -32,3 +34,42 @@ def test_persist_result_writes_csv_and_collects_preview(tmp_path: Path):
     assert "date,time,policy,severity_level" in written
     assert "2024-05-01,10:00:00,Prod Policy,high" in written
     assert export_result.preview(1).count("\n") == 1
+
+
+def test_delete_result_files_removes_output_and_metadata(tmp_path: Path):
+    result_id = "delete-me"
+    output_file = tmp_path / f"{result_id}.txt"
+    metadata_file = tmp_path / f"{result_id}.json"
+    output_file.write_text("content", encoding="utf-8")
+    metadata_file.write_text("{}", encoding="utf-8")
+
+    delete_result_files(output_dir=tmp_path, result_id=result_id)
+
+    assert not output_file.exists()
+    assert not metadata_file.exists()
+
+
+def test_cleanup_expired_outputs_removes_only_old_results(tmp_path: Path):
+    old_result_id = "old-result"
+    new_result_id = "new-result"
+    old_output = tmp_path / f"{old_result_id}.csv"
+    old_metadata = tmp_path / f"{old_result_id}.json"
+    new_output = tmp_path / f"{new_result_id}.csv"
+    new_metadata = tmp_path / f"{new_result_id}.json"
+
+    old_output.write_text("old", encoding="utf-8")
+    new_output.write_text("new", encoding="utf-8")
+    old_metadata.write_text(json.dumps({"result_id": old_result_id, "file_path": str(old_output)}), encoding="utf-8")
+    new_metadata.write_text(json.dumps({"result_id": new_result_id, "file_path": str(new_output)}), encoding="utf-8")
+
+    old_timestamp = 946684800
+    os.utime(old_output, (old_timestamp, old_timestamp))
+    os.utime(old_metadata, (old_timestamp, old_timestamp))
+
+    deleted_results = cleanup_expired_outputs(output_dir=tmp_path, retention_hours=1)
+
+    assert deleted_results == 1
+    assert not old_output.exists()
+    assert not old_metadata.exists()
+    assert new_output.exists()
+    assert new_metadata.exists()
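
The CLEANUP_AFTER_DOWNLOAD path in app/routes.py relies on werkzeug.wsgi.ClosingIterator,
which runs its callbacks only after the WSGI server has finished iterating and closed the
response body. A minimal standalone sketch of the same pattern, assuming a Flask app; the
/demo route, example.csv file name, and print callback are illustrative stand-ins, not part
of this change:

    from flask import Flask, send_file
    from werkzeug.wsgi import ClosingIterator

    app = Flask(__name__)

    @app.route("/demo")
    def demo():
        # Stream a file, then run a callback once the body has been fully sent.
        response = send_file("example.csv", as_attachment=True, max_age=0)
        response.response = ClosingIterator(
            response.response,
            # Callbacks fire when the server closes the response iterator,
            # i.e. after the last chunk is handed off, so deleting the file
            # at that point cannot truncate an in-flight download.
            [lambda: print("example.csv fully sent; cleanup can run now")],
        )
        return response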