Add output cleanup policy
This commit is contained in:
@@ -91,6 +91,9 @@ Compose settings are stored in `env`. Update that file to change values such as:
|
|||||||
- `SECRET_KEY`
|
- `SECRET_KEY`
|
||||||
- `MAX_UPLOAD_SIZE_MB`
|
- `MAX_UPLOAD_SIZE_MB`
|
||||||
- `OUTPUT_DIRECTORY`
|
- `OUTPUT_DIRECTORY`
|
||||||
|
- `OUTPUT_RETENTION_HOURS`
|
||||||
|
- `CLEANUP_ON_STARTUP`
|
||||||
|
- `CLEANUP_AFTER_DOWNLOAD`
|
||||||
|
|
||||||
### Run the test suite in a container
|
### Run the test suite in a container
|
||||||
|
|
||||||
@@ -126,6 +129,7 @@ curl -X POST http://127.0.0.1:5000/convert \
|
|||||||
## Notes
|
## Notes
|
||||||
|
|
||||||
- Temporary output files are written to `instance/outputs`
|
- Temporary output files are written to `instance/outputs`
|
||||||
|
- Generated files are cleaned up according to the configured output retention policy
|
||||||
- The application does not require a database
|
- The application does not require a database
|
||||||
- Gunicorn is used as the production WSGI server
|
- Gunicorn is used as the production WSGI server
|
||||||
- Parsing and export writing are streamed to reduce memory usage on large uploads
|
- Parsing and export writing are streamed to reduce memory usage on large uploads
|
||||||
@@ -133,3 +137,6 @@ curl -X POST http://127.0.0.1:5000/convert \
|
|||||||
- Default upload limit is 100 MiB
|
- Default upload limit is 100 MiB
|
||||||
- Set `MAX_UPLOAD_SIZE_MB` to configure the upload limit in megabytes
|
- Set `MAX_UPLOAD_SIZE_MB` to configure the upload limit in megabytes
|
||||||
- `MAX_CONTENT_LENGTH` is also supported as a lower-level byte-based override
|
- `MAX_CONTENT_LENGTH` is also supported as a lower-level byte-based override
|
||||||
|
- `OUTPUT_RETENTION_HOURS` controls how long generated output files are kept
|
||||||
|
- `CLEANUP_ON_STARTUP=true` removes expired generated files when the app starts
|
||||||
|
- `CLEANUP_AFTER_DOWNLOAD=true` deletes a result only after the response finishes sending
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ from werkzeug.exceptions import RequestEntityTooLarge
|
|||||||
|
|
||||||
from app.config import Config
|
from app.config import Config
|
||||||
from app.routes import main_blueprint
|
from app.routes import main_blueprint
|
||||||
|
from app.services.storage import cleanup_expired_outputs
|
||||||
|
|
||||||
|
|
||||||
def _format_size_limit(size_limit_bytes: int) -> str:
|
def _format_size_limit(size_limit_bytes: int) -> str:
|
||||||
@@ -27,6 +28,12 @@ def create_app(config_class: type[Config] = Config) -> Flask:
|
|||||||
app.config["OUTPUT_DIRECTORY"] = output_dir
|
app.config["OUTPUT_DIRECTORY"] = output_dir
|
||||||
output_dir.mkdir(parents=True, exist_ok=True)
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
if app.config.get("CLEANUP_ON_STARTUP", False):
|
||||||
|
cleanup_expired_outputs(
|
||||||
|
output_dir=output_dir,
|
||||||
|
retention_hours=app.config.get("OUTPUT_RETENTION_HOURS", 24),
|
||||||
|
)
|
||||||
|
|
||||||
app.register_blueprint(main_blueprint)
|
app.register_blueprint(main_blueprint)
|
||||||
|
|
||||||
@app.errorhandler(RequestEntityTooLarge)
|
@app.errorhandler(RequestEntityTooLarge)
|
||||||
|
|||||||
@@ -2,6 +2,14 @@ import os
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def _get_bool_setting(name: str, default: bool) -> bool:
|
||||||
|
"""Parse conventional boolean environment values."""
|
||||||
|
value = os.environ.get(name)
|
||||||
|
if value is None:
|
||||||
|
return default
|
||||||
|
return value.strip().lower() in {"1", "true", "yes", "on"}
|
||||||
|
|
||||||
|
|
||||||
def _get_max_content_length() -> int:
|
def _get_max_content_length() -> int:
|
||||||
"""Resolve the upload size limit from environment settings."""
|
"""Resolve the upload size limit from environment settings."""
|
||||||
upload_limit_mb = os.environ.get("MAX_UPLOAD_SIZE_MB")
|
upload_limit_mb = os.environ.get("MAX_UPLOAD_SIZE_MB")
|
||||||
@@ -25,3 +33,6 @@ class Config:
|
|||||||
OUTPUT_DIRECTORY = Path(
|
OUTPUT_DIRECTORY = Path(
|
||||||
os.environ.get("OUTPUT_DIRECTORY", Path("instance") / "outputs")
|
os.environ.get("OUTPUT_DIRECTORY", Path("instance") / "outputs")
|
||||||
)
|
)
|
||||||
|
OUTPUT_RETENTION_HOURS = int(os.environ.get("OUTPUT_RETENTION_HOURS", 24))
|
||||||
|
CLEANUP_ON_STARTUP = _get_bool_setting("CLEANUP_ON_STARTUP", True)
|
||||||
|
CLEANUP_AFTER_DOWNLOAD = _get_bool_setting("CLEANUP_AFTER_DOWNLOAD", False)
|
||||||
|
|||||||
@@ -12,12 +12,13 @@ from flask import (
|
|||||||
url_for,
|
url_for,
|
||||||
)
|
)
|
||||||
from werkzeug.datastructures import FileStorage
|
from werkzeug.datastructures import FileStorage
|
||||||
|
from werkzeug.wsgi import ClosingIterator
|
||||||
|
|
||||||
from app.constants import MODES, OUTPUT_FORMATS, SORTABLE_FIELDS, SORT_ORDERS
|
from app.constants import MODES, OUTPUT_FORMATS, SORTABLE_FIELDS, SORT_ORDERS
|
||||||
from app.services.conversion import convert_uploaded_log
|
from app.services.conversion import convert_uploaded_log
|
||||||
from app.services.parser import LogParseError
|
from app.services.parser import LogParseError
|
||||||
from app.services.processing import ProcessingError, ProcessingOptions
|
from app.services.processing import ProcessingError, ProcessingOptions
|
||||||
from app.services.storage import load_result_metadata
|
from app.services.storage import delete_result_files, load_result_metadata
|
||||||
|
|
||||||
main_blueprint = Blueprint("main", __name__)
|
main_blueprint = Blueprint("main", __name__)
|
||||||
|
|
||||||
@@ -139,10 +140,17 @@ def download(result_id: str):
|
|||||||
flash("Requested output file could not be found.", "danger")
|
flash("Requested output file could not be found.", "danger")
|
||||||
return redirect(url_for("main.index"))
|
return redirect(url_for("main.index"))
|
||||||
|
|
||||||
return send_file(
|
response = send_file(
|
||||||
Path(metadata["file_path"]),
|
Path(metadata["file_path"]),
|
||||||
as_attachment=True,
|
as_attachment=True,
|
||||||
download_name=metadata["download_name"],
|
download_name=metadata["download_name"],
|
||||||
mimetype=metadata["mimetype"],
|
mimetype=metadata["mimetype"],
|
||||||
max_age=0,
|
max_age=0,
|
||||||
)
|
)
|
||||||
|
if current_app.config.get("CLEANUP_AFTER_DOWNLOAD", False):
|
||||||
|
output_dir = current_app.config["OUTPUT_DIRECTORY"]
|
||||||
|
response.response = ClosingIterator(
|
||||||
|
response.response,
|
||||||
|
[lambda: delete_result_files(output_dir=output_dir, result_id=result_id)],
|
||||||
|
)
|
||||||
|
return response
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
import json
|
import json
|
||||||
import uuid
|
import uuid
|
||||||
from dataclasses import asdict, dataclass
|
from dataclasses import asdict, dataclass
|
||||||
|
from datetime import datetime, timedelta, timezone
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from app.services.exporter import ExportResult, write_export
|
from app.services.exporter import ExportResult, write_export
|
||||||
@@ -14,6 +15,12 @@ class ResultMetadata:
|
|||||||
mimetype: str
|
mimetype: str
|
||||||
|
|
||||||
|
|
||||||
|
def _result_paths(output_dir: Path, result_id: str) -> tuple[Path, Path]:
|
||||||
|
"""Build the sidecar metadata and output file search pattern for a result id."""
|
||||||
|
metadata_path = output_dir / f"{result_id}.json"
|
||||||
|
return metadata_path, output_dir / f"{result_id}"
|
||||||
|
|
||||||
|
|
||||||
def persist_result(
|
def persist_result(
|
||||||
output_dir: Path,
|
output_dir: Path,
|
||||||
records: list[dict[str, str]],
|
records: list[dict[str, str]],
|
||||||
@@ -50,8 +57,53 @@ def persist_result(
|
|||||||
|
|
||||||
def load_result_metadata(output_dir: Path, result_id: str) -> dict[str, str] | None:
|
def load_result_metadata(output_dir: Path, result_id: str) -> dict[str, str] | None:
|
||||||
"""Load sidecar metadata for a generated file."""
|
"""Load sidecar metadata for a generated file."""
|
||||||
metadata_path = output_dir / f"{result_id}.json"
|
metadata_path, _base_path = _result_paths(output_dir, result_id)
|
||||||
if not metadata_path.exists():
|
if not metadata_path.exists():
|
||||||
return None
|
return None
|
||||||
|
|
||||||
return json.loads(metadata_path.read_text(encoding="utf-8"))
|
return json.loads(metadata_path.read_text(encoding="utf-8"))
|
||||||
|
|
||||||
|
|
||||||
|
def delete_result_files(output_dir: Path, result_id: str) -> None:
|
||||||
|
"""Delete a generated output file and its metadata sidecar if they still exist."""
|
||||||
|
metadata_path, base_path = _result_paths(output_dir, result_id)
|
||||||
|
for output_file in output_dir.glob(f"{base_path.name}.*"):
|
||||||
|
if output_file.name == metadata_path.name:
|
||||||
|
continue
|
||||||
|
output_file.unlink(missing_ok=True)
|
||||||
|
metadata_path.unlink(missing_ok=True)
|
||||||
|
|
||||||
|
|
||||||
|
def cleanup_expired_outputs(output_dir: Path, retention_hours: int) -> int:
|
||||||
|
"""Delete generated output sets older than the configured retention window."""
|
||||||
|
cutoff = datetime.now(timezone.utc) - timedelta(hours=retention_hours)
|
||||||
|
deleted_results = 0
|
||||||
|
|
||||||
|
for metadata_path in output_dir.glob("*.json"):
|
||||||
|
try:
|
||||||
|
payload = json.loads(metadata_path.read_text(encoding="utf-8"))
|
||||||
|
except (OSError, json.JSONDecodeError):
|
||||||
|
payload = {}
|
||||||
|
|
||||||
|
result_id = payload.get("result_id") or metadata_path.stem
|
||||||
|
file_path = Path(payload["file_path"]) if "file_path" in payload else None
|
||||||
|
newest_mtime = _newest_mtime(metadata_path, file_path)
|
||||||
|
if newest_mtime is None or newest_mtime >= cutoff:
|
||||||
|
continue
|
||||||
|
|
||||||
|
delete_result_files(output_dir=output_dir, result_id=result_id)
|
||||||
|
deleted_results += 1
|
||||||
|
|
||||||
|
return deleted_results
|
||||||
|
|
||||||
|
|
||||||
|
def _newest_mtime(metadata_path: Path, file_path: Path | None) -> datetime | None:
|
||||||
|
"""Return the newest modification time across the metadata and output file."""
|
||||||
|
mtimes: list[datetime] = []
|
||||||
|
if metadata_path.exists():
|
||||||
|
mtimes.append(datetime.fromtimestamp(metadata_path.stat().st_mtime, tz=timezone.utc))
|
||||||
|
if file_path is not None and file_path.exists():
|
||||||
|
mtimes.append(datetime.fromtimestamp(file_path.stat().st_mtime, tz=timezone.utc))
|
||||||
|
if not mtimes:
|
||||||
|
return None
|
||||||
|
return max(mtimes)
|
||||||
|
|||||||
3
env
3
env
@@ -1,3 +1,6 @@
|
|||||||
SECRET_KEY=change-me
|
SECRET_KEY=change-me
|
||||||
MAX_UPLOAD_SIZE_MB=120
|
MAX_UPLOAD_SIZE_MB=120
|
||||||
OUTPUT_DIRECTORY=/app/instance/outputs
|
OUTPUT_DIRECTORY=/app/instance/outputs
|
||||||
|
OUTPUT_RETENTION_HOURS=24
|
||||||
|
CLEANUP_ON_STARTUP=true
|
||||||
|
CLEANUP_AFTER_DOWNLOAD=false
|
||||||
|
|||||||
@@ -12,6 +12,9 @@ class TestConfig:
|
|||||||
MAX_CONTENT_LENGTH = 100 * 1024 * 1024
|
MAX_CONTENT_LENGTH = 100 * 1024 * 1024
|
||||||
PREVIEW_RECORD_LIMIT = 5
|
PREVIEW_RECORD_LIMIT = 5
|
||||||
OUTPUT_DIRECTORY = "test-outputs"
|
OUTPUT_DIRECTORY = "test-outputs"
|
||||||
|
OUTPUT_RETENTION_HOURS = 24
|
||||||
|
CLEANUP_ON_STARTUP = False
|
||||||
|
CLEANUP_AFTER_DOWNLOAD = False
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture()
|
@pytest.fixture()
|
||||||
|
|||||||
@@ -1,4 +1,6 @@
|
|||||||
import io
|
import io
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
from app import create_app
|
from app import create_app
|
||||||
|
|
||||||
@@ -132,6 +134,95 @@ def test_download_route_returns_generated_file(client):
|
|||||||
download_response.close()
|
download_response.close()
|
||||||
|
|
||||||
|
|
||||||
|
def test_download_route_can_cleanup_files_after_download(tmp_path):
|
||||||
|
class CleanupAfterDownloadConfig:
|
||||||
|
TESTING = True
|
||||||
|
SECRET_KEY = "test-secret"
|
||||||
|
MAX_CONTENT_LENGTH = 100 * 1024 * 1024
|
||||||
|
PREVIEW_RECORD_LIMIT = 5
|
||||||
|
OUTPUT_DIRECTORY = tmp_path / "download-cleanup-outputs"
|
||||||
|
OUTPUT_RETENTION_HOURS = 24
|
||||||
|
CLEANUP_ON_STARTUP = False
|
||||||
|
CLEANUP_AFTER_DOWNLOAD = True
|
||||||
|
|
||||||
|
app = create_app(CleanupAfterDownloadConfig)
|
||||||
|
client = app.test_client()
|
||||||
|
log_file = io.BytesIO(SAMPLE_LOG.encode("utf-8"))
|
||||||
|
|
||||||
|
convert_response = client.post(
|
||||||
|
"/convert",
|
||||||
|
data={
|
||||||
|
"mode": "vendor",
|
||||||
|
"output_format": "csv",
|
||||||
|
"sort_by": "datetime",
|
||||||
|
"order": "asc",
|
||||||
|
"policy_cs": "",
|
||||||
|
"policy_ci": "",
|
||||||
|
"severity_cs": "",
|
||||||
|
"severity_ci": "",
|
||||||
|
"log_file": (log_file, "sample.log"),
|
||||||
|
},
|
||||||
|
content_type="multipart/form-data",
|
||||||
|
)
|
||||||
|
log_file.close()
|
||||||
|
|
||||||
|
html = convert_response.data.decode("utf-8")
|
||||||
|
marker = "/download/"
|
||||||
|
start = html.index(marker) + len(marker)
|
||||||
|
end = html.index('"', start)
|
||||||
|
result_id = html[start:end]
|
||||||
|
metadata_path = Path(app.config["OUTPUT_DIRECTORY"]) / f"{result_id}.json"
|
||||||
|
|
||||||
|
download_response = client.get(f"/download/{result_id}")
|
||||||
|
download_response.close()
|
||||||
|
convert_response.close()
|
||||||
|
|
||||||
|
assert not metadata_path.exists()
|
||||||
|
|
||||||
|
|
||||||
|
def test_cleanup_on_startup_removes_expired_outputs(tmp_path):
|
||||||
|
output_dir = tmp_path / "startup-cleanup-outputs"
|
||||||
|
output_dir.mkdir(parents=True)
|
||||||
|
result_id = "expired-result"
|
||||||
|
file_path = output_dir / f"{result_id}.csv"
|
||||||
|
metadata_path = output_dir / f"{result_id}.json"
|
||||||
|
file_path.write_text("header\nvalue\n", encoding="utf-8")
|
||||||
|
metadata_path.write_text(
|
||||||
|
json.dumps(
|
||||||
|
{
|
||||||
|
"result_id": result_id,
|
||||||
|
"file_path": str(file_path),
|
||||||
|
"download_name": "waf-report.csv",
|
||||||
|
"mimetype": "text/csv; charset=utf-8",
|
||||||
|
}
|
||||||
|
),
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
old_timestamp = 946684800
|
||||||
|
file_path.touch()
|
||||||
|
metadata_path.touch()
|
||||||
|
Path(file_path).touch()
|
||||||
|
import os
|
||||||
|
|
||||||
|
os.utime(file_path, (old_timestamp, old_timestamp))
|
||||||
|
os.utime(metadata_path, (old_timestamp, old_timestamp))
|
||||||
|
|
||||||
|
class StartupCleanupConfig:
|
||||||
|
TESTING = True
|
||||||
|
SECRET_KEY = "test-secret"
|
||||||
|
MAX_CONTENT_LENGTH = 100 * 1024 * 1024
|
||||||
|
PREVIEW_RECORD_LIMIT = 5
|
||||||
|
OUTPUT_DIRECTORY = output_dir
|
||||||
|
OUTPUT_RETENTION_HOURS = 1
|
||||||
|
CLEANUP_ON_STARTUP = True
|
||||||
|
CLEANUP_AFTER_DOWNLOAD = False
|
||||||
|
|
||||||
|
create_app(StartupCleanupConfig)
|
||||||
|
|
||||||
|
assert not file_path.exists()
|
||||||
|
assert not metadata_path.exists()
|
||||||
|
|
||||||
|
|
||||||
def test_default_upload_limit_is_100_mib(app):
|
def test_default_upload_limit_is_100_mib(app):
|
||||||
assert app.config["MAX_CONTENT_LENGTH"] == 100 * 1024 * 1024
|
assert app.config["MAX_CONTENT_LENGTH"] == 100 * 1024 * 1024
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,8 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from app.services.storage import persist_result
|
from app.services.storage import cleanup_expired_outputs, delete_result_files, persist_result
|
||||||
|
|
||||||
|
|
||||||
def test_persist_result_writes_csv_and_collects_preview(tmp_path: Path):
|
def test_persist_result_writes_csv_and_collects_preview(tmp_path: Path):
|
||||||
@@ -32,3 +34,42 @@ def test_persist_result_writes_csv_and_collects_preview(tmp_path: Path):
|
|||||||
assert "v015xxxxdate,time,policy,severity_level" in written
|
assert "v015xxxxdate,time,policy,severity_level" in written
|
||||||
assert "2024-05-01,10:00:00,Prod Policy,high" in written
|
assert "2024-05-01,10:00:00,Prod Policy,high" in written
|
||||||
assert export_result.preview(1).count("\n") == 1
|
assert export_result.preview(1).count("\n") == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_delete_result_files_removes_output_and_metadata(tmp_path: Path):
|
||||||
|
result_id = "delete-me"
|
||||||
|
output_file = tmp_path / f"{result_id}.txt"
|
||||||
|
metadata_file = tmp_path / f"{result_id}.json"
|
||||||
|
output_file.write_text("content", encoding="utf-8")
|
||||||
|
metadata_file.write_text("{}", encoding="utf-8")
|
||||||
|
|
||||||
|
delete_result_files(output_dir=tmp_path, result_id=result_id)
|
||||||
|
|
||||||
|
assert not output_file.exists()
|
||||||
|
assert not metadata_file.exists()
|
||||||
|
|
||||||
|
|
||||||
|
def test_cleanup_expired_outputs_removes_only_old_results(tmp_path: Path):
|
||||||
|
old_result_id = "old-result"
|
||||||
|
new_result_id = "new-result"
|
||||||
|
old_output = tmp_path / f"{old_result_id}.csv"
|
||||||
|
old_metadata = tmp_path / f"{old_result_id}.json"
|
||||||
|
new_output = tmp_path / f"{new_result_id}.csv"
|
||||||
|
new_metadata = tmp_path / f"{new_result_id}.json"
|
||||||
|
|
||||||
|
old_output.write_text("old", encoding="utf-8")
|
||||||
|
new_output.write_text("new", encoding="utf-8")
|
||||||
|
old_metadata.write_text(json.dumps({"result_id": old_result_id, "file_path": str(old_output)}), encoding="utf-8")
|
||||||
|
new_metadata.write_text(json.dumps({"result_id": new_result_id, "file_path": str(new_output)}), encoding="utf-8")
|
||||||
|
|
||||||
|
old_timestamp = 946684800
|
||||||
|
os.utime(old_output, (old_timestamp, old_timestamp))
|
||||||
|
os.utime(old_metadata, (old_timestamp, old_timestamp))
|
||||||
|
|
||||||
|
deleted_results = cleanup_expired_outputs(output_dir=tmp_path, retention_hours=1)
|
||||||
|
|
||||||
|
assert deleted_results == 1
|
||||||
|
assert not old_output.exists()
|
||||||
|
assert not old_metadata.exists()
|
||||||
|
assert new_output.exists()
|
||||||
|
assert new_metadata.exists()
|
||||||
|
|||||||
Reference in New Issue
Block a user