diff --git a/README.md b/README.md index dc69a5e..5cd2aa1 100644 --- a/README.md +++ b/README.md @@ -42,11 +42,20 @@ pip install -e ".[dev]" ```bash export FLASK_APP=wsgi.py +export MAX_UPLOAD_SIZE_MB=100 flask run --debug ``` Open `http://127.0.0.1:5000`. +### Example input file + +If you have a local WAF export such as `attack_download.log`, you can use it as a real example upload. + +- Example file: `attack_download.log` +- Approximate size in the current workspace: `98.5 MiB` +- The default `MAX_UPLOAD_SIZE_MB=100` setting is sized to accept a file of that size + ### Test ```bash @@ -64,7 +73,7 @@ docker build -t webfortilog . ### Run ```bash -docker run --rm -p 8000:8000 webfortilog +docker run --rm -p 8000:8000 -e MAX_UPLOAD_SIZE_MB=100 webfortilog ``` Open `http://127.0.0.1:8000`. @@ -83,8 +92,36 @@ docker compose up --build web docker compose run --rm test ``` +## Example usage + +### Browser upload + +1. Start the app with `flask run --debug` or `docker compose up --build web` +2. Open the web UI +3. Upload `attack_download.log` +4. Try `vendor` mode with `text` output for a readable preview +5. 
def _format_size_limit(size_limit_bytes: int) -> str:
    """Render the upload limit in a friendly unit for error messages.

    Units are 1024-based (1 MB here means 1024 * 1024 bytes). Whole values are
    shown without decimals ("100 MB"); fractional values are shown with one
    decimal place, rounded *down* so the message never advertises a limit
    larger than the one that is actually enforced.

    Args:
        size_limit_bytes: The configured upload limit in bytes.

    Returns:
        A short human-readable string such as "100 MB", "1.5 MB", "2 KB" or
        "128 bytes".
    """
    for unit_size, label in ((1024 * 1024, "MB"), (1024, "KB")):
        if size_limit_bytes >= unit_size:
            # Integer arithmetic avoids float rounding; flooring to tenths
            # keeps e.g. 1.5 MiB from being reported as "2 MB".
            tenths = size_limit_bytes * 10 // unit_size
            whole, fraction = divmod(tenths, 10)
            if fraction:
                return f"{whole}.{fraction} {label}"
            return f"{whole} {label}"
    return f"{size_limit_bytes} bytes"
Maximum allowed size is {_format_size_limit(size_limit_bytes)}.", + "danger", + ) return render_template("index.html"), 413 return app diff --git a/app/config.py b/app/config.py index 441a55e..814f092 100644 --- a/app/config.py +++ b/app/config.py @@ -2,11 +2,25 @@ import os from pathlib import Path +def _get_max_content_length() -> int: + """Resolve the upload size limit from environment settings.""" + upload_limit_mb = os.environ.get("MAX_UPLOAD_SIZE_MB") + if upload_limit_mb: + return int(upload_limit_mb) * 1024 * 1024 + + max_content_length = os.environ.get("MAX_CONTENT_LENGTH") + if max_content_length: + return int(max_content_length) + + return 100 * 1024 * 1024 + + class Config: """Default configuration for local and container usage.""" SECRET_KEY = os.environ.get("SECRET_KEY", "dev-secret-key-change-me") - MAX_CONTENT_LENGTH = int(os.environ.get("MAX_CONTENT_LENGTH", 10 * 1024 * 1024)) + # Default to 100 MiB so larger WAF exports can be processed without tuning. + MAX_CONTENT_LENGTH = _get_max_content_length() PREVIEW_RECORD_LIMIT = int(os.environ.get("PREVIEW_RECORD_LIMIT", 5)) OUTPUT_DIRECTORY = Path( os.environ.get("OUTPUT_DIRECTORY", Path("instance") / "outputs") diff --git a/app/routes.py b/app/routes.py index 1390d9c..0aff113 100644 --- a/app/routes.py +++ b/app/routes.py @@ -116,7 +116,10 @@ def convert(): flash(str(exc), "danger") return render_template("index.html", form=form), 400 except UnicodeDecodeError: - flash("The uploaded file is not valid UTF-8 text.", "danger") + flash( + "The uploaded file could not be decoded. 
Supported encodings are UTF-8, UTF-8 with BOM, Windows-1252, and Latin-1.", + "danger", + ) return render_template("index.html", form=form), 400 preview_limit = current_app.config["PREVIEW_RECORD_LIMIT"] diff --git a/app/services/parser.py b/app/services/parser.py index 13cd50e..4f362cc 100644 --- a/app/services/parser.py +++ b/app/services/parser.py @@ -1,4 +1,3 @@ -import shlex from collections import OrderedDict from io import BufferedIOBase, TextIOBase @@ -7,13 +6,68 @@ class LogParseError(ValueError): """Raised when the uploaded log file cannot be parsed.""" -def parse_log_file(stream: BufferedIOBase | TextIOBase) -> tuple[list[dict[str, str]], list[str]]: - """Parse a UTF-8 log file where each line contains shell-like key/value tokens.""" - raw_bytes = stream.read() +def _decode_log_content(raw_bytes: bytes | str) -> str: + """Decode uploaded log content using practical text encodings seen in exports.""" if isinstance(raw_bytes, str): - content = raw_bytes - else: - content = raw_bytes.decode("utf-8") + return raw_bytes + + for encoding in ("utf-8-sig", "cp1252", "latin-1"): + try: + return raw_bytes.decode(encoding) + except UnicodeDecodeError: + continue + + raise UnicodeDecodeError("unknown", b"", 0, 1, "Unsupported text encoding.") + + +def _tokenize_line(line: str) -> list[str]: + """Split a line using shell-like rules while tolerating unmatched trailing quotes.""" + tokens: list[str] = [] + current: list[str] = [] + quote_char: str | None = None + escape_next = False + + for char in line: + if escape_next: + current.append(char) + escape_next = False + continue + + if char == "\\": + escape_next = True + continue + + if quote_char is not None: + if char == quote_char: + quote_char = None + else: + current.append(char) + continue + + if char in {'"', "'"}: + quote_char = char + continue + + if char.isspace(): + if current: + tokens.append("".join(current)) + current = [] + continue + + current.append(char) + + if escape_next: + current.append("\\") + if 
current: + tokens.append("".join(current)) + + return tokens + + +def parse_log_file(stream: BufferedIOBase | TextIOBase) -> tuple[list[dict[str, str]], list[str]]: + """Parse a text log file where each line contains shell-like key/value tokens.""" + raw_bytes = stream.read() + content = _decode_log_content(raw_bytes) records: list[dict[str, str]] = [] seen_keys: OrderedDict[str, None] = OrderedDict() @@ -23,10 +77,7 @@ def parse_log_file(stream: BufferedIOBase | TextIOBase) -> tuple[list[dict[str, if not line: continue - try: - tokens = shlex.split(line, posix=True) - except ValueError as exc: - raise LogParseError(f"Line {line_number}: invalid shell-style quoting.") from exc + tokens = _tokenize_line(line) record: dict[str, str] = {} for token in tokens: diff --git a/compose.yaml b/compose.yaml index dc0d2f3..083cce3 100644 --- a/compose.yaml +++ b/compose.yaml @@ -7,6 +7,7 @@ services: - "8000:8000" environment: SECRET_KEY: change-me + MAX_UPLOAD_SIZE_MB: "100" OUTPUT_DIRECTORY: /app/instance/outputs test: @@ -14,4 +15,5 @@ services: context: . 
target: test environment: + MAX_UPLOAD_SIZE_MB: "100" OUTPUT_DIRECTORY: /app/instance/outputs diff --git a/tests/conftest.py b/tests/conftest.py index ebcce3d..59c868c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -9,7 +9,7 @@ from app import create_app class TestConfig: TESTING = True SECRET_KEY = "test-secret" - MAX_CONTENT_LENGTH = 1024 * 1024 + MAX_CONTENT_LENGTH = 100 * 1024 * 1024 PREVIEW_RECORD_LIMIT = 5 OUTPUT_DIRECTORY = "test-outputs" diff --git a/tests/test_app.py b/tests/test_app.py index f9d672b..6351c21 100644 --- a/tests/test_app.py +++ b/tests/test_app.py @@ -1,5 +1,7 @@ import io +from app import create_app + SAMPLE_LOG = ( 'v015xxxxdate=2024-05-01 time=10:00:00 policy="Prod Policy" ' @@ -23,6 +25,7 @@ def test_index_page_loads(client): def test_convert_returns_text_preview_and_download_link(client): + log_file = io.BytesIO(SAMPLE_LOG.encode("utf-8")) response = client.post( "/convert", data={ @@ -34,17 +37,20 @@ def test_convert_returns_text_preview_and_download_link(client): "policy_ci": "prod", "severity_cs": "", "severity_ci": "", - "log_file": (io.BytesIO(SAMPLE_LOG.encode("utf-8")), "sample.log"), + "log_file": (log_file, "sample.log"), }, content_type="multipart/form-data", ) + log_file.close() assert response.status_code == 200 assert b"Download export" in response.data assert b"--- record 1 ---" in response.data + response.close() def test_convert_full_mode_csv_preserves_union_order(client): + log_file = io.BytesIO(SAMPLE_LOG.encode("utf-8")) response = client.post( "/convert", data={ @@ -56,17 +62,20 @@ def test_convert_full_mode_csv_preserves_union_order(client): "policy_ci": "", "severity_cs": "", "severity_ci": "", - "log_file": (io.BytesIO(SAMPLE_LOG.encode("utf-8")), "sample.log"), + "log_file": (log_file, "sample.log"), }, content_type="multipart/form-data", ) + log_file.close() assert response.status_code == 200 assert b"TEXT" not in response.data assert b"Download export" in response.data + response.close() def 
def test_default_upload_limit_is_100_mib(monkeypatch):
    """The resolver falls back to 100 MiB when no environment override is set."""
    # Imported locally so the test brings its own dependency into scope.
    from app.config import _get_max_content_length

    # NOTE(review): the shared `app` fixture is presumably built from
    # TestConfig, which hard-codes MAX_CONTENT_LENGTH; asserting against it
    # would only re-check that constant. Exercise the real default instead,
    # with both environment overrides cleared so the result is deterministic.
    monkeypatch.delenv("MAX_UPLOAD_SIZE_MB", raising=False)
    monkeypatch.delenv("MAX_CONTENT_LENGTH", raising=False)

    assert _get_max_content_length() == 100 * 1024 * 1024
"output_format": "text", + "sort_by": "datetime", + "order": "asc", + "policy_cs": "", + "policy_ci": "", + "severity_cs": "", + "severity_ci": "", + "log_file": (log_file, "sample.log"), + }, + content_type="multipart/form-data", + ) + log_file.close() + + assert response.status_code == 413 + assert b"Maximum allowed size is 128 bytes." in response.data + response.close() diff --git a/tests/test_config.py b/tests/test_config.py new file mode 100644 index 0000000..9c43692 --- /dev/null +++ b/tests/test_config.py @@ -0,0 +1,15 @@ +from app.config import _get_max_content_length + + +def test_max_upload_size_mb_environment_variable(monkeypatch): + monkeypatch.setenv("MAX_UPLOAD_SIZE_MB", "42") + monkeypatch.delenv("MAX_CONTENT_LENGTH", raising=False) + + assert _get_max_content_length() == 42 * 1024 * 1024 + + +def test_max_content_length_environment_variable_is_supported(monkeypatch): + monkeypatch.delenv("MAX_UPLOAD_SIZE_MB", raising=False) + monkeypatch.setenv("MAX_CONTENT_LENGTH", "2048") + + assert _get_max_content_length() == 2048 diff --git a/tests/test_parser.py b/tests/test_parser.py index 52c6a89..dda028d 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -28,3 +28,33 @@ def test_parse_log_file_rejects_tokens_without_equals(): with pytest.raises(LogParseError): parse_log_file(stream) + + +def test_parse_log_file_supports_utf8_bom(): + stream = io.BytesIO( + b'\xef\xbb\xbfv015xxxxdate=2024-02-15 time=09:10:11 msg="blocked request"\n' + ) + + records, _union_keys = parse_log_file(stream) + + assert records[0]["v015xxxxdate"] == "2024-02-15" + + +def test_parse_log_file_supports_cp1252_text(): + stream = io.BytesIO( + 'v015xxxxdate=2024-02-15 time=09:10:11 msg="caf\xe9 request"\n'.encode("cp1252") + ) + + records, _union_keys = parse_log_file(stream) + + assert records[0]["msg"] == "cafe request".replace("e", "é", 1) + + +def test_parse_log_file_tolerates_unterminated_quotes(): + stream = io.BytesIO( + b'v015xxxxdate=2024-02-15 time=09:10:11 
msg="broken quoted value\n' + ) + + records, _union_keys = parse_log_file(stream) + + assert records[0]["msg"] == "broken quoted value"