Improve log upload handling
This commit is contained in:
39
README.md
39
README.md
@@ -42,11 +42,20 @@ pip install -e ".[dev]"
|
||||
|
||||
```bash
|
||||
export FLASK_APP=wsgi.py
|
||||
export MAX_UPLOAD_SIZE_MB=100
|
||||
flask run --debug
|
||||
```
|
||||
|
||||
Open `http://127.0.0.1:5000`.
|
||||
|
||||
### Example input file
|
||||
|
||||
If you have a local WAF export such as `attack_download.log`, you can use it as a real example upload.
|
||||
|
||||
- Example file: `attack_download.log`
|
||||
- Approximate size in the current workspace: `98.5 MiB`
|
||||
- The default `MAX_UPLOAD_SIZE_MB=100` setting is sized to accept a file of that size
|
||||
|
||||
### Test
|
||||
|
||||
```bash
|
||||
@@ -64,7 +73,7 @@ docker build -t webfortilog .
|
||||
### Run
|
||||
|
||||
```bash
|
||||
docker run --rm -p 8000:8000 webfortilog
|
||||
docker run --rm -p 8000:8000 -e MAX_UPLOAD_SIZE_MB=100 webfortilog
|
||||
```
|
||||
|
||||
Open `http://127.0.0.1:8000`.
|
||||
@@ -83,8 +92,36 @@ docker compose up --build web
|
||||
docker compose run --rm test
|
||||
```
|
||||
|
||||
## Example usage
|
||||
|
||||
### Browser upload
|
||||
|
||||
1. Start the app with `flask run --debug` or `docker compose up --build web`
|
||||
2. Open the web UI
|
||||
3. Upload `attack_download.log`
|
||||
4. Try `vendor` mode with `text` output for a readable preview
|
||||
5. Try `full` mode with `csv` output for complete export coverage
|
||||
|
||||
### Command-line upload example
|
||||
|
||||
```bash
|
||||
curl -X POST http://127.0.0.1:5000/convert \
|
||||
-F "log_file=@attack_download.log" \
|
||||
-F "mode=vendor" \
|
||||
-F "output_format=text" \
|
||||
-F "sort_by=datetime" \
|
||||
-F "order=asc" \
|
||||
-F "policy_cs=" \
|
||||
-F "policy_ci=" \
|
||||
-F "severity_cs=" \
|
||||
-F "severity_ci="
|
||||
```
|
||||
|
||||
## Notes
|
||||
|
||||
- Temporary output files are written to `instance/outputs`
|
||||
- The application does not require a database
|
||||
- Gunicorn is used as the production WSGI server
|
||||
- Default upload limit is 100 MiB
|
||||
- Set `MAX_UPLOAD_SIZE_MB` to configure the upload limit in megabytes
|
||||
- `MAX_CONTENT_LENGTH` is also supported as a lower-level byte-based override
|
||||
|
||||
@@ -7,6 +7,15 @@ from app.config import Config
|
||||
from app.routes import main_blueprint
|
||||
|
||||
|
||||
def _format_size_limit(size_limit_bytes: int) -> str:
|
||||
"""Render the upload limit in a friendly unit for error messages."""
|
||||
if size_limit_bytes >= 1024 * 1024:
|
||||
return f"{size_limit_bytes / (1024 * 1024):.0f} MB"
|
||||
if size_limit_bytes >= 1024:
|
||||
return f"{size_limit_bytes / 1024:.0f} KB"
|
||||
return f"{size_limit_bytes} bytes"
|
||||
|
||||
|
||||
def create_app(config_class: type[Config] = Config) -> Flask:
|
||||
"""Application factory used by Flask and Gunicorn."""
|
||||
app = Flask(__name__, instance_relative_config=True)
|
||||
@@ -22,7 +31,11 @@ def create_app(config_class: type[Config] = Config) -> Flask:
|
||||
|
||||
@app.errorhandler(RequestEntityTooLarge)
def handle_file_too_large(_error):
    """Flash a friendly size-limit message and re-render the index page with 413."""
    size_limit_bytes = int(app.config["MAX_CONTENT_LENGTH"])
    flash(
        f"The uploaded file is too large. Maximum allowed size is {_format_size_limit(size_limit_bytes)}.",
        "danger",
    )
    return render_template("index.html"), 413
|
||||
|
||||
return app
|
||||
|
||||
@@ -2,11 +2,25 @@ import os
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def _get_max_content_length() -> int:
|
||||
"""Resolve the upload size limit from environment settings."""
|
||||
upload_limit_mb = os.environ.get("MAX_UPLOAD_SIZE_MB")
|
||||
if upload_limit_mb:
|
||||
return int(upload_limit_mb) * 1024 * 1024
|
||||
|
||||
max_content_length = os.environ.get("MAX_CONTENT_LENGTH")
|
||||
if max_content_length:
|
||||
return int(max_content_length)
|
||||
|
||||
return 100 * 1024 * 1024
|
||||
|
||||
|
||||
class Config:
|
||||
"""Default configuration for local and container usage."""
|
||||
|
||||
SECRET_KEY = os.environ.get("SECRET_KEY", "dev-secret-key-change-me")
|
||||
MAX_CONTENT_LENGTH = int(os.environ.get("MAX_CONTENT_LENGTH", 10 * 1024 * 1024))
|
||||
# Default to 100 MiB so larger WAF exports can be processed without tuning.
|
||||
MAX_CONTENT_LENGTH = _get_max_content_length()
|
||||
PREVIEW_RECORD_LIMIT = int(os.environ.get("PREVIEW_RECORD_LIMIT", 5))
|
||||
OUTPUT_DIRECTORY = Path(
|
||||
os.environ.get("OUTPUT_DIRECTORY", Path("instance") / "outputs")
|
||||
|
||||
@@ -116,7 +116,10 @@ def convert():
|
||||
flash(str(exc), "danger")
|
||||
return render_template("index.html", form=form), 400
|
||||
except UnicodeDecodeError:
|
||||
flash("The uploaded file is not valid UTF-8 text.", "danger")
|
||||
flash(
|
||||
"The uploaded file could not be decoded. Supported encodings are UTF-8, UTF-8 with BOM, Windows-1252, and Latin-1.",
|
||||
"danger",
|
||||
)
|
||||
return render_template("index.html", form=form), 400
|
||||
|
||||
preview_limit = current_app.config["PREVIEW_RECORD_LIMIT"]
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
import shlex
|
||||
from collections import OrderedDict
|
||||
from io import BufferedIOBase, TextIOBase
|
||||
|
||||
@@ -7,13 +6,68 @@ class LogParseError(ValueError):
|
||||
"""Raised when the uploaded log file cannot be parsed."""
|
||||
|
||||
|
||||
def parse_log_file(stream: BufferedIOBase | TextIOBase) -> tuple[list[dict[str, str]], list[str]]:
|
||||
"""Parse a UTF-8 log file where each line contains shell-like key/value tokens."""
|
||||
raw_bytes = stream.read()
|
||||
def _decode_log_content(raw_bytes: bytes | str) -> str:
|
||||
"""Decode uploaded log content using practical text encodings seen in exports."""
|
||||
if isinstance(raw_bytes, str):
|
||||
content = raw_bytes
|
||||
return raw_bytes
|
||||
|
||||
for encoding in ("utf-8-sig", "cp1252", "latin-1"):
|
||||
try:
|
||||
return raw_bytes.decode(encoding)
|
||||
except UnicodeDecodeError:
|
||||
continue
|
||||
|
||||
raise UnicodeDecodeError("unknown", b"", 0, 1, "Unsupported text encoding.")
|
||||
|
||||
|
||||
def _tokenize_line(line: str) -> list[str]:
|
||||
"""Split a line using shell-like rules while tolerating unmatched trailing quotes."""
|
||||
tokens: list[str] = []
|
||||
current: list[str] = []
|
||||
quote_char: str | None = None
|
||||
escape_next = False
|
||||
|
||||
for char in line:
|
||||
if escape_next:
|
||||
current.append(char)
|
||||
escape_next = False
|
||||
continue
|
||||
|
||||
if char == "\\":
|
||||
escape_next = True
|
||||
continue
|
||||
|
||||
if quote_char is not None:
|
||||
if char == quote_char:
|
||||
quote_char = None
|
||||
else:
|
||||
content = raw_bytes.decode("utf-8")
|
||||
current.append(char)
|
||||
continue
|
||||
|
||||
if char in {'"', "'"}:
|
||||
quote_char = char
|
||||
continue
|
||||
|
||||
if char.isspace():
|
||||
if current:
|
||||
tokens.append("".join(current))
|
||||
current = []
|
||||
continue
|
||||
|
||||
current.append(char)
|
||||
|
||||
if escape_next:
|
||||
current.append("\\")
|
||||
if current:
|
||||
tokens.append("".join(current))
|
||||
|
||||
return tokens
|
||||
|
||||
|
||||
def parse_log_file(stream: BufferedIOBase | TextIOBase) -> tuple[list[dict[str, str]], list[str]]:
|
||||
"""Parse a text log file where each line contains shell-like key/value tokens."""
|
||||
raw_bytes = stream.read()
|
||||
content = _decode_log_content(raw_bytes)
|
||||
|
||||
records: list[dict[str, str]] = []
|
||||
seen_keys: OrderedDict[str, None] = OrderedDict()
|
||||
@@ -23,10 +77,7 @@ def parse_log_file(stream: BufferedIOBase | TextIOBase) -> tuple[list[dict[str,
|
||||
if not line:
|
||||
continue
|
||||
|
||||
try:
|
||||
tokens = shlex.split(line, posix=True)
|
||||
except ValueError as exc:
|
||||
raise LogParseError(f"Line {line_number}: invalid shell-style quoting.") from exc
|
||||
tokens = _tokenize_line(line)
|
||||
|
||||
record: dict[str, str] = {}
|
||||
for token in tokens:
|
||||
|
||||
@@ -7,6 +7,7 @@ services:
|
||||
- "8000:8000"
|
||||
environment:
|
||||
SECRET_KEY: change-me
|
||||
MAX_UPLOAD_SIZE_MB: "100"
|
||||
OUTPUT_DIRECTORY: /app/instance/outputs
|
||||
|
||||
test:
|
||||
@@ -14,4 +15,5 @@ services:
|
||||
context: .
|
||||
target: test
|
||||
environment:
|
||||
MAX_UPLOAD_SIZE_MB: "100"
|
||||
OUTPUT_DIRECTORY: /app/instance/outputs
|
||||
|
||||
@@ -9,7 +9,7 @@ from app import create_app
|
||||
class TestConfig:
|
||||
TESTING = True
|
||||
SECRET_KEY = "test-secret"
|
||||
MAX_CONTENT_LENGTH = 1024 * 1024
|
||||
MAX_CONTENT_LENGTH = 100 * 1024 * 1024
|
||||
PREVIEW_RECORD_LIMIT = 5
|
||||
OUTPUT_DIRECTORY = "test-outputs"
|
||||
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
import io
|
||||
|
||||
from app import create_app
|
||||
|
||||
|
||||
SAMPLE_LOG = (
|
||||
'v015xxxxdate=2024-05-01 time=10:00:00 policy="Prod Policy" '
|
||||
@@ -23,6 +25,7 @@ def test_index_page_loads(client):
|
||||
|
||||
|
||||
def test_convert_returns_text_preview_and_download_link(client):
|
||||
log_file = io.BytesIO(SAMPLE_LOG.encode("utf-8"))
|
||||
response = client.post(
|
||||
"/convert",
|
||||
data={
|
||||
@@ -34,17 +37,20 @@ def test_convert_returns_text_preview_and_download_link(client):
|
||||
"policy_ci": "prod",
|
||||
"severity_cs": "",
|
||||
"severity_ci": "",
|
||||
"log_file": (io.BytesIO(SAMPLE_LOG.encode("utf-8")), "sample.log"),
|
||||
"log_file": (log_file, "sample.log"),
|
||||
},
|
||||
content_type="multipart/form-data",
|
||||
)
|
||||
log_file.close()
|
||||
|
||||
assert response.status_code == 200
|
||||
assert b"Download export" in response.data
|
||||
assert b"--- record 1 ---" in response.data
|
||||
response.close()
|
||||
|
||||
|
||||
def test_convert_full_mode_csv_preserves_union_order(client):
|
||||
log_file = io.BytesIO(SAMPLE_LOG.encode("utf-8"))
|
||||
response = client.post(
|
||||
"/convert",
|
||||
data={
|
||||
@@ -56,17 +62,20 @@ def test_convert_full_mode_csv_preserves_union_order(client):
|
||||
"policy_ci": "",
|
||||
"severity_cs": "",
|
||||
"severity_ci": "",
|
||||
"log_file": (io.BytesIO(SAMPLE_LOG.encode("utf-8")), "sample.log"),
|
||||
"log_file": (log_file, "sample.log"),
|
||||
},
|
||||
content_type="multipart/form-data",
|
||||
)
|
||||
log_file.close()
|
||||
|
||||
assert response.status_code == 200
|
||||
assert b"TEXT" not in response.data
|
||||
assert b"Download export" in response.data
|
||||
response.close()
|
||||
|
||||
|
||||
def test_convert_rejects_mutually_exclusive_filters(client):
|
||||
log_file = io.BytesIO(SAMPLE_LOG.encode("utf-8"))
|
||||
response = client.post(
|
||||
"/convert",
|
||||
data={
|
||||
@@ -78,16 +87,19 @@ def test_convert_rejects_mutually_exclusive_filters(client):
|
||||
"policy_ci": "a",
|
||||
"severity_cs": "",
|
||||
"severity_ci": "",
|
||||
"log_file": (io.BytesIO(SAMPLE_LOG.encode("utf-8")), "sample.log"),
|
||||
"log_file": (log_file, "sample.log"),
|
||||
},
|
||||
content_type="multipart/form-data",
|
||||
)
|
||||
log_file.close()
|
||||
|
||||
assert response.status_code == 400
|
||||
assert b"Policy filter must use either case-sensitive or case-insensitive match" in response.data
|
||||
response.close()
|
||||
|
||||
|
||||
def test_download_route_returns_generated_file(client):
|
||||
log_file = io.BytesIO(SAMPLE_LOG.encode("utf-8"))
|
||||
convert_response = client.post(
|
||||
"/convert",
|
||||
data={
|
||||
@@ -99,10 +111,11 @@ def test_download_route_returns_generated_file(client):
|
||||
"policy_ci": "",
|
||||
"severity_cs": "",
|
||||
"severity_ci": "",
|
||||
"log_file": (io.BytesIO(SAMPLE_LOG.encode("utf-8")), "sample.log"),
|
||||
"log_file": (log_file, "sample.log"),
|
||||
},
|
||||
content_type="multipart/form-data",
|
||||
)
|
||||
log_file.close()
|
||||
|
||||
html = convert_response.data.decode("utf-8")
|
||||
marker = '/download/'
|
||||
@@ -115,4 +128,43 @@ def test_download_route_returns_generated_file(client):
|
||||
assert download_response.status_code == 200
|
||||
assert download_response.headers["Content-Type"].startswith("text/csv")
|
||||
assert b"v015xxxxdate,time,policy" in download_response.data
|
||||
convert_response.close()
|
||||
download_response.close()
|
||||
|
||||
|
||||
def test_default_upload_limit_is_100_mib(app):
    """Without environment overrides the app accepts uploads up to 100 MiB."""
    expected_bytes = 100 * 1024 * 1024
    assert app.config["MAX_CONTENT_LENGTH"] == expected_bytes
|
||||
|
||||
|
||||
def test_too_large_upload_returns_friendly_message(tmp_path):
    """An upload exceeding MAX_CONTENT_LENGTH yields 413 plus a readable limit message."""

    class SmallLimitConfig:
        TESTING = True
        SECRET_KEY = "test-secret"
        MAX_CONTENT_LENGTH = 128  # tiny limit so SAMPLE_LOG is guaranteed to exceed it
        PREVIEW_RECORD_LIMIT = 5
        OUTPUT_DIRECTORY = tmp_path / "tiny-limit-outputs"

    app = create_app(SmallLimitConfig)
    client = app.test_client()

    form_fields = {
        "mode": "vendor",
        "output_format": "text",
        "sort_by": "datetime",
        "order": "asc",
        "policy_cs": "",
        "policy_ci": "",
        "severity_cs": "",
        "severity_ci": "",
    }
    with io.BytesIO(SAMPLE_LOG.encode("utf-8")) as log_file:
        form_fields["log_file"] = (log_file, "sample.log")
        response = client.post(
            "/convert",
            data=form_fields,
            content_type="multipart/form-data",
        )

    assert response.status_code == 413
    assert b"Maximum allowed size is 128 bytes." in response.data
    response.close()
|
||||
|
||||
15
tests/test_config.py
Normal file
15
tests/test_config.py
Normal file
@@ -0,0 +1,15 @@
|
||||
from app.config import _get_max_content_length
|
||||
|
||||
|
||||
def test_max_upload_size_mb_environment_variable(monkeypatch):
    """MAX_UPLOAD_SIZE_MB (in megabytes) drives the resolved byte limit."""
    monkeypatch.delenv("MAX_CONTENT_LENGTH", raising=False)
    monkeypatch.setenv("MAX_UPLOAD_SIZE_MB", "42")

    assert _get_max_content_length() == 42 * 1024 * 1024
|
||||
|
||||
|
||||
def test_max_content_length_environment_variable_is_supported(monkeypatch):
    """The byte-based MAX_CONTENT_LENGTH override applies when no MB limit is set."""
    monkeypatch.delenv("MAX_UPLOAD_SIZE_MB", raising=False)
    monkeypatch.setenv("MAX_CONTENT_LENGTH", "2048")

    expected_bytes = 2048
    assert _get_max_content_length() == expected_bytes
|
||||
@@ -28,3 +28,33 @@ def test_parse_log_file_rejects_tokens_without_equals():
|
||||
|
||||
with pytest.raises(LogParseError):
|
||||
parse_log_file(stream)
|
||||
|
||||
|
||||
def test_parse_log_file_supports_utf8_bom():
    """A UTF-8 BOM prefix must not leak into the first record's keys."""
    payload = b'\xef\xbb\xbfv015xxxxdate=2024-02-15 time=09:10:11 msg="blocked request"\n'
    stream = io.BytesIO(payload)

    records, _union_keys = parse_log_file(stream)

    assert records[0]["v015xxxxdate"] == "2024-02-15"
|
||||
|
||||
|
||||
def test_parse_log_file_supports_cp1252_text():
    """Windows-1252 encoded uploads decode transparently (accented characters survive)."""
    payload = 'v015xxxxdate=2024-02-15 time=09:10:11 msg="caf\xe9 request"\n'.encode("cp1252")
    stream = io.BytesIO(payload)

    records, _union_keys = parse_log_file(stream)

    # State the expected value directly; the original built it with
    # "cafe request".replace("e", "é", 1), which obscured the assertion.
    assert records[0]["msg"] == "caf\xe9 request"
|
||||
|
||||
|
||||
def test_parse_log_file_tolerates_unterminated_quotes():
    """A line ending inside an open quote still parses; the dangling quote is dropped."""
    payload = b'v015xxxxdate=2024-02-15 time=09:10:11 msg="broken quoted value\n'
    stream = io.BytesIO(payload)

    records, _union_keys = parse_log_file(stream)

    assert records[0]["msg"] == "broken quoted value"
|
||||
|
||||
Reference in New Issue
Block a user