Improve log upload handling

2026-04-24 15:00:43 +02:00
parent e793b51e4f
commit f64deb9c0d
10 changed files with 237 additions and 20 deletions
--- a/app/services/parser.py
+++ b/app/services/parser.py
@@ -1,4 +1,3 @@
-import shlex
 from collections import OrderedDict
 from io import BufferedIOBase, TextIOBase

@@ -7,13 +6,68 @@ class LogParseError(ValueError):
    """Raised when the uploaded log file cannot be parsed."""


-def parse_log_file(stream: BufferedIOBase | TextIOBase) -> tuple[list[dict[str, str]], list[str]]:
-    """Parse a UTF-8 log file where each line contains shell-like key/value tokens."""
-    raw_bytes = stream.read()
+def _decode_log_content(raw_bytes: bytes | str) -> str:
+    """Decode uploaded log content using practical text encodings seen in exports."""
    if isinstance(raw_bytes, str):
-        content = raw_bytes
-    else:
-        content = raw_bytes.decode("utf-8")
+        return raw_bytes  # already text, nothing to decode
+
+    for encoding in ("utf-8-sig", "cp1252", "latin-1"):  # tried in order; utf-8-sig also drops a leading BOM
+        try:
+            return raw_bytes.decode(encoding)
+        except UnicodeDecodeError:
+            continue  # this encoding rejected the bytes; try the next one
+
+    raise UnicodeDecodeError("unknown", b"", 0, 1, "Unsupported text encoding.")  # NOTE(review): latin-1 decodes every byte value, so this looks unreachable for bytes input - confirm
+
+
+def _tokenize_line(line: str) -> list[str]:
+    """Split a line using shell-like rules while tolerating unmatched trailing quotes."""
+    tokens: list[str] = []
+    current: list[str] = []  # characters of the token being built
+    quote_char: str | None = None  # the open quote character, or None when outside quotes
+    escape_next = False  # True when the previous character was an unescaped backslash
+
+    for char in line:
+        if escape_next:
+            current.append(char)  # escaped character is taken literally, whatever it is
+            escape_next = False
+            continue
+
+        if char == "\\":  # checked before the quote state, so escapes also apply inside quotes
+            escape_next = True
+            continue
+
+        if quote_char is not None:
+            if char == quote_char:
+                quote_char = None  # closing quote is dropped from the token
+            else:
+                current.append(char)
+            continue
+
+        if char in {'"', "'"}:
+            quote_char = char  # opening quote is dropped from the token
+            continue
+
+        if char.isspace():
+            if current:  # only non-empty tokens are emitted, so a bare empty quoted string yields none
+                tokens.append("".join(current))
+                current = []
+            continue
+
+        current.append(char)
+
+    if escape_next:
+        current.append("\\")  # a trailing lone backslash is kept literally
+    if current:  # flush the last token; a still-open quote is tolerated rather than rejected
+        tokens.append("".join(current))
+
+    return tokens
+
+
+def parse_log_file(stream: BufferedIOBase | TextIOBase) -> tuple[list[dict[str, str]], list[str]]:
+    """Parse a text log file where each line contains shell-like key/value tokens."""
+    raw_bytes = stream.read()
+    content = _decode_log_content(raw_bytes)

    records: list[dict[str, str]] = []
    seen_keys: OrderedDict[str, None] = OrderedDict()
@@ -23,10 +77,7 @@ def parse_log_file(stream: BufferedIOBase | TextIOBase) -> tuple[list[dict[str,
        if not line:
            continue

-        try:
-            tokens = shlex.split(line, posix=True)
-        except ValueError as exc:
-            raise LogParseError(f"Line {line_number}: invalid shell-style quoting.") from exc
+        tokens = _tokenize_line(line)

        record: dict[str, str] = {}
        for token in tokens: