Files
webfortilog/app/services/parser.py
2026-04-24 15:12:51 +02:00

109 lines
3.6 KiB
Python

from collections import OrderedDict
from io import BufferedIOBase, TextIOBase
import re
# Literal text that marks the first physical line of a log record. Non-blank
# lines that do not start with it are treated as continuations of the record
# that precedes them.
RECORD_PREFIX = "v015xxxxdate="
# Matches a `key=` token located at the very start of the text or directly
# after a whitespace character. Group 1 captures the key name, which may
# contain letters, digits and underscores but must not start with a digit.
KEY_PATTERN = re.compile(r"(?:(?<=\s)|^)([A-Za-z_][A-Za-z0-9_]*)=")
class LogParseError(ValueError):
    """Signal that uploaded log content could not be parsed.

    The message carries the 1-based line number of the offending record.
    Because this derives from ``ValueError``, callers that already catch
    ``ValueError`` will also catch parse failures.
    """
def _decode_log_content(raw_bytes: bytes | str) -> str:
"""Decode uploaded log content using practical text encodings seen in exports."""
if isinstance(raw_bytes, str):
return raw_bytes
for encoding in ("utf-8-sig", "cp1252", "latin-1"):
try:
return raw_bytes.decode(encoding)
except UnicodeDecodeError:
continue
raise UnicodeDecodeError("unknown", b"", 0, 1, "Unsupported text encoding.")
def _normalize_value(value: str) -> str:
"""Remove balanced shell-style quotes while tolerating malformed values."""
value = value.strip()
if len(value) >= 2 and value[0] == value[-1] and value[0] in {'"', "'"}:
return value[1:-1]
if value[:1] in {'"', "'"}:
return value[1:]
return value
def _parse_record(line: str, line_number: int) -> dict[str, str]:
    """Parse one logical record into a ``{key: value}`` mapping.

    Fields are located by their ``key=`` markers rather than by splitting on
    spaces, so quoted values may contain whitespace. A ``key=`` look-alike
    that appears inside a properly closed quoted value (for example
    ``msg="user admin=x logged in"``) is treated as part of that value and
    not as the start of a new field.

    Args:
        line: The full text of the logical record.
        line_number: 1-based line number where the record starts; used only
            in error messages.

    Returns:
        Mapping of each key to its value with surrounding quotes removed.
        When a key occurs more than once, the last occurrence wins.

    Raises:
        LogParseError: if the record contains no ``key=`` marker, or if an
            unquoted value contains whitespace.
    """
    quote_chars = ('"', "'")

    candidates = list(KEY_PATTERN.finditer(line))
    if not candidates:
        raise LogParseError(f"Line {line_number}: no key=value pairs were found.")

    # Keep only the markers that really start a field. When a value opens
    # with a quote directly after `=` and that quote is closed later on the
    # line, every candidate before the closing quote belongs to the value.
    # An unterminated quote protects nothing, so malformed input still
    # splits at the next marker.
    boundaries = []
    protected_until = 0
    for match in candidates:
        if match.start() < protected_until:
            continue
        boundaries.append(match)
        value_start = match.end()
        opener = line[value_start:value_start + 1]
        if opener in quote_chars:
            closing = line.find(opener, value_start + 1)
            if closing != -1:
                protected_until = closing + 1

    # Each value runs up to the next field marker, or to the end of the line.
    value_ends = [match.start() for match in boundaries[1:]] + [len(line)]

    record: dict[str, str] = {}
    for match, value_end in zip(boundaries, value_ends):
        key = match.group(1)
        raw_value = line[match.end():value_end].strip()
        # Whitespace is only legal inside a value that starts with a quote.
        if (
            raw_value
            and raw_value[0] not in quote_chars
            and any(char.isspace() for char in raw_value)
        ):
            raise LogParseError(
                f"Line {line_number}: invalid unquoted value for key '{key}'."
            )
        record[key] = _normalize_value(raw_value)
    return record
def _iter_logical_records(content: str) -> list[tuple[int, str]]:
    """Group physical lines into logical log records.

    A record begins on a line that starts with ``RECORD_PREFIX``. Every
    following non-blank line that lacks the prefix is treated as a
    continuation and is concatenated onto the record with no separator.
    Lines are stripped of surrounding whitespace and blank lines are ignored.

    Returns:
        A list of ``(start_line, text)`` pairs, where ``start_line`` is the
        1-based number of the record's first physical line.

    Raises:
        LogParseError: if non-blank content appears before the first record.
    """
    # Each entry pairs a record's starting line number with its fragments.
    grouped: list[tuple[int, list[str]]] = []

    for number, physical_line in enumerate(content.splitlines(), start=1):
        text = physical_line.strip()
        if not text:
            continue

        if text.startswith(RECORD_PREFIX):
            grouped.append((number, [text]))
        elif grouped:
            grouped[-1][1].append(text)
        else:
            raise LogParseError(
                f"Line {number}: unexpected content before the first log record."
            )

    return [(start, "".join(fragments)) for start, fragments in grouped]
def parse_log_file(stream: BufferedIOBase | TextIOBase) -> tuple[list[dict[str, str]], list[str]]:
    """Read a whole log stream and parse it into records.

    The stream is read in one call, decoded to text if necessary, split into
    logical records and each record is parsed into a key/value mapping.

    Returns:
        A pair ``(records, keys)``: the parsed records in file order, and
        every key encountered, listed once in order of first appearance.

    Raises:
        LogParseError: propagated from record splitting or parsing.
        UnicodeDecodeError: propagated from decoding.
    """
    text = _decode_log_content(stream.read())

    parsed = [
        _parse_record(record_text, start_line)
        for start_line, record_text in _iter_logical_records(text)
    ]

    # dict preserves insertion order, giving first-seen ordering of keys.
    ordered_keys = dict.fromkeys(key for entry in parsed for key in entry)
    return parsed, list(ordered_keys)