# webfortilog/app/services/parser.py

from collections import OrderedDict
from io import BufferedIOBase, TextIOBase
import re

# Literal text that marks the first physical line of a log record. Any other
# non-blank line is treated as a continuation of the record before it.
RECORD_PREFIX = "v015xxxxdate="
# Matches an identifier followed by `=` that sits either at the very start of
# the string or directly after a whitespace character; group 1 is the key name.
KEY_PATTERN = re.compile(r"(?:(?<=\s)|^)([A-Za-z_][A-Za-z0-9_]*)=")


class LogParseError(ValueError):
    """Signal that log content could not be turned into key/value records.

    Subclasses ``ValueError`` so callers that already handle ``ValueError``
    also catch parse failures.
    """


def _decode_log_content(raw_bytes: bytes | str) -> str:
    """Decode uploaded log content using practical text encodings seen in exports."""
    if isinstance(raw_bytes, str):
        return raw_bytes

    for encoding in ("utf-8-sig", "cp1252", "latin-1"):
        try:
            return raw_bytes.decode(encoding)
        except UnicodeDecodeError:
            continue

    raise UnicodeDecodeError("unknown", b"", 0, 1, "Unsupported text encoding.")


def _normalize_value(value: str) -> str:
    """Remove balanced shell-style quotes while tolerating malformed values."""
    value = value.strip()
    if len(value) >= 2 and value[0] == value[-1] and value[0] in {'"', "'"}:
        return value[1:-1]
    if value[:1] in {'"', "'"}:
        return value[1:]
    return value


def _parse_record(line: str, line_number: int) -> dict[str, str]:
    """Parse a logical record by locating `key=` boundaries instead of splitting on spaces.

    A value normally runs up to the next `key=` token. When a value opens with
    a quote and a matching closing quote can be found, any `key=` text inside
    the quotes stays part of that value instead of starting a new pair. If the
    quoting is unclosed or otherwise malformed, the value ends at the next
    `key=` token as before.

    Args:
        line: One logical record.
        line_number: Line number used in error messages.

    Returns:
        Mapping of key to normalized value; a repeated key keeps its last value.

    Raises:
        LogParseError: If no `key=` token is found, or an unquoted value
            contains whitespace.
    """
    matches = list(KEY_PATTERN.finditer(line))
    if not matches:
        raise LogParseError(f"Line {line_number}: no key=value pairs were found.")

    # Offsets directly after each `key=`. A quote at one of these offsets opens
    # a value, so it must never be mistaken for a closing quote.
    value_starts = {match.end() for match in matches}
    total = len(matches)

    record: dict[str, str] = {}
    index = 0
    while index < total:
        match = matches[index]
        key = match.group(1)
        value_start = match.end()
        next_index = index + 1

        closing = _find_closing_quote(line, value_start, value_starts)
        if closing != -1:
            # Key-looking tokens inside the quoted span are data, not new keys.
            while next_index < total and matches[next_index].start() < closing:
                next_index += 1

        value_end = matches[next_index].start() if next_index < total else len(line)
        raw_value = line[value_start:value_end].strip()
        if raw_value and raw_value[:1] not in {'"', "'"} and any(
            char.isspace() for char in raw_value
        ):
            raise LogParseError(
                f"Line {line_number}: invalid unquoted value for key '{key}'."
            )
        record[key] = _normalize_value(raw_value)
        index = next_index

    return record


def _find_closing_quote(line: str, value_start: int, value_starts: set[int]) -> int:
    """Return the index of the quote closing the value at *value_start*, or -1.

    -1 means the value is unquoted or no trustworthy closing quote exists, in
    which case the caller falls back to ending the value at the next `key=`.
    """
    length = len(line)
    opening = value_start
    while opening < length and line[opening].isspace():
        opening += 1
    if opening >= length or line[opening] not in {'"', "'"}:
        return -1

    quote = line[opening]
    candidate = line.find(quote, opening + 1)
    while candidate != -1:
        if candidate in value_starts:
            # This quote opens another key's value, so ours was never closed.
            return -1
        # Only a quote that ends the token (followed by whitespace or the end
        # of the record) counts as a terminator.
        if candidate + 1 == length or line[candidate + 1].isspace():
            return candidate
        candidate = line.find(quote, candidate + 1)
    return -1


def _iter_logical_records(content: str) -> list[tuple[int, str]]:
    """Group physical lines into logical records.

    A stripped line that begins with ``RECORD_PREFIX`` opens a new record.
    Every other non-blank line is appended, with no separator, to the record
    currently being built. Blank lines are ignored.

    Returns:
        ``(start_line, text)`` pairs, where ``start_line`` is the 1-based line
        number on which the record began.

    Raises:
        LogParseError: If non-blank content appears before any record starts.
    """
    # Each entry is (starting line number, list of stripped fragments).
    grouped: list[tuple[int, list[str]]] = []

    for position, physical_line in enumerate(content.splitlines(), start=1):
        text = physical_line.strip()
        if not text:
            continue

        if text.startswith(RECORD_PREFIX):
            grouped.append((position, [text]))
        elif grouped:
            # Continuation of the most recently opened record.
            grouped[-1][1].append(text)
        else:
            raise LogParseError(
                f"Line {position}: unexpected content before the first log record."
            )

    return [(start, "".join(fragments)) for start, fragments in grouped]


def parse_log_file(stream: BufferedIOBase | TextIOBase) -> tuple[list[dict[str, str]], list[str]]:
    """Read a log stream and parse every logical record it contains.

    Args:
        stream: Binary or text stream; it is read in full with one ``read()``.

    Returns:
        A pair ``(records, keys)``: the parsed records in file order, and every
        key encountered, listed in the order each was first seen.

    Raises:
        LogParseError: Propagated from record grouping or record parsing.
    """
    content = _decode_log_content(stream.read())

    parsed = [
        _parse_record(text, start_line)
        for start_line, text in _iter_logical_records(content)
    ]

    # dict.fromkeys keeps first-seen order while dropping duplicate keys.
    ordered_keys = dict.fromkeys(key for entry in parsed for key in entry)
    return parsed, list(ordered_keys)