webfortilog/app/services/parser.py

import codecs
from collections import OrderedDict
from dataclasses import dataclass, field
from io import BufferedIOBase, TextIOBase
import re

RECORD_PREFIX = "v015xxxxdate="
KEY_PATTERN = re.compile(r"(?:(?<=\s)|^)([A-Za-z_][A-Za-z0-9_]*)=")


class LogParseError(ValueError):
    """Raised when the uploaded log file cannot be parsed."""


@dataclass(slots=True)
class ParseSession:
    """Stateful streamed parser for uploaded log files."""

    stream: BufferedIOBase | TextIOBase
    encoding: str | None
    _union_keys: OrderedDict[str, None] = field(default_factory=OrderedDict)
    parsed_count: int = 0
    _consumed: bool = False

    def iter_records(self):
        if self._consumed:
            raise RuntimeError("ParseSession records can only be consumed once.")

        self._consumed = True
        for line_number, line in _iter_logical_records(_iter_physical_lines(self.stream, self.encoding)):
            record = _parse_record(line, line_number)
            for key in record:
                self._union_keys.setdefault(key, None)
            self.parsed_count += 1
            yield record

    def union_keys(self) -> list[str]:
        return list(self._union_keys.keys())


def create_parse_session(stream: BufferedIOBase | TextIOBase) -> ParseSession:
    """Prepare a streamed parser session without materializing the full upload in memory."""
    return ParseSession(stream=stream, encoding=_resolve_stream_encoding(stream))


def _normalize_value(value: str) -> str:
    """Remove balanced shell-style quotes while tolerating malformed values."""
    value = value.strip()
    if len(value) >= 2 and value[0] == value[-1] and value[0] in {'"', "'"}:
        return value[1:-1]
    if value[:1] in {'"', "'"}:
        return value[1:]
    return value


def _resolve_stream_encoding(stream: BufferedIOBase | TextIOBase) -> str | None:
    """Detect the most suitable stream encoding without reading the full file into memory."""
    probe = stream.read(0)
    if isinstance(probe, str):
        return None

    for encoding in ("utf-8-sig", "cp1252", "latin-1"):
        try:
            _validate_stream_encoding(stream, encoding)
            return encoding
        except UnicodeDecodeError:
            continue

    raise UnicodeDecodeError("unknown", b"", 0, 1, "Unsupported text encoding.")


def _validate_stream_encoding(stream: BufferedIOBase | TextIOBase, encoding: str) -> None:
    """Scan the stream to verify that the candidate encoding can decode it fully."""
    _rewind_stream(stream)
    decoder = codecs.getincrementaldecoder(encoding)()
    for chunk in iter(lambda: stream.read(64 * 1024), b""):
        decoder.decode(chunk, final=False)
    decoder.decode(b"", final=True)
    _rewind_stream(stream)


def _iter_physical_lines(
    stream: BufferedIOBase | TextIOBase,
    encoding: str | None,
):
    """Yield decoded physical lines from the uploaded stream without full-file buffering."""
    _rewind_stream(stream)

    if encoding is None:
        for line_number, raw_line in enumerate(stream, start=1):
            yield line_number, raw_line
        return

    line_number = 1
    decoder = codecs.getincrementaldecoder(encoding)()
    pending = ""
    for chunk in iter(lambda: stream.read(64 * 1024), b""):
        text = decoder.decode(chunk, final=False)
        pending += text
        while True:
            newline_index = pending.find("\n")
            if newline_index == -1:
                break
            line = pending[: newline_index + 1]
            pending = pending[newline_index + 1 :]
            yield line_number, line
            line_number += 1

    pending += decoder.decode(b"", final=True)
    while True:
        newline_index = pending.find("\n")
        if newline_index == -1:
            break
        line = pending[: newline_index + 1]
        pending = pending[newline_index + 1 :]
        yield line_number, line
        line_number += 1

    if pending:
        yield line_number, pending


def _rewind_stream(stream: BufferedIOBase | TextIOBase) -> None:
    """Move the uploaded stream back to the start."""
    if not hasattr(stream, "seek"):
        raise LogParseError("The uploaded file stream is not seekable.")
    stream.seek(0)


def _parse_record(line: str, line_number: int) -> dict[str, str]:
    """Parse a logical record by locating `key=` boundaries instead of splitting on spaces."""
    matches = list(KEY_PATTERN.finditer(line))
    if not matches:
        raise LogParseError(f"Line {line_number}: no key=value pairs were found.")

    record: dict[str, str] = {}
    for index, match in enumerate(matches):
        key = match.group(1)
        value_start = match.end()
        value_end = matches[index + 1].start() if index + 1 < len(matches) else len(line)
        raw_value = line[value_start:value_end].strip()
        if raw_value and raw_value[:1] not in {'"', "'"} and any(
            char.isspace() for char in raw_value
        ):
            raise LogParseError(
                f"Line {line_number}: invalid unquoted value for key '{key}'."
            )
        value = _normalize_value(raw_value)
        record[key] = value

    return record


def _iter_logical_records(physical_lines):
    """Rebuild logical records when embedded newlines split a single log entry."""
    current_record: list[str] = []
    current_start_line: int | None = None

    for line_number, raw_line in physical_lines:
        line = raw_line.strip()
        if not line:
            continue

        if line.startswith(RECORD_PREFIX):
            if current_record and current_start_line is not None:
                yield current_start_line, "".join(current_record)
            current_record = [line]
            current_start_line = line_number
            continue

        if current_record:
            current_record.append(line)
            continue

        raise LogParseError(
            f"Line {line_number}: unexpected content before the first log record."
        )

    if current_record and current_start_line is not None:
        yield current_start_line, "".join(current_record)


def parse_log_file(stream: BufferedIOBase | TextIOBase) -> tuple[list[dict[str, str]], list[str]]:
    """Compatibility helper that still materializes all parsed records when needed."""
    session = create_parse_session(stream)
    records = list(session.iter_records())
    return records, session.union_keys()