import codecs from collections import OrderedDict from dataclasses import dataclass, field from io import BufferedIOBase, TextIOBase import re RECORD_PREFIX = "v015xxxxdate=" KEY_PATTERN = re.compile(r"(?:(?<=\s)|^)([A-Za-z_][A-Za-z0-9_]*)=") class LogParseError(ValueError): """Raised when the uploaded log file cannot be parsed.""" @dataclass(slots=True) class ParseSession: """Stateful streamed parser for uploaded log files.""" stream: BufferedIOBase | TextIOBase encoding: str | None _union_keys: OrderedDict[str, None] = field(default_factory=OrderedDict) parsed_count: int = 0 _consumed: bool = False def iter_records(self): if self._consumed: raise RuntimeError("ParseSession records can only be consumed once.") self._consumed = True for line_number, line in _iter_logical_records(_iter_physical_lines(self.stream, self.encoding)): record = _parse_record(line, line_number) for key in record: self._union_keys.setdefault(key, None) self.parsed_count += 1 yield record def union_keys(self) -> list[str]: return list(self._union_keys.keys()) def create_parse_session(stream: BufferedIOBase | TextIOBase) -> ParseSession: """Prepare a streamed parser session without materializing the full upload in memory.""" return ParseSession(stream=stream, encoding=_resolve_stream_encoding(stream)) def _normalize_value(value: str) -> str: """Remove balanced shell-style quotes while tolerating malformed values.""" value = value.strip() if len(value) >= 2 and value[0] == value[-1] and value[0] in {'"', "'"}: return value[1:-1] if value[:1] in {'"', "'"}: return value[1:] return value def _resolve_stream_encoding(stream: BufferedIOBase | TextIOBase) -> str | None: """Detect the most suitable stream encoding without reading the full file into memory.""" probe = stream.read(0) if isinstance(probe, str): return None for encoding in ("utf-8-sig", "cp1252", "latin-1"): try: _validate_stream_encoding(stream, encoding) return encoding except UnicodeDecodeError: continue raise UnicodeDecodeError("unknown", b"", 0, 1, "Unsupported text encoding.") def _validate_stream_encoding(stream: BufferedIOBase | TextIOBase, encoding: str) -> None: """Scan the stream to verify that the candidate encoding can decode it fully.""" _rewind_stream(stream) decoder = codecs.getincrementaldecoder(encoding)() for chunk in iter(lambda: stream.read(64 * 1024), b""): decoder.decode(chunk, final=False) decoder.decode(b"", final=True) _rewind_stream(stream) def _iter_physical_lines( stream: BufferedIOBase | TextIOBase, encoding: str | None, ): """Yield decoded physical lines from the uploaded stream without full-file buffering.""" _rewind_stream(stream) if encoding is None: for line_number, raw_line in enumerate(stream, start=1): yield line_number, raw_line return line_number = 1 decoder = codecs.getincrementaldecoder(encoding)() pending = "" for chunk in iter(lambda: stream.read(64 * 1024), b""): text = decoder.decode(chunk, final=False) pending += text while True: newline_index = pending.find("\n") if newline_index == -1: break line = pending[: newline_index + 1] pending = pending[newline_index + 1 :] yield line_number, line line_number += 1 pending += decoder.decode(b"", final=True) while True: newline_index = pending.find("\n") if newline_index == -1: break line = pending[: newline_index + 1] pending = pending[newline_index + 1 :] yield line_number, line line_number += 1 if pending: yield line_number, pending def _rewind_stream(stream: BufferedIOBase | TextIOBase) -> None: """Move the uploaded stream back to the start.""" if not hasattr(stream, "seek"): raise LogParseError("The uploaded file stream is not seekable.") stream.seek(0) def _parse_record(line: str, line_number: int) -> dict[str, str]: """Parse a logical record by locating `key=` boundaries instead of splitting on spaces.""" matches = list(KEY_PATTERN.finditer(line)) if not matches: raise LogParseError(f"Line {line_number}: no key=value pairs were found.") record: dict[str, str] = {} for index, match in enumerate(matches): key = match.group(1) value_start = match.end() value_end = matches[index + 1].start() if index + 1 < len(matches) else len(line) raw_value = line[value_start:value_end].strip() if raw_value and raw_value[:1] not in {'"', "'"} and any( char.isspace() for char in raw_value ): raise LogParseError( f"Line {line_number}: invalid unquoted value for key '{key}'." ) value = _normalize_value(raw_value) record[key] = value return record def _iter_logical_records(physical_lines): """Rebuild logical records when embedded newlines split a single log entry.""" current_record: list[str] = [] current_start_line: int | None = None for line_number, raw_line in physical_lines: line = raw_line.strip() if not line: continue if line.startswith(RECORD_PREFIX): if current_record and current_start_line is not None: yield current_start_line, "".join(current_record) current_record = [line] current_start_line = line_number continue if current_record: current_record.append(line) continue raise LogParseError( f"Line {line_number}: unexpected content before the first log record." ) if current_record and current_start_line is not None: yield current_start_line, "".join(current_record) def parse_log_file(stream: BufferedIOBase | TextIOBase) -> tuple[list[dict[str, str]], list[str]]: """Compatibility helper that still materializes all parsed records when needed.""" session = create_parse_session(stream) records = list(session.iter_records()) return records, session.union_keys()