Reduce conversion memory footprint
@@ -1,4 +1,6 @@
+import codecs
 from collections import OrderedDict
+from dataclasses import dataclass, field
 from io import BufferedIOBase, TextIOBase
 import re
 
@@ -10,18 +12,35 @@ class LogParseError(ValueError):
     """Raised when the uploaded log file cannot be parsed."""
 
 
-def _decode_log_content(raw_bytes: bytes | str) -> str:
-    """Decode uploaded log content using practical text encodings seen in exports."""
-    if isinstance(raw_bytes, str):
-        return raw_bytes
-
-    for encoding in ("utf-8-sig", "cp1252", "latin-1"):
-        try:
-            return raw_bytes.decode(encoding)
-        except UnicodeDecodeError:
-            continue
-
-    raise UnicodeDecodeError("unknown", b"", 0, 1, "Unsupported text encoding.")
+@dataclass(slots=True)
+class ParseSession:
+    """Stateful streamed parser for uploaded log files."""
+
+    stream: BufferedIOBase | TextIOBase
+    encoding: str | None
+    _union_keys: OrderedDict[str, None] = field(default_factory=OrderedDict)
+    parsed_count: int = 0
+    _consumed: bool = False
+
+    def iter_records(self):
+        if self._consumed:
+            raise RuntimeError("ParseSession records can only be consumed once.")
+
+        self._consumed = True
+        for line_number, line in _iter_logical_records(_iter_physical_lines(self.stream, self.encoding)):
+            record = _parse_record(line, line_number)
+            for key in record:
+                self._union_keys.setdefault(key, None)
+            self.parsed_count += 1
+            yield record
+
+    def union_keys(self) -> list[str]:
+        return list(self._union_keys.keys())
 
 
+def create_parse_session(stream: BufferedIOBase | TextIOBase) -> ParseSession:
+    """Prepare a streamed parser session without materializing the full upload in memory."""
+    return ParseSession(stream=stream, encoding=_resolve_stream_encoding(stream))
+
+
 def _normalize_value(value: str) -> str:
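A minimal usage sketch of the new session API (not part of the commit; the sample log line format is an assumption, since RECORD_PREFIX and KEY_PATTERN are defined outside this diff):

from io import BytesIO

upload = BytesIO(b"REC ts=1 level=info msg=started\nREC ts=2 level=warn msg=ok\n")
session = create_parse_session(upload)
for record in session.iter_records():  # records are yielded lazily, one dict at a time
    print(record)
print(session.union_keys())   # ordered union of keys across all yielded records
print(session.parsed_count)   # running count, final once iteration completes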
@@ -34,6 +53,80 @@ def _normalize_value(value: str) -> str:
     return value
 
 
+def _resolve_stream_encoding(stream: BufferedIOBase | TextIOBase) -> str | None:
+    """Detect the most suitable stream encoding without reading the full file into memory."""
+    probe = stream.read(0)
+    if isinstance(probe, str):
+        return None
+
+    for encoding in ("utf-8-sig", "cp1252", "latin-1"):
+        try:
+            _validate_stream_encoding(stream, encoding)
+            return encoding
+        except UnicodeDecodeError:
+            continue
+
+    raise UnicodeDecodeError("unknown", b"", 0, 1, "Unsupported text encoding.")
+
+
+def _validate_stream_encoding(stream: BufferedIOBase | TextIOBase, encoding: str) -> None:
+    """Scan the stream to verify that the candidate encoding can decode it fully."""
+    _rewind_stream(stream)
+    decoder = codecs.getincrementaldecoder(encoding)()
+    for chunk in iter(lambda: stream.read(64 * 1024), b""):
+        decoder.decode(chunk, final=False)
+    decoder.decode(b"", final=True)
+    _rewind_stream(stream)
+
+
+def _iter_physical_lines(
+    stream: BufferedIOBase | TextIOBase,
+    encoding: str | None,
+):
+    """Yield decoded physical lines from the uploaded stream without full-file buffering."""
+    _rewind_stream(stream)
+
+    if encoding is None:
+        for line_number, raw_line in enumerate(stream, start=1):
+            yield line_number, raw_line
+        return
+
+    line_number = 1
+    decoder = codecs.getincrementaldecoder(encoding)()
+    pending = ""
+    for chunk in iter(lambda: stream.read(64 * 1024), b""):
+        text = decoder.decode(chunk, final=False)
+        pending += text
+        while True:
+            newline_index = pending.find("\n")
+            if newline_index == -1:
+                break
+            line = pending[: newline_index + 1]
+            pending = pending[newline_index + 1 :]
+            yield line_number, line
+            line_number += 1
+
+    pending += decoder.decode(b"", final=True)
+    while True:
+        newline_index = pending.find("\n")
+        if newline_index == -1:
+            break
+        line = pending[: newline_index + 1]
+        pending = pending[newline_index + 1 :]
+        yield line_number, line
+        line_number += 1
+
+    if pending:
+        yield line_number, pending
+
+
+def _rewind_stream(stream: BufferedIOBase | TextIOBase) -> None:
+    """Move the uploaded stream back to the start."""
+    if not hasattr(stream, "seek"):
+        raise LogParseError("The uploaded file stream is not seekable.")
+    stream.seek(0)
+
+
 def _parse_record(line: str, line_number: int) -> dict[str, str]:
     """Parse a logical record by locating `key=` boundaries instead of splitting on spaces."""
     matches = list(KEY_PATTERN.finditer(line))
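Why the incremental decoder rather than bytes.decode() per chunk: a multi-byte character can straddle a 64 KiB chunk boundary, and decoding each chunk independently would raise mid-character. A standalone sketch (not from the commit) of the behavior the validation and line-splitting code above relies on:

import codecs

decoder = codecs.getincrementaldecoder("utf-8-sig")()
chunks = [b"caf\xc3", b"\xa9\n"]  # "café\n" split in the middle of a UTF-8 sequence
text = "".join(decoder.decode(chunk, final=False) for chunk in chunks)
text += decoder.decode(b"", final=True)  # flush; raises if trailing bytes are incomplete
assert text == "café\n"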
@@ -58,20 +151,19 @@ def _parse_record(line: str, line_number: int) -> dict[str, str]:
     return record
 
 
-def _iter_logical_records(content: str) -> list[tuple[int, str]]:
+def _iter_logical_records(physical_lines):
     """Rebuild logical records when embedded newlines split a single log entry."""
-    records: list[tuple[int, str]] = []
     current_record: list[str] = []
    current_start_line: int | None = None
 
-    for line_number, raw_line in enumerate(content.splitlines(), start=1):
+    for line_number, raw_line in physical_lines:
         line = raw_line.strip()
         if not line:
             continue
 
         if line.startswith(RECORD_PREFIX):
             if current_record and current_start_line is not None:
-                records.append((current_start_line, "".join(current_record)))
+                yield current_start_line, "".join(current_record)
             current_record = [line]
             current_start_line = line_number
             continue
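With both halves converted to generators, the pipeline _iter_logical_records(_iter_physical_lines(stream, encoding)) pulls one physical line at a time and holds only the record currently being assembled. A small sketch (hypothetical input; it assumes RECORD_PREFIX is "REC", a constant defined outside this diff, and exercises only the branches shown above):

lines = iter([
    (1, "REC ts=1 msg=alpha\n"),
    (2, "\n"),                    # blank physical lines are skipped
    (3, "REC ts=2 msg=beta\n"),   # starting a new record flushes the previous one
])
for start_line, logical in _iter_logical_records(lines):
    print(start_line, logical)    # (1, "REC ts=1 msg=alpha"), then (3, "REC ts=2 msg=beta")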
@@ -85,24 +177,11 @@ def _iter_logical_records(content: str) -> list[tuple[int, str]]:
             )
 
     if current_record and current_start_line is not None:
-        records.append((current_start_line, "".join(current_record)))
-
-    return records
+        yield current_start_line, "".join(current_record)
 
 
 def parse_log_file(stream: BufferedIOBase | TextIOBase) -> tuple[list[dict[str, str]], list[str]]:
-    """Parse a text log file where each line contains shell-like key/value tokens."""
-    raw_bytes = stream.read()
-    content = _decode_log_content(raw_bytes)
-
-    records: list[dict[str, str]] = []
-    seen_keys: OrderedDict[str, None] = OrderedDict()
-
-    for line_number, line in _iter_logical_records(content):
-        record = _parse_record(line, line_number)
-        for key in record:
-            seen_keys.setdefault(key, None)
-
-        records.append(record)
-
-    return records, list(seen_keys.keys())
+    """Compatibility helper that still materializes all parsed records when needed."""
+    session = create_parse_session(stream)
+    records = list(session.iter_records())
+    return records, session.union_keys()
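Call-site impact, as a sketch (the file name and the handle() consumer are illustrative, not part of the commit): existing callers of parse_log_file keep working unchanged, while memory-sensitive callers can switch to the session API:

with open("export.log", "rb") as fh:
    records, keys = parse_log_file(fh)      # eager: full record list in memory

with open("export.log", "rb") as fh:
    session = create_parse_session(fh)
    for record in session.iter_records():   # lazy: one record at a time
        handle(record)                       # hypothetical per-record consumer
    keys = session.union_keys()              # union is complete after iteration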