Reduce conversion memory footprint

2026-04-27 11:44:40 +02:00
parent 9313b54abb
commit f9f792f6a1
10 changed files with 324 additions and 102 deletions
--- a/app/services/exporter.py
+++ b/app/services/exporter.py
@@ -1,69 +1,107 @@
 import csv
 import io
 from dataclasses import dataclass
+from pathlib import Path
+from typing import Sequence, TextIO

 from app.constants import VENDOR_FIELDS


@dataclass(slots=True)
 class ExportResult:
-    content: str
    columns: list[str]
    output_format: str
+    preview_text: str

-    def preview(self, record_limit: int) -> str:
-        """Build a small preview string for the result page."""
-        if self.output_format == "text":
-            marker = f"--- record {record_limit + 1} ---"
-            if marker in self.content:
-                return self.content.split(marker, 1)[0].rstrip()
-            return self.content
-
-        lines = self.content.splitlines()
-        if len(lines) <= record_limit + 1:
-            return self.content
-        return "\n".join(lines[: record_limit + 1])
+    def preview(self, _record_limit: int) -> str:
+        """Return the preview that was collected during export writing."""
+        return self.preview_text


-def build_export(
-    records: list[dict[str, str]],
+def write_export(
+    file_path: Path,
+    records: Sequence[dict[str, str]],
    union_keys: list[str],
    mode: str,
    output_format: str,
+    preview_record_limit: int,
 ) -> ExportResult:
+    """Write the final export directly to disk and keep only a small preview in memory."""
    columns = VENDOR_FIELDS if mode == "vendor" else union_keys

-    if output_format == "text":
-        return ExportResult(
-            content=_render_text(records, columns),
-            columns=columns,
-            output_format=output_format,
-        )
+    with file_path.open("w", encoding="utf-8", newline="") as export_file:
+        if output_format == "text":
+            preview_text = _write_text(
+                export_file=export_file,
+                records=records,
+                columns=columns,
+                preview_record_limit=preview_record_limit,
+            )
+        else:
+            preview_text = _write_csv(
+                export_file=export_file,
+                records=records,
+                columns=columns,
+                preview_record_limit=preview_record_limit,
+            )

    return ExportResult(
-        content=_render_csv(records, columns),
        columns=columns,
        output_format=output_format,
+        preview_text=preview_text,
    )


-def _render_text(records: list[dict[str, str]], columns: list[str]) -> str:
+def _write_text(
+    export_file: TextIO,
+    records: Sequence[dict[str, str]],
+    columns: list[str],
+    preview_record_limit: int,
+) -> str:
    max_key_length = max((len(column) for column in columns), default=0)
-    chunks: list[str] = []
+    preview_lines: list[str] = []
+    wrote_line = False

    for index, record in enumerate(records, start=1):
-        chunks.append(f"--- record {index} ---")
+        header = f"--- record {index} ---"
+        wrote_line = _write_line(export_file, header, wrote_line)
+        if index <= preview_record_limit:
+            preview_lines.append(header)
+
        for column in columns:
-            value = record.get(column, "")
-            chunks.append(f"  {column.ljust(max_key_length)} = {value}")
+            line = f"  {column.ljust(max_key_length)} = {record.get(column, '')}"
+            wrote_line = _write_line(export_file, line, wrote_line)
+            if index <= preview_record_limit:
+                preview_lines.append(line)

-    return "\n".join(chunks)
+    return "\n".join(preview_lines)


-def _render_csv(records: list[dict[str, str]], columns: list[str]) -> str:
-    buffer = io.StringIO()
-    writer = csv.DictWriter(buffer, fieldnames=columns, extrasaction="ignore")
+def _write_csv(
+    export_file: TextIO,
+    records: Sequence[dict[str, str]],
+    columns: list[str],
+    preview_record_limit: int,
+) -> str:
+    writer = csv.DictWriter(export_file, fieldnames=columns, extrasaction="ignore")
    writer.writeheader()
-    for record in records:
-        writer.writerow({column: record.get(column, "") for column in columns})
-    return buffer.getvalue()
+
+    preview_buffer = io.StringIO()
+    preview_writer = csv.DictWriter(preview_buffer, fieldnames=columns, extrasaction="ignore")
+    preview_writer.writeheader()
+
+    for index, record in enumerate(records, start=1):
+        row = {column: record.get(column, "") for column in columns}
+        writer.writerow(row)
+        if index <= preview_record_limit:
+            preview_writer.writerow(row)
+
+    return preview_buffer.getvalue().rstrip("\n")
+
+
+def _write_line(export_file: TextIO, line: str, wrote_line: bool) -> bool:
+    """Write lines without leaving a trailing newline at the end of the file."""
+    if wrote_line:
+        export_file.write("\n")
+    export_file.write(line)
+    return True