Improve ingestion concurrency safety and batch transaction robustness

2026-03-10 16:37:29 +01:00
parent abd3419aac
commit 2252821daf
5 changed files with 202 additions and 44 deletions
--- a/apps/ingestion/services/runs.py
+++ b/apps/ingestion/services/runs.py
@@ -1,6 +1,7 @@
-from datetime import timedelta
+import hashlib
+from contextlib import contextmanager

-from django.db.models import Q
+from django.db import connection
 from django.utils import timezone

 from apps.ingestion.models import IngestionError, IngestionRun
@@ -62,14 +63,59 @@ def mark_ingestion_run_skipped(*, provider_namespace: str, job_type: str, reason
    return run


-def has_running_ingestion_run(*, provider_namespace: str, job_type: str, within_minutes: int) -> bool:
-    cutoff = timezone.now() - timedelta(minutes=max(within_minutes, 1))
-    return IngestionRun.objects.filter(
+def _build_ingestion_lock_key(*, provider_namespace: str, job_type: str) -> int:
+    digest = hashlib.blake2b(
+        f"{provider_namespace}:{job_type}".encode("utf-8"),
+        digest_size=8,
+    ).digest()
+    return int.from_bytes(digest, byteorder="big", signed=True)
+
+
+def try_acquire_ingestion_lock(*, provider_namespace: str, job_type: str) -> tuple[bool, int]:
+    lock_key = _build_ingestion_lock_key(provider_namespace=provider_namespace, job_type=job_type)
+    with connection.cursor() as cursor:
+        cursor.execute("SELECT pg_try_advisory_lock(%s);", [lock_key])
+        acquired = bool(cursor.fetchone()[0])
+    return acquired, lock_key
+
+
+def release_ingestion_lock(*, lock_key: int) -> None:
+    with connection.cursor() as cursor:
+        cursor.execute("SELECT pg_advisory_unlock(%s);", [lock_key])
+
+
+@contextmanager
+def ingestion_advisory_lock(*, provider_namespace: str, job_type: str):
+    acquired, lock_key = try_acquire_ingestion_lock(
        provider_namespace=provider_namespace,
        job_type=job_type,
-        status=IngestionRun.RunStatus.RUNNING,
-        started_at__gte=cutoff,
-    ).filter(Q(finished_at__isnull=True) | Q(finished_at__gte=cutoff)).exists()
+    )
+    try:
+        yield acquired
+    finally:
+        if acquired:
+            release_ingestion_lock(lock_key=lock_key)
+
+
+def update_ingestion_run_progress(
+    *,
+    run: IngestionRun,
+    completed_step: str,
+    step_summary: dict,
+    source_counts: dict | None = None,
+) -> IngestionRun:
+    context = dict(run.context or {})
+    completed_steps = list(context.get("completed_steps") or [])
+    completed_steps.append(completed_step)
+    context["completed_steps"] = completed_steps
+    step_summaries = dict(context.get("step_summaries") or {})
+    step_summaries[completed_step] = step_summary
+    context["step_summaries"] = step_summaries
+    if source_counts is not None:
+        context["source_counts"] = source_counts
+    run.context = context
+    run.save(update_fields=["context"])
+    return run


 def log_ingestion_error(*, run: IngestionRun, message: str, provider_namespace: str, severity: str = IngestionError.Severity.ERROR, entity_type: str = "", external_id: str = "", raw_payload: dict | None = None) -> IngestionError: