Wire Celery Beat periodic sync with ingestion run tracking
This commit is contained in:
@ -55,6 +55,12 @@ PROVIDER_BALLDONTLIE_STATS_PAGE_LIMIT=10
|
|||||||
PROVIDER_BALLDONTLIE_STATS_PER_PAGE=100
|
PROVIDER_BALLDONTLIE_STATS_PER_PAGE=100
|
||||||
CELERY_TASK_TIME_LIMIT=1800
|
CELERY_TASK_TIME_LIMIT=1800
|
||||||
CELERY_TASK_SOFT_TIME_LIMIT=1500
|
CELERY_TASK_SOFT_TIME_LIMIT=1500
|
||||||
|
INGESTION_SCHEDULE_ENABLED=0
|
||||||
|
INGESTION_SCHEDULE_CRON=*/30 * * * *
|
||||||
|
INGESTION_SCHEDULE_PROVIDER_NAMESPACE=
|
||||||
|
INGESTION_SCHEDULE_JOB_TYPE=incremental
|
||||||
|
INGESTION_PREVENT_OVERLAP=1
|
||||||
|
INGESTION_OVERLAP_WINDOW_MINUTES=180
|
||||||
API_THROTTLE_ANON=100/hour
|
API_THROTTLE_ANON=100/hour
|
||||||
API_THROTTLE_USER=1000/hour
|
API_THROTTLE_USER=1000/hour
|
||||||
|
|
||||||
|
|||||||
17
README.md
17
README.md
@ -97,7 +97,7 @@ docker compose exec web python manage.py createsuperuser
|
|||||||
- `web` service also builds Tailwind CSS before `collectstatic` when `AUTO_BUILD_TAILWIND=1`.
|
- `web` service also builds Tailwind CSS before `collectstatic` when `AUTO_BUILD_TAILWIND=1`.
|
||||||
- `web`, `celery_worker`, `celery_beat`, and `tailwind` run as a non-root user inside the image.
|
- `web`, `celery_worker`, `celery_beat`, and `tailwind` run as a non-root user inside the image.
|
||||||
- `celery_worker` executes background sync work.
|
- `celery_worker` executes background sync work.
|
||||||
- `celery_beat` supports scheduled jobs (future scheduling strategy can be added per provider).
|
- `celery_beat` triggers periodic provider sync (`apps.ingestion.tasks.scheduled_provider_sync`).
|
||||||
- `tailwind` service runs watch mode for development (`npm run dev`).
|
- `tailwind` service runs watch mode for development (`npm run dev`).
|
||||||
- nginx proxies web traffic and serves static/media volume mounts.
|
- nginx proxies web traffic and serves static/media volume mounts.
|
||||||
|
|
||||||
@ -222,6 +222,21 @@ trigger_full_sync.delay(provider_namespace="balldontlie")
|
|||||||
- Run-level status/counters: `IngestionRun`
|
- Run-level status/counters: `IngestionRun`
|
||||||
- Structured error records: `IngestionError`
|
- Structured error records: `IngestionError`
|
||||||
- Provider entity mappings + diagnostic payload snippets: `ExternalMapping`
|
- Provider entity mappings + diagnostic payload snippets: `ExternalMapping`
|
||||||
|
- `IngestionRun.error_summary` captures top-level failure/partial-failure context
|
||||||
|
|
||||||
|
### Scheduled sync via Celery Beat
|
||||||
|
|
||||||
|
Configure scheduled sync through environment variables:
|
||||||
|
|
||||||
|
- `INGESTION_SCHEDULE_ENABLED` (`0`/`1`)
|
||||||
|
- `INGESTION_SCHEDULE_CRON` (5-field cron expression, default `*/30 * * * *`)
|
||||||
|
- `INGESTION_SCHEDULE_PROVIDER_NAMESPACE` (optional; falls back to default provider namespace)
|
||||||
|
- `INGESTION_SCHEDULE_JOB_TYPE` (`incremental` or `full_sync`)
|
||||||
|
- `INGESTION_PREVENT_OVERLAP` (`0`/`1`) to skip obvious overlapping runs
|
||||||
|
- `INGESTION_OVERLAP_WINDOW_MINUTES` overlap guard window
|
||||||
|
|
||||||
|
When enabled, Celery Beat enqueues the scheduled sync task on the configured cron.
|
||||||
|
The task uses the existing ingestion service path and writes run/error records in the same tables as manual sync.
|
||||||
|
|
||||||
## Provider Backend Selection
|
## Provider Backend Selection
|
||||||
|
|
||||||
|
|||||||
@ -1,5 +1,6 @@
|
|||||||
from django.contrib import admin
|
from django.contrib import admin
|
||||||
from django.contrib import messages
|
from django.contrib import messages
|
||||||
|
from django.db.models import Count
|
||||||
|
|
||||||
from apps.providers.registry import get_default_provider_namespace
|
from apps.providers.registry import get_default_provider_namespace
|
||||||
|
|
||||||
@ -20,7 +21,11 @@ class IngestionRunAdmin(admin.ModelAdmin):
|
|||||||
"job_type",
|
"job_type",
|
||||||
"status",
|
"status",
|
||||||
"records_processed",
|
"records_processed",
|
||||||
|
"records_created",
|
||||||
|
"records_updated",
|
||||||
"records_failed",
|
"records_failed",
|
||||||
|
"error_count",
|
||||||
|
"short_error_summary",
|
||||||
"started_at",
|
"started_at",
|
||||||
"finished_at",
|
"finished_at",
|
||||||
)
|
)
|
||||||
@ -38,6 +43,7 @@ class IngestionRunAdmin(admin.ModelAdmin):
|
|||||||
"records_created",
|
"records_created",
|
||||||
"records_updated",
|
"records_updated",
|
||||||
"records_failed",
|
"records_failed",
|
||||||
|
"error_summary",
|
||||||
"context",
|
"context",
|
||||||
"raw_payload",
|
"raw_payload",
|
||||||
"created_at",
|
"created_at",
|
||||||
@ -79,6 +85,20 @@ class IngestionRunAdmin(admin.ModelAdmin):
|
|||||||
count += 1
|
count += 1
|
||||||
self.message_user(request, f"Queued {count} retry task(s).", level=messages.SUCCESS)
|
self.message_user(request, f"Queued {count} retry task(s).", level=messages.SUCCESS)
|
||||||
|
|
||||||
|
def get_queryset(self, request):
|
||||||
|
queryset = super().get_queryset(request)
|
||||||
|
return queryset.annotate(_error_count=Count("errors"))
|
||||||
|
|
||||||
|
@admin.display(ordering="_error_count", description="Errors")
|
||||||
|
def error_count(self, obj):
|
||||||
|
return getattr(obj, "_error_count", 0)
|
||||||
|
|
||||||
|
@admin.display(description="Error summary")
|
||||||
|
def short_error_summary(self, obj):
|
||||||
|
if not obj.error_summary:
|
||||||
|
return "-"
|
||||||
|
return (obj.error_summary[:90] + "...") if len(obj.error_summary) > 90 else obj.error_summary
|
||||||
|
|
||||||
|
|
||||||
@admin.register(IngestionError)
|
@admin.register(IngestionError)
|
||||||
class IngestionErrorAdmin(admin.ModelAdmin):
|
class IngestionErrorAdmin(admin.ModelAdmin):
|
||||||
|
|||||||
18
apps/ingestion/migrations/0002_ingestionrun_error_summary.py
Normal file
18
apps/ingestion/migrations/0002_ingestionrun_error_summary.py
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
# Generated by Django 5.2.12 on 2026-03-10 16:12
|
||||||
|
|
||||||
|
from django.db import migrations, models
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
|
||||||
|
|
||||||
|
dependencies = [
|
||||||
|
("ingestion", "0001_initial"),
|
||||||
|
]
|
||||||
|
|
||||||
|
operations = [
|
||||||
|
migrations.AddField(
|
||||||
|
model_name="ingestionrun",
|
||||||
|
name="error_summary",
|
||||||
|
field=models.TextField(blank=True, default=""),
|
||||||
|
),
|
||||||
|
]
|
||||||
@ -31,6 +31,7 @@ class IngestionRun(models.Model):
|
|||||||
records_created = models.PositiveIntegerField(default=0)
|
records_created = models.PositiveIntegerField(default=0)
|
||||||
records_updated = models.PositiveIntegerField(default=0)
|
records_updated = models.PositiveIntegerField(default=0)
|
||||||
records_failed = models.PositiveIntegerField(default=0)
|
records_failed = models.PositiveIntegerField(default=0)
|
||||||
|
error_summary = models.TextField(blank=True, default="")
|
||||||
context = models.JSONField(default=dict, blank=True)
|
context = models.JSONField(default=dict, blank=True)
|
||||||
raw_payload = models.JSONField(default=dict, blank=True)
|
raw_payload = models.JSONField(default=dict, blank=True)
|
||||||
created_at = models.DateTimeField(auto_now_add=True)
|
created_at = models.DateTimeField(auto_now_add=True)
|
||||||
|
|||||||
@ -1,3 +1,6 @@
|
|||||||
|
from datetime import timedelta
|
||||||
|
|
||||||
|
from django.db.models import Q
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
|
|
||||||
from apps.ingestion.models import IngestionError, IngestionRun
|
from apps.ingestion.models import IngestionError, IngestionRun
|
||||||
@ -14,12 +17,22 @@ def start_ingestion_run(*, provider_namespace: str, job_type: str, triggered_by=
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def finish_ingestion_run(*, run: IngestionRun, status: str, processed: int = 0, created: int = 0, updated: int = 0, failed: int = 0) -> IngestionRun:
|
def finish_ingestion_run(
|
||||||
|
*,
|
||||||
|
run: IngestionRun,
|
||||||
|
status: str,
|
||||||
|
processed: int = 0,
|
||||||
|
created: int = 0,
|
||||||
|
updated: int = 0,
|
||||||
|
failed: int = 0,
|
||||||
|
error_summary: str = "",
|
||||||
|
) -> IngestionRun:
|
||||||
run.status = status
|
run.status = status
|
||||||
run.records_processed = processed
|
run.records_processed = processed
|
||||||
run.records_created = created
|
run.records_created = created
|
||||||
run.records_updated = updated
|
run.records_updated = updated
|
||||||
run.records_failed = failed
|
run.records_failed = failed
|
||||||
|
run.error_summary = error_summary
|
||||||
run.finished_at = timezone.now()
|
run.finished_at = timezone.now()
|
||||||
run.save(
|
run.save(
|
||||||
update_fields=[
|
update_fields=[
|
||||||
@ -28,12 +41,37 @@ def finish_ingestion_run(*, run: IngestionRun, status: str, processed: int = 0,
|
|||||||
"records_created",
|
"records_created",
|
||||||
"records_updated",
|
"records_updated",
|
||||||
"records_failed",
|
"records_failed",
|
||||||
|
"error_summary",
|
||||||
"finished_at",
|
"finished_at",
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
return run
|
return run
|
||||||
|
|
||||||
|
|
||||||
|
def mark_ingestion_run_skipped(*, provider_namespace: str, job_type: str, reason: str, context: dict | None = None) -> IngestionRun:
|
||||||
|
now = timezone.now()
|
||||||
|
run = IngestionRun.objects.create(
|
||||||
|
provider_namespace=provider_namespace,
|
||||||
|
job_type=job_type,
|
||||||
|
status=IngestionRun.RunStatus.CANCELED,
|
||||||
|
started_at=now,
|
||||||
|
finished_at=now,
|
||||||
|
error_summary=reason,
|
||||||
|
context=context or {},
|
||||||
|
)
|
||||||
|
return run
|
||||||
|
|
||||||
|
|
||||||
|
def has_running_ingestion_run(*, provider_namespace: str, job_type: str, within_minutes: int) -> bool:
|
||||||
|
cutoff = timezone.now() - timedelta(minutes=max(within_minutes, 1))
|
||||||
|
return IngestionRun.objects.filter(
|
||||||
|
provider_namespace=provider_namespace,
|
||||||
|
job_type=job_type,
|
||||||
|
status=IngestionRun.RunStatus.RUNNING,
|
||||||
|
started_at__gte=cutoff,
|
||||||
|
).filter(Q(finished_at__isnull=True) | Q(finished_at__gte=cutoff)).exists()
|
||||||
|
|
||||||
|
|
||||||
def log_ingestion_error(*, run: IngestionRun, message: str, provider_namespace: str, severity: str = IngestionError.Severity.ERROR, entity_type: str = "", external_id: str = "", raw_payload: dict | None = None) -> IngestionError:
|
def log_ingestion_error(*, run: IngestionRun, message: str, provider_namespace: str, severity: str = IngestionError.Severity.ERROR, entity_type: str = "", external_id: str = "", raw_payload: dict | None = None) -> IngestionError:
|
||||||
return IngestionError.objects.create(
|
return IngestionError.objects.create(
|
||||||
ingestion_run=run,
|
ingestion_run=run,
|
||||||
|
|||||||
@ -427,6 +427,12 @@ def run_sync_job(
|
|||||||
context=context or {},
|
context=context or {},
|
||||||
)
|
)
|
||||||
summary = SyncSummary()
|
summary = SyncSummary()
|
||||||
|
logger.info(
|
||||||
|
"Starting ingestion run id=%s provider=%s job_type=%s",
|
||||||
|
run.id,
|
||||||
|
provider_namespace,
|
||||||
|
job_type,
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
provider = get_provider(provider_namespace)
|
provider = get_provider(provider_namespace)
|
||||||
@ -444,6 +450,9 @@ def run_sync_job(
|
|||||||
_sync_player_stats(provider_namespace, payload.get("player_stats", []), run, summary)
|
_sync_player_stats(provider_namespace, payload.get("player_stats", []), run, summary)
|
||||||
_sync_player_careers(provider_namespace, payload.get("player_careers", []), run, summary)
|
_sync_player_careers(provider_namespace, payload.get("player_careers", []), run, summary)
|
||||||
|
|
||||||
|
success_error_summary = ""
|
||||||
|
if summary.failed > 0:
|
||||||
|
success_error_summary = f"Completed with {summary.failed} failed record(s)."
|
||||||
finish_ingestion_run(
|
finish_ingestion_run(
|
||||||
run=run,
|
run=run,
|
||||||
status=IngestionRun.RunStatus.SUCCESS,
|
status=IngestionRun.RunStatus.SUCCESS,
|
||||||
@ -451,6 +460,16 @@ def run_sync_job(
|
|||||||
created=summary.created,
|
created=summary.created,
|
||||||
updated=summary.updated,
|
updated=summary.updated,
|
||||||
failed=summary.failed,
|
failed=summary.failed,
|
||||||
|
error_summary=success_error_summary,
|
||||||
|
)
|
||||||
|
logger.info(
|
||||||
|
"Completed ingestion run id=%s status=%s processed=%s created=%s updated=%s failed=%s",
|
||||||
|
run.id,
|
||||||
|
IngestionRun.RunStatus.SUCCESS,
|
||||||
|
summary.processed,
|
||||||
|
summary.created,
|
||||||
|
summary.updated,
|
||||||
|
summary.failed,
|
||||||
)
|
)
|
||||||
return run
|
return run
|
||||||
|
|
||||||
@ -471,6 +490,7 @@ def run_sync_job(
|
|||||||
created=summary.created,
|
created=summary.created,
|
||||||
updated=summary.updated,
|
updated=summary.updated,
|
||||||
failed=summary.failed + 1,
|
failed=summary.failed + 1,
|
||||||
|
error_summary=f"Rate limit from provider: {exc}",
|
||||||
)
|
)
|
||||||
raise
|
raise
|
||||||
|
|
||||||
@ -490,6 +510,7 @@ def run_sync_job(
|
|||||||
created=summary.created,
|
created=summary.created,
|
||||||
updated=summary.updated,
|
updated=summary.updated,
|
||||||
failed=summary.failed + 1,
|
failed=summary.failed + 1,
|
||||||
|
error_summary=f"Transient provider error: {exc}",
|
||||||
)
|
)
|
||||||
raise
|
raise
|
||||||
|
|
||||||
@ -509,5 +530,6 @@ def run_sync_job(
|
|||||||
created=summary.created,
|
created=summary.created,
|
||||||
updated=summary.updated,
|
updated=summary.updated,
|
||||||
failed=summary.failed + 1,
|
failed=summary.failed + 1,
|
||||||
|
error_summary=f"Unhandled ingestion error: {exc}",
|
||||||
)
|
)
|
||||||
raise
|
raise
|
||||||
|
|||||||
@ -1,8 +1,51 @@
|
|||||||
|
import logging
|
||||||
|
|
||||||
from celery import shared_task
|
from celery import shared_task
|
||||||
|
from django.conf import settings
|
||||||
|
|
||||||
from apps.ingestion.models import IngestionRun
|
from apps.ingestion.models import IngestionRun
|
||||||
|
from apps.ingestion.services.runs import has_running_ingestion_run, mark_ingestion_run_skipped
|
||||||
from apps.ingestion.services.sync import run_sync_job
|
from apps.ingestion.services.sync import run_sync_job
|
||||||
from apps.providers.exceptions import ProviderRateLimitError, ProviderTransientError
|
from apps.providers.exceptions import ProviderRateLimitError, ProviderTransientError
|
||||||
|
from apps.providers.registry import get_default_provider_namespace
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _run_sync_with_overlap_guard(
|
||||||
|
*,
|
||||||
|
provider_namespace: str,
|
||||||
|
job_type: str,
|
||||||
|
triggered_by_id: int | None = None,
|
||||||
|
context: dict | None = None,
|
||||||
|
cursor: str | None = None,
|
||||||
|
):
|
||||||
|
effective_context = context or {}
|
||||||
|
if settings.INGESTION_PREVENT_OVERLAP and has_running_ingestion_run(
|
||||||
|
provider_namespace=provider_namespace,
|
||||||
|
job_type=job_type,
|
||||||
|
within_minutes=settings.INGESTION_OVERLAP_WINDOW_MINUTES,
|
||||||
|
):
|
||||||
|
reason = (
|
||||||
|
f"Skipped due to overlapping running job for provider={provider_namespace}, "
|
||||||
|
f"job_type={job_type}."
|
||||||
|
)
|
||||||
|
logger.warning(reason)
|
||||||
|
run = mark_ingestion_run_skipped(
|
||||||
|
provider_namespace=provider_namespace,
|
||||||
|
job_type=job_type,
|
||||||
|
reason=reason,
|
||||||
|
context=effective_context,
|
||||||
|
)
|
||||||
|
return run.id
|
||||||
|
|
||||||
|
return run_sync_job(
|
||||||
|
provider_namespace=provider_namespace,
|
||||||
|
job_type=job_type,
|
||||||
|
triggered_by_id=triggered_by_id,
|
||||||
|
context=effective_context,
|
||||||
|
cursor=cursor,
|
||||||
|
).id
|
||||||
|
|
||||||
|
|
||||||
@shared_task(
|
@shared_task(
|
||||||
@ -13,12 +56,12 @@ from apps.providers.exceptions import ProviderRateLimitError, ProviderTransientE
|
|||||||
retry_kwargs={"max_retries": 5},
|
retry_kwargs={"max_retries": 5},
|
||||||
)
|
)
|
||||||
def trigger_full_sync(self, provider_namespace: str, triggered_by_id: int | None = None, context: dict | None = None):
|
def trigger_full_sync(self, provider_namespace: str, triggered_by_id: int | None = None, context: dict | None = None):
|
||||||
return run_sync_job(
|
return _run_sync_with_overlap_guard(
|
||||||
provider_namespace=provider_namespace,
|
provider_namespace=provider_namespace,
|
||||||
job_type=IngestionRun.JobType.FULL_SYNC,
|
job_type=IngestionRun.JobType.FULL_SYNC,
|
||||||
triggered_by_id=triggered_by_id,
|
triggered_by_id=triggered_by_id,
|
||||||
context=context or {},
|
context=context or {},
|
||||||
).id
|
)
|
||||||
|
|
||||||
|
|
||||||
@shared_task(
|
@shared_task(
|
||||||
@ -29,10 +72,34 @@ def trigger_full_sync(self, provider_namespace: str, triggered_by_id: int | None
|
|||||||
retry_kwargs={"max_retries": 5},
|
retry_kwargs={"max_retries": 5},
|
||||||
)
|
)
|
||||||
def trigger_incremental_sync(self, provider_namespace: str, cursor: str | None = None, triggered_by_id: int | None = None, context: dict | None = None):
|
def trigger_incremental_sync(self, provider_namespace: str, cursor: str | None = None, triggered_by_id: int | None = None, context: dict | None = None):
|
||||||
return run_sync_job(
|
return _run_sync_with_overlap_guard(
|
||||||
provider_namespace=provider_namespace,
|
provider_namespace=provider_namespace,
|
||||||
job_type=IngestionRun.JobType.INCREMENTAL,
|
job_type=IngestionRun.JobType.INCREMENTAL,
|
||||||
triggered_by_id=triggered_by_id,
|
triggered_by_id=triggered_by_id,
|
||||||
context=context or {},
|
context=context or {},
|
||||||
cursor=cursor,
|
cursor=cursor,
|
||||||
).id
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@shared_task(
|
||||||
|
bind=True,
|
||||||
|
name="apps.ingestion.tasks.scheduled_provider_sync",
|
||||||
|
autoretry_for=(ProviderRateLimitError, ProviderTransientError),
|
||||||
|
retry_backoff=True,
|
||||||
|
retry_jitter=True,
|
||||||
|
retry_kwargs={"max_retries": 5},
|
||||||
|
)
|
||||||
|
def scheduled_provider_sync(self):
|
||||||
|
provider_namespace = settings.INGESTION_SCHEDULE_PROVIDER_NAMESPACE or get_default_provider_namespace()
|
||||||
|
context = {"trigger": "celery_beat", "task_id": self.request.id}
|
||||||
|
if settings.INGESTION_SCHEDULE_JOB_TYPE == IngestionRun.JobType.FULL_SYNC:
|
||||||
|
return _run_sync_with_overlap_guard(
|
||||||
|
provider_namespace=provider_namespace,
|
||||||
|
job_type=IngestionRun.JobType.FULL_SYNC,
|
||||||
|
context=context,
|
||||||
|
)
|
||||||
|
return _run_sync_with_overlap_guard(
|
||||||
|
provider_namespace=provider_namespace,
|
||||||
|
job_type=IngestionRun.JobType.INCREMENTAL,
|
||||||
|
context=context,
|
||||||
|
)
|
||||||
|
|||||||
@ -1,8 +1,41 @@
|
|||||||
import os
|
import os
|
||||||
from celery import Celery
|
from celery import Celery
|
||||||
|
from celery.schedules import crontab
|
||||||
|
from django.conf import settings
|
||||||
|
|
||||||
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings.development")
|
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings.development")
|
||||||
|
|
||||||
app = Celery("hoopscout")
|
app = Celery("hoopscout")
|
||||||
app.config_from_object("django.conf:settings", namespace="CELERY")
|
app.config_from_object("django.conf:settings", namespace="CELERY")
|
||||||
app.autodiscover_tasks()
|
app.autodiscover_tasks()
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_cron_expression(expression: str) -> dict[str, str]:
|
||||||
|
parts = expression.split()
|
||||||
|
if len(parts) != 5:
|
||||||
|
raise ValueError(
|
||||||
|
"INGESTION_SCHEDULE_CRON must have 5 fields: minute hour day_of_month month_of_year day_of_week."
|
||||||
|
)
|
||||||
|
return {
|
||||||
|
"minute": parts[0],
|
||||||
|
"hour": parts[1],
|
||||||
|
"day_of_month": parts[2],
|
||||||
|
"month_of_year": parts[3],
|
||||||
|
"day_of_week": parts[4],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def build_periodic_schedule() -> dict:
|
||||||
|
if not settings.INGESTION_SCHEDULE_ENABLED:
|
||||||
|
return {}
|
||||||
|
|
||||||
|
schedule_kwargs = _parse_cron_expression(settings.INGESTION_SCHEDULE_CRON)
|
||||||
|
return {
|
||||||
|
"ingestion.scheduled_provider_sync": {
|
||||||
|
"task": "apps.ingestion.tasks.scheduled_provider_sync",
|
||||||
|
"schedule": crontab(**schedule_kwargs),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
app.conf.beat_schedule = build_periodic_schedule()
|
||||||
|
|||||||
@ -124,6 +124,15 @@ CELERY_RESULT_SERIALIZER = "json"
|
|||||||
CELERY_TIMEZONE = TIME_ZONE
|
CELERY_TIMEZONE = TIME_ZONE
|
||||||
CELERY_TASK_TIME_LIMIT = int(os.getenv("CELERY_TASK_TIME_LIMIT", "1800"))
|
CELERY_TASK_TIME_LIMIT = int(os.getenv("CELERY_TASK_TIME_LIMIT", "1800"))
|
||||||
CELERY_TASK_SOFT_TIME_LIMIT = int(os.getenv("CELERY_TASK_SOFT_TIME_LIMIT", "1500"))
|
CELERY_TASK_SOFT_TIME_LIMIT = int(os.getenv("CELERY_TASK_SOFT_TIME_LIMIT", "1500"))
|
||||||
|
INGESTION_SCHEDULE_ENABLED = env_bool("INGESTION_SCHEDULE_ENABLED", False)
|
||||||
|
INGESTION_SCHEDULE_CRON = os.getenv("INGESTION_SCHEDULE_CRON", "*/30 * * * *").strip()
|
||||||
|
INGESTION_SCHEDULE_PROVIDER_NAMESPACE = os.getenv("INGESTION_SCHEDULE_PROVIDER_NAMESPACE", "").strip()
|
||||||
|
INGESTION_SCHEDULE_JOB_TYPE = os.getenv("INGESTION_SCHEDULE_JOB_TYPE", "incremental").strip().lower()
|
||||||
|
INGESTION_PREVENT_OVERLAP = env_bool("INGESTION_PREVENT_OVERLAP", True)
|
||||||
|
INGESTION_OVERLAP_WINDOW_MINUTES = int(os.getenv("INGESTION_OVERLAP_WINDOW_MINUTES", "180"))
|
||||||
|
|
||||||
|
if INGESTION_SCHEDULE_JOB_TYPE not in {"incremental", "full_sync"}:
|
||||||
|
raise ImproperlyConfigured("INGESTION_SCHEDULE_JOB_TYPE must be either 'incremental' or 'full_sync'.")
|
||||||
|
|
||||||
PROVIDER_BACKEND = os.getenv("PROVIDER_BACKEND", "demo").strip().lower()
|
PROVIDER_BACKEND = os.getenv("PROVIDER_BACKEND", "demo").strip().lower()
|
||||||
PROVIDER_NAMESPACE_DEMO = os.getenv("PROVIDER_NAMESPACE_DEMO", "mvp_demo")
|
PROVIDER_NAMESPACE_DEMO = os.getenv("PROVIDER_NAMESPACE_DEMO", "mvp_demo")
|
||||||
|
|||||||
@ -67,6 +67,10 @@ def test_incremental_sync_runs_successfully(settings):
|
|||||||
|
|
||||||
assert run.status == IngestionRun.RunStatus.SUCCESS
|
assert run.status == IngestionRun.RunStatus.SUCCESS
|
||||||
assert run.records_processed > 0
|
assert run.records_processed > 0
|
||||||
|
assert run.started_at is not None
|
||||||
|
assert run.finished_at is not None
|
||||||
|
assert run.finished_at >= run.started_at
|
||||||
|
assert run.error_summary == ""
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.django_db
|
@pytest.mark.django_db
|
||||||
@ -80,6 +84,9 @@ def test_run_sync_handles_rate_limit(settings):
|
|||||||
run = IngestionRun.objects.order_by("-id").first()
|
run = IngestionRun.objects.order_by("-id").first()
|
||||||
assert run is not None
|
assert run is not None
|
||||||
assert run.status == IngestionRun.RunStatus.FAILED
|
assert run.status == IngestionRun.RunStatus.FAILED
|
||||||
|
assert run.started_at is not None
|
||||||
|
assert run.finished_at is not None
|
||||||
|
assert "Rate limit" in run.error_summary
|
||||||
assert IngestionError.objects.filter(ingestion_run=run).exists()
|
assert IngestionError.objects.filter(ingestion_run=run).exists()
|
||||||
|
|
||||||
os.environ.pop("PROVIDER_MVP_FORCE_RATE_LIMIT", None)
|
os.environ.pop("PROVIDER_MVP_FORCE_RATE_LIMIT", None)
|
||||||
|
|||||||
69
tests/test_ingestion_tasks.py
Normal file
69
tests/test_ingestion_tasks.py
Normal file
@ -0,0 +1,69 @@
|
|||||||
|
import pytest
|
||||||
|
from celery.schedules import crontab
|
||||||
|
from django.utils import timezone
|
||||||
|
|
||||||
|
from apps.ingestion.models import IngestionRun
|
||||||
|
from apps.ingestion.tasks import scheduled_provider_sync, trigger_incremental_sync
|
||||||
|
from config.celery import app as celery_app, build_periodic_schedule
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.django_db
|
||||||
|
def test_periodic_task_registered():
|
||||||
|
assert "apps.ingestion.tasks.scheduled_provider_sync" in celery_app.tasks
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.django_db
|
||||||
|
def test_build_periodic_schedule_enabled(settings):
|
||||||
|
settings.INGESTION_SCHEDULE_ENABLED = True
|
||||||
|
settings.INGESTION_SCHEDULE_CRON = "15 * * * *"
|
||||||
|
|
||||||
|
schedule = build_periodic_schedule()
|
||||||
|
assert "ingestion.scheduled_provider_sync" in schedule
|
||||||
|
entry = schedule["ingestion.scheduled_provider_sync"]
|
||||||
|
assert entry["task"] == "apps.ingestion.tasks.scheduled_provider_sync"
|
||||||
|
assert isinstance(entry["schedule"], crontab)
|
||||||
|
assert entry["schedule"]._orig_minute == "15"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.django_db
|
||||||
|
def test_build_periodic_schedule_disabled(settings):
|
||||||
|
settings.INGESTION_SCHEDULE_ENABLED = False
|
||||||
|
assert build_periodic_schedule() == {}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.django_db
|
||||||
|
def test_trigger_incremental_sync_skips_overlapping_runs(settings):
|
||||||
|
settings.INGESTION_PREVENT_OVERLAP = True
|
||||||
|
settings.INGESTION_OVERLAP_WINDOW_MINUTES = 180
|
||||||
|
|
||||||
|
IngestionRun.objects.create(
|
||||||
|
provider_namespace="mvp_demo",
|
||||||
|
job_type=IngestionRun.JobType.INCREMENTAL,
|
||||||
|
status=IngestionRun.RunStatus.RUNNING,
|
||||||
|
started_at=timezone.now(),
|
||||||
|
)
|
||||||
|
|
||||||
|
run_id = trigger_incremental_sync.apply(
|
||||||
|
kwargs={"provider_namespace": "mvp_demo"},
|
||||||
|
).get()
|
||||||
|
skipped_run = IngestionRun.objects.get(id=run_id)
|
||||||
|
assert skipped_run.status == IngestionRun.RunStatus.CANCELED
|
||||||
|
assert "overlapping running job" in skipped_run.error_summary
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.django_db
|
||||||
|
def test_scheduled_provider_sync_uses_configured_job_type(settings, monkeypatch):
|
||||||
|
settings.INGESTION_SCHEDULE_JOB_TYPE = IngestionRun.JobType.FULL_SYNC
|
||||||
|
settings.INGESTION_SCHEDULE_PROVIDER_NAMESPACE = "mvp_demo"
|
||||||
|
captured = {}
|
||||||
|
|
||||||
|
def fake_runner(**kwargs):
|
||||||
|
captured.update(kwargs)
|
||||||
|
return 99
|
||||||
|
|
||||||
|
monkeypatch.setattr("apps.ingestion.tasks._run_sync_with_overlap_guard", fake_runner)
|
||||||
|
|
||||||
|
result = scheduled_provider_sync.apply().get()
|
||||||
|
assert result == 99
|
||||||
|
assert captured["provider_namespace"] == "mvp_demo"
|
||||||
|
assert captured["job_type"] == IngestionRun.JobType.FULL_SYNC
|
||||||
Reference in New Issue
Block a user