diff --git a/.env.example b/.env.example index 86ad4fc..ecb33a8 100644 --- a/.env.example +++ b/.env.example @@ -53,6 +53,8 @@ EXTRACTOR_BCL_STATS_URL= EXTRACTOR_BCL_SEASON_LABEL=2025-2026 EXTRACTOR_BCL_COMPETITION_EXTERNAL_ID=bcl EXTRACTOR_BCL_COMPETITION_NAME=Basketball Champions League +DAILY_ORCHESTRATION_EXTRACTORS=lba,bcl +DAILY_ORCHESTRATION_INTERVAL_SECONDS=86400 # Future optional scheduler loop settings (not enabled in base v2 runtime) SCHEDULER_ENABLED=0 diff --git a/Dockerfile b/Dockerfile index d28329c..9caa15e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -41,7 +41,7 @@ RUN groupadd --gid "${APP_GID}" "${APP_USER}" \ COPY --from=builder /opt/venv /opt/venv COPY . /app -RUN chmod +x /app/entrypoint.sh \ +RUN chmod +x /app/entrypoint.sh /app/scripts/scheduler.sh \ && mkdir -p /app/staticfiles /app/media /app/snapshots/incoming /app/snapshots/archive /app/snapshots/failed \ && chown -R "${APP_UID}:${APP_GID}" /app /opt/venv diff --git a/README.md b/README.md index 6235ebf..f97453b 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ Runtime services are intentionally small: - `web` (Django/Gunicorn) - `postgres` (primary DB) - `nginx` (reverse proxy + static/media serving) +- optional `scheduler` profile service (runs daily extractor/import loop) No Redis/Celery services are part of the v2 default runtime topology. Legacy Celery/provider code is still in repository history/codebase but de-emphasized for v2. 
@@ -60,6 +61,18 @@ docker compose -f docker-compose.yml -f docker-compose.dev.yml up --build docker compose -f docker-compose.yml -f docker-compose.release.yml up -d --build ``` +### Start scheduler profile (optional; requires `SCHEDULER_ENABLED=1`) + +```bash +docker compose --profile scheduler up -d scheduler +``` + +For development override: + +```bash +docker compose -f docker-compose.yml -f docker-compose.dev.yml --profile scheduler up -d scheduler +``` + ## Named Volumes v2 runtime uses named volumes for persistence: @@ -82,6 +95,7 @@ Core groups: - image tag vars (`APP_IMAGE_TAG`, `NGINX_IMAGE_TAG`) - snapshot directory vars (`STATIC_DATASET_*`) - optional future scheduler vars (`SCHEDULER_*`) +- daily orchestration vars (`DAILY_ORCHESTRATION_*`) ## Snapshot Storage Convention @@ -155,6 +169,12 @@ Run import: docker compose exec web python manage.py import_snapshots ``` +Run end-to-end daily orchestration manually (extractors -> import): + +```bash +docker compose exec web python manage.py run_daily_orchestration +``` + Command behavior: - scans `STATIC_DATASET_INCOMING_DIR` for `.json` files - validates strict schema @@ -217,6 +237,14 @@ Run only the BCL extractor: docker compose exec web python manage.py run_bcl_extractor ``` +### Daily orchestration behavior + +`run_daily_orchestration` performs: +1. run configured extractors in order from `DAILY_ORCHESTRATION_EXTRACTORS` +2. write snapshots to incoming dir +3. run `import_snapshots` +4. 
log extractor/import summary + Extractor environment variables: - `EXTRACTOR_USER_AGENT` - `EXTRACTOR_HTTP_TIMEOUT_SECONDS` @@ -234,11 +262,26 @@ Extractor environment variables: - `EXTRACTOR_BCL_SEASON_LABEL` - `EXTRACTOR_BCL_COMPETITION_EXTERNAL_ID` - `EXTRACTOR_BCL_COMPETITION_NAME` +- `DAILY_ORCHESTRATION_EXTRACTORS` +- `DAILY_ORCHESTRATION_INTERVAL_SECONDS` Notes: - extraction is intentionally low-frequency and uses retries conservatively - only public pages/endpoints should be targeted - emitted snapshots must match the same schema consumed by `import_snapshots` +- optional scheduler container runs `scripts/scheduler.sh` loop using: + - image: `registry.younerd.org/hoopscout/scheduler:${APP_IMAGE_TAG:-latest}` + - command: `/app/scripts/scheduler.sh` + - interval: `DAILY_ORCHESTRATION_INTERVAL_SECONDS` + +### Scheduler entrypoint/runtime expectations + +- scheduler uses the same app image and base `entrypoint.sh` as web +- scheduler requires database connectivity and snapshot volumes +- scheduler is disabled unless both of the following hold: + - compose `scheduler` profile is started + - `SCHEDULER_ENABLED=1` +- this keeps default runtime simple while supporting daily automation ### LBA extractor assumptions and limitations (MVP) diff --git a/apps/ingestion/management/commands/run_daily_orchestration.py b/apps/ingestion/management/commands/run_daily_orchestration.py new file mode 100644 index 0000000..85ed1eb --- /dev/null +++ b/apps/ingestion/management/commands/run_daily_orchestration.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +from django.core.management.base import BaseCommand, CommandError +from django.utils.dateparse import parse_date + +from apps.ingestion.services.daily_orchestration import run_daily_orchestration + + +class Command(BaseCommand): + help = "Run daily HoopScout v2 workflow: extract snapshots, then import snapshots." 
+ + def add_arguments(self, parser): + parser.add_argument( + "--snapshot-date", + dest="snapshot_date", + default=None, + help="Override snapshot date for all extractor outputs (YYYY-MM-DD).", + ) + + def handle(self, *args, **options): + snapshot_date = None + if options["snapshot_date"]: + snapshot_date = parse_date(options["snapshot_date"]) + if snapshot_date is None: + raise CommandError("--snapshot-date must be YYYY-MM-DD.") + + try: + result = run_daily_orchestration(snapshot_date=snapshot_date) + except Exception as exc: # noqa: BLE001 + raise CommandError(str(exc)) from exc + + extractor_summary = ", ".join( + f"{row.extractor_name}:{row.records_count}" for row in result.extractors_run + ) + self.stdout.write( + self.style.SUCCESS( + "Daily orchestration completed: " + f"extractors=[{extractor_summary}] " + f"import_run={result.import_run_id} " + f"import_status={result.import_status} " + f"files_processed={result.files_processed} " + f"rows_upserted={result.rows_upserted} " + f"rows_failed={result.rows_failed}" + ) + ) diff --git a/apps/ingestion/services/daily_orchestration.py b/apps/ingestion/services/daily_orchestration.py new file mode 100644 index 0000000..6c4d504 --- /dev/null +++ b/apps/ingestion/services/daily_orchestration.py @@ -0,0 +1,84 @@ +from __future__ import annotations + +import logging +from dataclasses import dataclass +from datetime import date +from pathlib import Path + +from django.conf import settings + +from apps.ingestion.extractors import create_extractor +from apps.ingestion.services.snapshot_import import SnapshotImporter + +logger = logging.getLogger(__name__) + + +@dataclass +class ExtractorRunSummary: + extractor_name: str + records_count: int + output_path: Path | None + + +@dataclass +class DailyOrchestrationResult: + extractors_run: list[ExtractorRunSummary] + import_run_id: int + import_status: str + files_processed: int + rows_upserted: int + rows_failed: int + + +def parse_enabled_extractors(raw_value: str) -> 
list[str]: + return [item.strip() for item in raw_value.split(",") if item.strip()] + + +def run_daily_orchestration(*, snapshot_date: date | None = None) -> DailyOrchestrationResult: + extractor_names = parse_enabled_extractors(settings.DAILY_ORCHESTRATION_EXTRACTORS) + if not extractor_names: + raise ValueError("DAILY_ORCHESTRATION_EXTRACTORS cannot be empty.") + + summaries: list[ExtractorRunSummary] = [] + for extractor_name in extractor_names: + logger.info("daily_orchestration_extractor_start extractor=%s", extractor_name) + extractor = create_extractor(extractor_name) + result = extractor.run(snapshot_date=snapshot_date) + summaries.append( + ExtractorRunSummary( + extractor_name=extractor_name, + records_count=result.records_count, + output_path=result.output_path, + ) + ) + logger.info( + "daily_orchestration_extractor_done extractor=%s records=%s output=%s", + extractor_name, + result.records_count, + result.output_path, + ) + + importer = SnapshotImporter( + incoming_dir=settings.STATIC_DATASET_INCOMING_DIR, + archive_dir=settings.STATIC_DATASET_ARCHIVE_DIR, + failed_dir=settings.STATIC_DATASET_FAILED_DIR, + ) + import_run = importer.run() + logger.info( + "daily_orchestration_import_done run_id=%s status=%s files=%s/%s upserted=%s failed=%s", + import_run.id, + import_run.status, + import_run.files_processed, + import_run.files_total, + import_run.rows_upserted, + import_run.rows_failed, + ) + + return DailyOrchestrationResult( + extractors_run=summaries, + import_run_id=import_run.id, + import_status=import_run.status, + files_processed=import_run.files_processed, + rows_upserted=import_run.rows_upserted, + rows_failed=import_run.rows_failed, + ) diff --git a/config/settings/base.py b/config/settings/base.py index 94b15d4..c11befe 100644 --- a/config/settings/base.py +++ b/config/settings/base.py @@ -174,6 +174,10 @@ EXTRACTOR_BCL_SEASON_LABEL = os.getenv("EXTRACTOR_BCL_SEASON_LABEL", "").strip() EXTRACTOR_BCL_COMPETITION_EXTERNAL_ID = 
os.getenv("EXTRACTOR_BCL_COMPETITION_EXTERNAL_ID", "bcl").strip() EXTRACTOR_BCL_COMPETITION_NAME = os.getenv("EXTRACTOR_BCL_COMPETITION_NAME", "Basketball Champions League").strip() +# Simple daily orchestration settings (extract -> import). +DAILY_ORCHESTRATION_EXTRACTORS = os.getenv("DAILY_ORCHESTRATION_EXTRACTORS", "lba,bcl") +DAILY_ORCHESTRATION_INTERVAL_SECONDS = int(os.getenv("DAILY_ORCHESTRATION_INTERVAL_SECONDS", "86400")) + if EXTRACTOR_HTTP_TIMEOUT_SECONDS <= 0: raise ImproperlyConfigured("EXTRACTOR_HTTP_TIMEOUT_SECONDS must be > 0.") if EXTRACTOR_HTTP_RETRIES < 0: @@ -182,6 +186,8 @@ if EXTRACTOR_RETRY_SLEEP_SECONDS < 0: raise ImproperlyConfigured("EXTRACTOR_RETRY_SLEEP_SECONDS must be >= 0.") if EXTRACTOR_REQUEST_DELAY_SECONDS < 0: raise ImproperlyConfigured("EXTRACTOR_REQUEST_DELAY_SECONDS must be >= 0.") +if DAILY_ORCHESTRATION_INTERVAL_SECONDS < 60: + raise ImproperlyConfigured("DAILY_ORCHESTRATION_INTERVAL_SECONDS must be >= 60.") # Optional scheduler command settings for future v2 snapshot jobs. SCHEDULER_ENABLED = env_bool("SCHEDULER_ENABLED", False) diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml index 9323720..fc19dec 100644 --- a/docker-compose.dev.yml +++ b/docker-compose.dev.yml @@ -14,6 +14,14 @@ services: - static_data_dev:/var/www/static:ro - media_data_dev:/var/www/media:ro + scheduler: + user: "${LOCAL_UID:-1000}:${LOCAL_GID:-1000}" + volumes: + - .:/app + - snapshots_incoming_dev:/app/snapshots/incoming + - snapshots_archive_dev:/app/snapshots/archive + - snapshots_failed_dev:/app/snapshots/failed + volumes: static_data_dev: media_data_dev: diff --git a/docker-compose.yml b/docker-compose.yml index d45d540..8ef73bf 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -67,6 +67,31 @@ services: retries: 5 restart: unless-stopped + scheduler: + profiles: ["scheduler"] + image: registry.younerd.org/hoopscout/scheduler:${APP_IMAGE_TAG:-latest} + build: + context: . 
+ dockerfile: Dockerfile + env_file: + - .env + command: /app/scripts/scheduler.sh + depends_on: + postgres: + condition: service_healthy + user: "10001:10001" + volumes: + - snapshots_incoming:/app/snapshots/incoming + - snapshots_archive:/app/snapshots/archive + - snapshots_failed:/app/snapshots/failed + healthcheck: + test: ["CMD-SHELL", "grep -qa 'scheduler.sh' /proc/1/cmdline || exit 1"] + interval: 30s + timeout: 5s + retries: 3 + start_period: 20s + restart: unless-stopped + volumes: postgres_data: static_data: diff --git a/scripts/scheduler.sh b/scripts/scheduler.sh new file mode 100644 index 0000000..326faea --- /dev/null +++ b/scripts/scheduler.sh @@ -0,0 +1,27 @@ +#!/bin/sh +set -e + +if [ "${SCHEDULER_ENABLED:-0}" != "1" ]; then + echo "Scheduler disabled (SCHEDULER_ENABLED=${SCHEDULER_ENABLED:-0}). Exiting." + exit 0 +fi + +INTERVAL="${DAILY_ORCHESTRATION_INTERVAL_SECONDS:-${SCHEDULER_INTERVAL_SECONDS:-86400}}" +if [ "${INTERVAL}" -lt 60 ]; then + echo "DAILY_ORCHESTRATION_INTERVAL_SECONDS/SCHEDULER_INTERVAL_SECONDS must be >= 60" + exit 1 +fi + +echo "Starting HoopScout scheduler loop interval=${INTERVAL}s" + +while true; do + echo "[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] Running daily orchestration..." + if python manage.py run_daily_orchestration; then + echo "[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] Daily orchestration completed successfully." + else + echo "[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] Daily orchestration failed." + fi + + echo "[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] Sleeping for ${INTERVAL}s." 
+ sleep "${INTERVAL}" +done diff --git a/tests/test_daily_orchestration.py b/tests/test_daily_orchestration.py new file mode 100644 index 0000000..01c8122 --- /dev/null +++ b/tests/test_daily_orchestration.py @@ -0,0 +1,95 @@ +from __future__ import annotations + +from dataclasses import dataclass +from datetime import date + +import pytest +from django.core.management import call_command + +from apps.ingestion.services.daily_orchestration import parse_enabled_extractors, run_daily_orchestration + + +@dataclass +class _FakeExtractorResult: + records_count: int + output_path: str + + +class _FakeExtractor: + def __init__(self, name: str): + self.name = name + + def run(self, *, snapshot_date=None): + if snapshot_date: + return _FakeExtractorResult(records_count=3, output_path=f"/tmp/{self.name}-{snapshot_date}.json") + return _FakeExtractorResult(records_count=3, output_path=f"/tmp/{self.name}.json") + + +@dataclass +class _FakeImportRun: + id: int = 11 + status: str = "success" + files_processed: int = 2 + files_total: int = 2 + rows_upserted: int = 20 + rows_failed: int = 0 + + +class _FakeImporter: + def __init__(self, **_kwargs): + pass + + def run(self): + return _FakeImportRun() + + +def test_parse_enabled_extractors(): + assert parse_enabled_extractors("lba,bcl") == ["lba", "bcl"] + assert parse_enabled_extractors(" lba , , bcl ") == ["lba", "bcl"] + assert parse_enabled_extractors("") == [] + + +@pytest.mark.django_db +def test_daily_orchestration_runs_extractors_then_import(settings, monkeypatch): + settings.DAILY_ORCHESTRATION_EXTRACTORS = "lba,bcl" + + monkeypatch.setattr( + "apps.ingestion.services.daily_orchestration.create_extractor", + lambda name: _FakeExtractor(name), + ) + monkeypatch.setattr( + "apps.ingestion.services.daily_orchestration.SnapshotImporter", + _FakeImporter, + ) + + result = run_daily_orchestration(snapshot_date=date(2026, 3, 13)) + assert [row.extractor_name for row in result.extractors_run] == ["lba", "bcl"] + assert 
result.import_run_id == 11 + assert result.import_status == "success" + assert result.rows_upserted == 20 + + +@pytest.mark.django_db +def test_daily_orchestration_raises_when_no_extractors_configured(settings): + settings.DAILY_ORCHESTRATION_EXTRACTORS = "" + with pytest.raises(ValueError, match="cannot be empty"): + run_daily_orchestration() + + +@pytest.mark.django_db +def test_run_daily_orchestration_command(settings, monkeypatch, capsys): + settings.DAILY_ORCHESTRATION_EXTRACTORS = "lba,bcl" + + monkeypatch.setattr( + "apps.ingestion.services.daily_orchestration.create_extractor", + lambda name: _FakeExtractor(name), + ) + monkeypatch.setattr( + "apps.ingestion.services.daily_orchestration.SnapshotImporter", + _FakeImporter, + ) + + call_command("run_daily_orchestration", "--snapshot-date", "2026-03-13") + captured = capsys.readouterr() + assert "Daily orchestration completed" in captured.out + assert "extractors=[lba:3, bcl:3]" in captured.out