feat(v2): add simple daily extraction-import orchestration
This commit is contained in:
@ -53,6 +53,8 @@ EXTRACTOR_BCL_STATS_URL=
|
||||
EXTRACTOR_BCL_SEASON_LABEL=2025-2026
|
||||
EXTRACTOR_BCL_COMPETITION_EXTERNAL_ID=bcl
|
||||
EXTRACTOR_BCL_COMPETITION_NAME=Basketball Champions League
|
||||
DAILY_ORCHESTRATION_EXTRACTORS=lba,bcl
|
||||
DAILY_ORCHESTRATION_INTERVAL_SECONDS=86400
|
||||
|
||||
# Future optional scheduler loop settings (not enabled in base v2 runtime)
|
||||
SCHEDULER_ENABLED=0
|
||||
|
||||
@ -41,7 +41,7 @@ RUN groupadd --gid "${APP_GID}" "${APP_USER}" \
|
||||
COPY --from=builder /opt/venv /opt/venv
|
||||
COPY . /app
|
||||
|
||||
RUN chmod +x /app/entrypoint.sh \
|
||||
RUN chmod +x /app/entrypoint.sh /app/scripts/scheduler.sh \
|
||||
&& mkdir -p /app/staticfiles /app/media /app/snapshots/incoming /app/snapshots/archive /app/snapshots/failed \
|
||||
&& chown -R "${APP_UID}:${APP_GID}" /app /opt/venv
|
||||
|
||||
|
||||
43
README.md
43
README.md
@ -19,6 +19,7 @@ Runtime services are intentionally small:
|
||||
- `web` (Django/Gunicorn)
|
||||
- `postgres` (primary DB)
|
||||
- `nginx` (reverse proxy + static/media serving)
|
||||
- optional `scheduler` profile service (runs daily extractor/import loop)
|
||||
|
||||
No Redis/Celery services are part of the v2 default runtime topology.
|
||||
Legacy Celery/provider code is still in repository history/codebase but de-emphasized for v2.
|
||||
@ -60,6 +61,18 @@ docker compose -f docker-compose.yml -f docker-compose.dev.yml up --build
|
||||
docker compose -f docker-compose.yml -f docker-compose.release.yml up -d --build
|
||||
```
|
||||
|
||||
### Start scheduler profile (optional)
|
||||
|
||||
```bash
|
||||
docker compose --profile scheduler up -d scheduler
|
||||
```
|
||||
|
||||
For development override:
|
||||
|
||||
```bash
|
||||
docker compose -f docker-compose.yml -f docker-compose.dev.yml --profile scheduler up -d scheduler
|
||||
```
|
||||
|
||||
## Named Volumes
|
||||
|
||||
v2 runtime uses named volumes for persistence:
|
||||
@ -82,6 +95,7 @@ Core groups:
|
||||
- image tag vars (`APP_IMAGE_TAG`, `NGINX_IMAGE_TAG`)
|
||||
- snapshot directory vars (`STATIC_DATASET_*`)
|
||||
- optional future scheduler vars (`SCHEDULER_*`)
|
||||
- daily orchestration vars (`DAILY_ORCHESTRATION_*`)
|
||||
|
||||
## Snapshot Storage Convention
|
||||
|
||||
@ -155,6 +169,12 @@ Run import:
|
||||
docker compose exec web python manage.py import_snapshots
|
||||
```
|
||||
|
||||
Run end-to-end daily orchestration manually (extractors -> import):
|
||||
|
||||
```bash
|
||||
docker compose exec web python manage.py run_daily_orchestration
|
||||
```
|
||||
|
||||
Command behavior:
|
||||
- scans `STATIC_DATASET_INCOMING_DIR` for `.json` files
|
||||
- validates strict schema
|
||||
@ -217,6 +237,14 @@ Run only the BCL extractor:
|
||||
docker compose exec web python manage.py run_bcl_extractor
|
||||
```
|
||||
|
||||
### Daily orchestration behavior
|
||||
|
||||
`run_daily_orchestration` performs:
|
||||
1. run configured extractors in order from `DAILY_ORCHESTRATION_EXTRACTORS`
|
||||
2. write snapshots to incoming dir
|
||||
3. run `import_snapshots`
|
||||
4. log extractor/import summary
|
||||
|
||||
Extractor environment variables:
|
||||
- `EXTRACTOR_USER_AGENT`
|
||||
- `EXTRACTOR_HTTP_TIMEOUT_SECONDS`
|
||||
@ -234,11 +262,26 @@ Extractor environment variables:
|
||||
- `EXTRACTOR_BCL_SEASON_LABEL`
|
||||
- `EXTRACTOR_BCL_COMPETITION_EXTERNAL_ID`
|
||||
- `EXTRACTOR_BCL_COMPETITION_NAME`
|
||||
- `DAILY_ORCHESTRATION_EXTRACTORS`
|
||||
- `DAILY_ORCHESTRATION_INTERVAL_SECONDS`
|
||||
|
||||
Notes:
|
||||
- extraction is intentionally low-frequency and uses retries conservatively
|
||||
- only public pages/endpoints should be targeted
|
||||
- emitted snapshots must match the same schema consumed by `import_snapshots`
|
||||
- optional scheduler container runs `scripts/scheduler.sh` loop using:
|
||||
- image: `registry.younerd.org/hoopscout/scheduler:${APP_IMAGE_TAG:-latest}`
|
||||
- command: `/app/scripts/scheduler.sh`
|
||||
- interval: `DAILY_ORCHESTRATION_INTERVAL_SECONDS`
|
||||
|
||||
### Scheduler entrypoint/runtime expectations
|
||||
|
||||
- scheduler uses the same app image and base `entrypoint.sh` as web
|
||||
- scheduler requires database connectivity and snapshot volumes
|
||||
- scheduler is disabled unless:
|
||||
- compose `scheduler` profile is started
|
||||
- `SCHEDULER_ENABLED=1`
|
||||
- this keeps default runtime simple while supporting daily automation
|
||||
|
||||
### LBA extractor assumptions and limitations (MVP)
|
||||
|
||||
|
||||
@ -0,0 +1,45 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from django.core.management.base import BaseCommand, CommandError
|
||||
from django.utils.dateparse import parse_date
|
||||
|
||||
from apps.ingestion.services.daily_orchestration import run_daily_orchestration
|
||||
|
||||
|
||||
class Command(BaseCommand):
    """Management command wrapping the daily extract-then-import workflow.

    Delegates to ``run_daily_orchestration`` and reports a one-line summary
    on success, or raises ``CommandError`` on any failure.
    """

    help = "Run daily HoopScout v2 workflow: extract snapshots, then import snapshots."

    def add_arguments(self, parser):
        parser.add_argument(
            "--snapshot-date",
            dest="snapshot_date",
            default=None,
            help="Override snapshot date for all extractor outputs (YYYY-MM-DD).",
        )

    def handle(self, *args, **options):
        snapshot_date = None
        if options["snapshot_date"]:
            # parse_date() returns None for malformed input, but raises
            # ValueError for well-formed yet invalid dates (e.g. 2026-02-30).
            # Normalize both failure modes into the same CommandError instead
            # of letting the ValueError escape as a traceback.
            try:
                snapshot_date = parse_date(options["snapshot_date"])
            except ValueError as exc:
                raise CommandError("--snapshot-date must be YYYY-MM-DD.") from exc
            if snapshot_date is None:
                raise CommandError("--snapshot-date must be YYYY-MM-DD.")

        try:
            result = run_daily_orchestration(snapshot_date=snapshot_date)
        except Exception as exc:  # noqa: BLE001
            # Surface any orchestration failure as a clean CLI error.
            raise CommandError(str(exc)) from exc

        # e.g. "lba:120, bcl:96" — one entry per extractor, in run order.
        extractor_summary = ", ".join(
            f"{row.extractor_name}:{row.records_count}" for row in result.extractors_run
        )
        self.stdout.write(
            self.style.SUCCESS(
                "Daily orchestration completed: "
                f"extractors=[{extractor_summary}] "
                f"import_run={result.import_run_id} "
                f"import_status={result.import_status} "
                f"files_processed={result.files_processed} "
                f"rows_upserted={result.rows_upserted} "
                f"rows_failed={result.rows_failed}"
            )
        )
|
||||
84
apps/ingestion/services/daily_orchestration.py
Normal file
84
apps/ingestion/services/daily_orchestration.py
Normal file
@ -0,0 +1,84 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from datetime import date
|
||||
from pathlib import Path
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
from apps.ingestion.extractors import create_extractor
|
||||
from apps.ingestion.services.snapshot_import import SnapshotImporter
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class ExtractorRunSummary:
    """Summary of a single extractor run within a daily orchestration pass."""

    # Extractor name as listed in DAILY_ORCHESTRATION_EXTRACTORS (e.g. "lba").
    extractor_name: str
    # Record count reported by the extractor's run() result.
    records_count: int
    # Output path reported by the extractor's run() result (may be None per
    # the annotation).
    output_path: Path | None
|
||||
|
||||
|
||||
@dataclass
class DailyOrchestrationResult:
    """Combined outcome of one extract-then-import orchestration run."""

    # One summary per extractor, in the order they were executed.
    extractors_run: list[ExtractorRunSummary]
    # The fields below mirror the importer's run record.
    import_run_id: int
    import_status: str
    files_processed: int
    rows_upserted: int
    rows_failed: int
|
||||
|
||||
|
||||
def parse_enabled_extractors(raw_value: str) -> list[str]:
    """Split a comma-separated extractor list into clean names.

    Whitespace around each entry is stripped and empty entries are dropped,
    so ``" lba , , bcl "`` yields ``["lba", "bcl"]``.
    """
    names: list[str] = []
    for chunk in raw_value.split(","):
        name = chunk.strip()
        if name:
            names.append(name)
    return names
|
||||
|
||||
|
||||
def run_daily_orchestration(*, snapshot_date: date | None = None) -> DailyOrchestrationResult:
    """Execute the daily extract-then-import workflow.

    Runs every extractor named in ``settings.DAILY_ORCHESTRATION_EXTRACTORS``
    in order, then performs one snapshot import pass, and returns a combined
    summary of both phases.

    Raises:
        ValueError: if no extractors are configured.
    """
    enabled = parse_enabled_extractors(settings.DAILY_ORCHESTRATION_EXTRACTORS)
    if not enabled:
        raise ValueError("DAILY_ORCHESTRATION_EXTRACTORS cannot be empty.")

    # Phase 1: run each configured extractor and collect its summary.
    summaries: list[ExtractorRunSummary] = []
    for name in enabled:
        logger.info("daily_orchestration_extractor_start extractor=%s", name)
        run_result = create_extractor(name).run(snapshot_date=snapshot_date)
        summaries.append(
            ExtractorRunSummary(
                extractor_name=name,
                records_count=run_result.records_count,
                output_path=run_result.output_path,
            )
        )
        logger.info(
            "daily_orchestration_extractor_done extractor=%s records=%s output=%s",
            name,
            run_result.records_count,
            run_result.output_path,
        )

    # Phase 2: one import pass over the configured snapshot directories.
    importer = SnapshotImporter(
        incoming_dir=settings.STATIC_DATASET_INCOMING_DIR,
        archive_dir=settings.STATIC_DATASET_ARCHIVE_DIR,
        failed_dir=settings.STATIC_DATASET_FAILED_DIR,
    )
    import_run = importer.run()
    logger.info(
        "daily_orchestration_import_done run_id=%s status=%s files=%s/%s upserted=%s failed=%s",
        import_run.id,
        import_run.status,
        import_run.files_processed,
        import_run.files_total,
        import_run.rows_upserted,
        import_run.rows_failed,
    )

    return DailyOrchestrationResult(
        extractors_run=summaries,
        import_run_id=import_run.id,
        import_status=import_run.status,
        files_processed=import_run.files_processed,
        rows_upserted=import_run.rows_upserted,
        rows_failed=import_run.rows_failed,
    )
|
||||
@ -174,6 +174,10 @@ EXTRACTOR_BCL_SEASON_LABEL = os.getenv("EXTRACTOR_BCL_SEASON_LABEL", "").strip()
|
||||
EXTRACTOR_BCL_COMPETITION_EXTERNAL_ID = os.getenv("EXTRACTOR_BCL_COMPETITION_EXTERNAL_ID", "bcl").strip()
|
||||
EXTRACTOR_BCL_COMPETITION_NAME = os.getenv("EXTRACTOR_BCL_COMPETITION_NAME", "Basketball Champions League").strip()
|
||||
|
||||
# Simple daily orchestration settings (extract -> import).
|
||||
DAILY_ORCHESTRATION_EXTRACTORS = os.getenv("DAILY_ORCHESTRATION_EXTRACTORS", "lba,bcl")
|
||||
DAILY_ORCHESTRATION_INTERVAL_SECONDS = int(os.getenv("DAILY_ORCHESTRATION_INTERVAL_SECONDS", "86400"))
|
||||
|
||||
if EXTRACTOR_HTTP_TIMEOUT_SECONDS <= 0:
|
||||
raise ImproperlyConfigured("EXTRACTOR_HTTP_TIMEOUT_SECONDS must be > 0.")
|
||||
if EXTRACTOR_HTTP_RETRIES < 0:
|
||||
@ -182,6 +186,8 @@ if EXTRACTOR_RETRY_SLEEP_SECONDS < 0:
|
||||
raise ImproperlyConfigured("EXTRACTOR_RETRY_SLEEP_SECONDS must be >= 0.")
|
||||
if EXTRACTOR_REQUEST_DELAY_SECONDS < 0:
|
||||
raise ImproperlyConfigured("EXTRACTOR_REQUEST_DELAY_SECONDS must be >= 0.")
|
||||
if DAILY_ORCHESTRATION_INTERVAL_SECONDS < 60:
|
||||
raise ImproperlyConfigured("DAILY_ORCHESTRATION_INTERVAL_SECONDS must be >= 60.")
|
||||
|
||||
# Optional scheduler command settings for future v2 snapshot jobs.
|
||||
SCHEDULER_ENABLED = env_bool("SCHEDULER_ENABLED", False)
|
||||
|
||||
@ -14,6 +14,14 @@ services:
|
||||
- static_data_dev:/var/www/static:ro
|
||||
- media_data_dev:/var/www/media:ro
|
||||
|
||||
scheduler:
|
||||
user: "${LOCAL_UID:-1000}:${LOCAL_GID:-1000}"
|
||||
volumes:
|
||||
- .:/app
|
||||
- snapshots_incoming_dev:/app/snapshots/incoming
|
||||
- snapshots_archive_dev:/app/snapshots/archive
|
||||
- snapshots_failed_dev:/app/snapshots/failed
|
||||
|
||||
volumes:
|
||||
static_data_dev:
|
||||
media_data_dev:
|
||||
|
||||
@ -67,6 +67,31 @@ services:
|
||||
retries: 5
|
||||
restart: unless-stopped
|
||||
|
||||
scheduler:
|
||||
profiles: ["scheduler"]
|
||||
image: registry.younerd.org/hoopscout/scheduler:${APP_IMAGE_TAG:-latest}
|
||||
build:
|
||||
context: .
|
||||
dockerfile: Dockerfile
|
||||
env_file:
|
||||
- .env
|
||||
command: /app/scripts/scheduler.sh
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
user: "10001:10001"
|
||||
volumes:
|
||||
- snapshots_incoming:/app/snapshots/incoming
|
||||
- snapshots_archive:/app/snapshots/archive
|
||||
- snapshots_failed:/app/snapshots/failed
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "grep -qa 'scheduler.sh' /proc/1/cmdline || exit 1"]
|
||||
interval: 30s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
start_period: 20s
|
||||
restart: unless-stopped
|
||||
|
||||
volumes:
|
||||
postgres_data:
|
||||
static_data:
|
||||
|
||||
27
scripts/scheduler.sh
Normal file
27
scripts/scheduler.sh
Normal file
@ -0,0 +1,27 @@
|
||||
#!/bin/sh
# Scheduler loop: periodically runs the Django daily orchestration command.
# Intended as the long-running entrypoint of the optional scheduler container.
set -e

# Opt-in guard: exit cleanly (status 0) unless SCHEDULER_ENABLED=1.
if [ "${SCHEDULER_ENABLED:-0}" != "1" ]; then
    echo "Scheduler disabled (SCHEDULER_ENABLED=${SCHEDULER_ENABLED:-0}). Exiting."
    exit 0
fi

# Sleep interval between runs; DAILY_ORCHESTRATION_INTERVAL_SECONDS takes
# precedence, then SCHEDULER_INTERVAL_SECONDS, defaulting to 86400 (one day).
INTERVAL="${DAILY_ORCHESTRATION_INTERVAL_SECONDS:-${SCHEDULER_INTERVAL_SECONDS:-86400}}"
if [ "${INTERVAL}" -lt 60 ]; then
    echo "DAILY_ORCHESTRATION_INTERVAL_SECONDS/SCHEDULER_INTERVAL_SECONDS must be >= 60"
    exit 1
fi

echo "Starting HoopScout scheduler loop interval=${INTERVAL}s"

while true; do
    echo "[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] Running daily orchestration..."
    # The if/else keeps a failing run from tripping `set -e` and killing the loop.
    if python manage.py run_daily_orchestration; then
        echo "[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] Daily orchestration completed successfully."
    else
        echo "[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] Daily orchestration failed."
    fi

    echo "[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] Sleeping for ${INTERVAL}s."
    sleep "${INTERVAL}"
done
|
||||
95
tests/test_daily_orchestration.py
Normal file
95
tests/test_daily_orchestration.py
Normal file
@ -0,0 +1,95 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from datetime import date
|
||||
|
||||
import pytest
|
||||
from django.core.management import call_command
|
||||
|
||||
from apps.ingestion.services.daily_orchestration import parse_enabled_extractors, run_daily_orchestration
|
||||
|
||||
|
||||
@dataclass
class _FakeExtractorResult:
    """Minimal stand-in for a real extractor result in tests."""

    # Number of records the fake extractor claims to have produced.
    records_count: int
    # Fake snapshot path; a plain str here (the service annotates Path | None).
    output_path: str
|
||||
|
||||
|
||||
class _FakeExtractor:
    """Test double standing in for a real extractor."""

    def __init__(self, name: str):
        self.name = name

    def run(self, *, snapshot_date=None):
        # Encode the snapshot date into the fake output path when one is
        # supplied, so tests can observe that the override was threaded through.
        suffix = f"-{snapshot_date}" if snapshot_date else ""
        return _FakeExtractorResult(
            records_count=3,
            output_path=f"/tmp/{self.name}{suffix}.json",
        )
|
||||
|
||||
|
||||
@dataclass
class _FakeImportRun:
    """Canned import-run record with fixed stats for assertions."""

    id: int = 11
    status: str = "success"
    files_processed: int = 2
    files_total: int = 2
    rows_upserted: int = 20
    rows_failed: int = 0
|
||||
|
||||
|
||||
class _FakeImporter:
    """Importer stub: accepts the real constructor kwargs, performs no I/O."""

    def __init__(self, **_kwargs):
        # Swallow the directory kwargs the service passes to SnapshotImporter.
        pass

    def run(self):
        # Always report the same successful run.
        return _FakeImportRun()
|
||||
|
||||
|
||||
def test_parse_enabled_extractors():
    """Comma parsing strips whitespace and drops empty entries."""
    cases = [
        ("lba,bcl", ["lba", "bcl"]),
        (" lba , , bcl ", ["lba", "bcl"]),
        ("", []),
    ]
    for raw, expected in cases:
        assert parse_enabled_extractors(raw) == expected
|
||||
|
||||
|
||||
@pytest.mark.django_db
def test_daily_orchestration_runs_extractors_then_import(settings, monkeypatch):
    """Extractors run in configured order and the import summary is surfaced."""
    settings.DAILY_ORCHESTRATION_EXTRACTORS = "lba,bcl"

    # Substitute both collaborators so no network or filesystem is touched.
    monkeypatch.setattr(
        "apps.ingestion.services.daily_orchestration.create_extractor",
        lambda name: _FakeExtractor(name),
    )
    monkeypatch.setattr(
        "apps.ingestion.services.daily_orchestration.SnapshotImporter",
        _FakeImporter,
    )

    outcome = run_daily_orchestration(snapshot_date=date(2026, 3, 13))

    ran = [summary.extractor_name for summary in outcome.extractors_run]
    assert ran == ["lba", "bcl"]
    assert outcome.import_run_id == 11
    assert outcome.import_status == "success"
    assert outcome.rows_upserted == 20
|
||||
|
||||
|
||||
@pytest.mark.django_db
def test_daily_orchestration_raises_when_no_extractors_configured(settings):
    """An empty extractor list is a configuration error, not a silent no-op."""
    settings.DAILY_ORCHESTRATION_EXTRACTORS = ""

    with pytest.raises(ValueError, match="cannot be empty"):
        run_daily_orchestration()
|
||||
|
||||
|
||||
@pytest.mark.django_db
def test_run_daily_orchestration_command(settings, monkeypatch, capsys):
    """The management command threads CLI args through and prints a summary."""
    settings.DAILY_ORCHESTRATION_EXTRACTORS = "lba,bcl"

    # Patch at the service module so the command's call path hits the fakes.
    monkeypatch.setattr(
        "apps.ingestion.services.daily_orchestration.create_extractor",
        lambda name: _FakeExtractor(name),
    )
    monkeypatch.setattr(
        "apps.ingestion.services.daily_orchestration.SnapshotImporter",
        _FakeImporter,
    )

    call_command("run_daily_orchestration", "--snapshot-date", "2026-03-13")

    output = capsys.readouterr().out
    assert "Daily orchestration completed" in output
    assert "extractors=[lba:3, bcl:3]" in output
|
||||
Reference in New Issue
Block a user