diff --git a/.env.example b/.env.example index b3a1dbb..d9a4cec 100644 --- a/.env.example +++ b/.env.example @@ -31,10 +31,10 @@ POSTGRES_PORT=5432 LOCAL_UID=1000 LOCAL_GID=1000 -# Snapshot storage (volume-backed directories) -SNAPSHOT_INCOMING_DIR=/app/snapshots/incoming -SNAPSHOT_ARCHIVE_DIR=/app/snapshots/archive -SNAPSHOT_FAILED_DIR=/app/snapshots/failed +# Static dataset storage (volume-backed directories) +STATIC_DATASET_INCOMING_DIR=/app/snapshots/incoming +STATIC_DATASET_ARCHIVE_DIR=/app/snapshots/archive +STATIC_DATASET_FAILED_DIR=/app/snapshots/failed # Future optional scheduler loop settings (not enabled in base v2 runtime) SCHEDULER_ENABLED=0 diff --git a/README.md b/README.md index 188f668..27a83f3 100644 --- a/README.md +++ b/README.md @@ -8,10 +8,9 @@ Current v2 foundation scope in this branch: - nginx reverse proxy - management-command-driven runtime operations - static snapshot directories persisted via Docker named volumes +- strict JSON snapshot schema + import management command Out of scope in this step: -- domain model redesign -- snapshot importer implementation - extractor implementation ## Runtime Architecture (v2) @@ -81,7 +80,7 @@ Core groups: - Django runtime/security vars - PostgreSQL connection vars - image tag vars (`APP_IMAGE_TAG`, `NGINX_IMAGE_TAG`) -- snapshot directory vars (`SNAPSHOT_*`) +- snapshot directory vars (`STATIC_DATASET_*`) - optional future scheduler vars (`SCHEDULER_*`) ## Snapshot Storage Convention @@ -91,7 +90,84 @@ Snapshot files are expected under: - archive: `/app/snapshots/archive` - failed: `/app/snapshots/failed` -In this foundation step, directories are created and persisted but no importer/extractor is implemented yet. 
+Configured via environment: +- `STATIC_DATASET_INCOMING_DIR` +- `STATIC_DATASET_ARCHIVE_DIR` +- `STATIC_DATASET_FAILED_DIR` + +## Snapshot JSON Schema (MVP) + +Each file must be a JSON object: + +```json +{ + "source_name": "official_site_feed", + "snapshot_date": "2026-03-13", + "records": [ + { + "competition_external_id": "comp-nba", + "competition_name": "NBA", + "season": "2025-2026", + "team_external_id": "team-lal", + "team_name": "Los Angeles Lakers", + "player_external_id": "player-23", + "full_name": "LeBron James", + "first_name": "LeBron", + "last_name": "James", + "birth_date": "1984-12-30", + "nationality": "US", + "height_cm": 206, + "weight_kg": 113, + "position": "SF", + "role": "Primary Creator", + "games_played": 60, + "minutes_per_game": 34.5, + "points_per_game": 25.4, + "rebounds_per_game": 7.2, + "assists_per_game": 8.1, + "steals_per_game": 1.3, + "blocks_per_game": 0.7, + "turnovers_per_game": 3.2, + "fg_pct": 51.1, + "three_pt_pct": 38.4, + "ft_pct": 79.8, + "source_metadata": {}, + "raw_payload": {} + } + ], + "source_metadata": {}, + "raw_payload": {} +} +``` + +Validation is strict: +- unknown fields are rejected +- required fields must exist (`role`, `source_metadata`, and `raw_payload` are optional) +- `snapshot_date` and `birth_date` must be `YYYY-MM-DD` +- per-game and percentage fields must be numeric; `height_cm`, `weight_kg`, and `games_played` must be non-negative integers +- invalid files are moved to failed directory + +## Import Command + +Run import: + +```bash +docker compose exec web python manage.py import_snapshots +``` + +Command behavior: +- scans the top level of `STATIC_DATASET_INCOMING_DIR` (non-recursive) for `.json` files +- validates strict schema +- computes SHA-256 checksum +- creates `ImportRun` + `ImportFile` records +- upserts relational entities (`Competition`, `Season`, `Team`, `Player`, `PlayerSeason`, `PlayerSeasonStats`) +- skips files whose SHA-256 checksum matches a previously successful import (skipped files are still moved to archive) +- moves valid files to archive +- moves invalid files to failed + +Import history is visible in Django admin: +- `ImportRun` +- `ImportFile` ## Migration and Superuser Commands diff --git a/apps/ingestion/admin.py
b/apps/ingestion/admin.py index 9fce1f9..fece4c6 100644 --- a/apps/ingestion/admin.py +++ b/apps/ingestion/admin.py @@ -8,6 +8,8 @@ class ImportFileInline(admin.TabularInline): extra = 0 readonly_fields = ( "relative_path", + "source_name", + "snapshot_date", "status", "checksum", "file_size_bytes", @@ -61,6 +63,8 @@ class ImportFileAdmin(admin.ModelAdmin): "id", "import_run", "relative_path", + "source_name", + "snapshot_date", "status", "rows_total", "rows_upserted", @@ -68,10 +72,12 @@ class ImportFileAdmin(admin.ModelAdmin): "processed_at", ) list_filter = ("status",) - search_fields = ("relative_path", "checksum", "error_message") + search_fields = ("relative_path", "source_name", "checksum", "error_message") readonly_fields = ( "import_run", "relative_path", + "source_name", + "snapshot_date", "status", "checksum", "file_size_bytes", diff --git a/apps/ingestion/management/__init__.py b/apps/ingestion/management/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/apps/ingestion/management/commands/__init__.py b/apps/ingestion/management/commands/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/apps/ingestion/management/commands/import_snapshots.py b/apps/ingestion/management/commands/import_snapshots.py new file mode 100644 index 0000000..85eece5 --- /dev/null +++ b/apps/ingestion/management/commands/import_snapshots.py @@ -0,0 +1,23 @@ +from django.conf import settings +from django.core.management.base import BaseCommand + +from apps.ingestion.services.snapshot_import import SnapshotImporter + + +class Command(BaseCommand): + help = "Import static JSON snapshots from incoming directory into PostgreSQL." 
+ + def handle(self, *args, **options): + importer = SnapshotImporter( + incoming_dir=settings.STATIC_DATASET_INCOMING_DIR, + archive_dir=settings.STATIC_DATASET_ARCHIVE_DIR, + failed_dir=settings.STATIC_DATASET_FAILED_DIR, + ) + run = importer.run() + self.stdout.write( + self.style.SUCCESS( + f"Import run {run.id} completed: status={run.status} " + f"files={run.files_processed}/{run.files_total} " + f"rows_upserted={run.rows_upserted} rows_failed={run.rows_failed}" + ) + ) diff --git a/apps/ingestion/migrations/0004_importfile_snapshot_date_importfile_source_name_and_more.py b/apps/ingestion/migrations/0004_importfile_snapshot_date_importfile_source_name_and_more.py new file mode 100644 index 0000000..346b732 --- /dev/null +++ b/apps/ingestion/migrations/0004_importfile_snapshot_date_importfile_source_name_and_more.py @@ -0,0 +1,27 @@ +# Generated by Django 5.2.12 on 2026-03-13 12:59 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('ingestion', '0003_importrun_importfile_and_more'), + ] + + operations = [ + migrations.AddField( + model_name='importfile', + name='snapshot_date', + field=models.DateField(blank=True, null=True), + ), + migrations.AddField( + model_name='importfile', + name='source_name', + field=models.CharField(blank=True, max_length=120), + ), + migrations.AddIndex( + model_name='importfile', + index=models.Index(fields=['source_name', 'snapshot_date'], name='ingestion_i_source__de6843_idx'), + ), + ] diff --git a/apps/ingestion/models.py b/apps/ingestion/models.py index 3d6b0a1..4289dd7 100644 --- a/apps/ingestion/models.py +++ b/apps/ingestion/models.py @@ -57,6 +57,8 @@ class ImportFile(models.Model): related_name="files", ) relative_path = models.CharField(max_length=260) + source_name = models.CharField(max_length=120, blank=True) + snapshot_date = models.DateField(blank=True, null=True) status = models.CharField(max_length=24, choices=FileStatus.choices, 
default=FileStatus.PENDING) checksum = models.CharField(max_length=128, blank=True) file_size_bytes = models.PositiveBigIntegerField(blank=True, null=True) @@ -79,6 +81,7 @@ class ImportFile(models.Model): indexes = [ models.Index(fields=["import_run", "status"]), models.Index(fields=["relative_path"]), + models.Index(fields=["source_name", "snapshot_date"]), models.Index(fields=["processed_at"]), ] diff --git a/apps/ingestion/services/snapshot_import.py b/apps/ingestion/services/snapshot_import.py new file mode 100644 index 0000000..4640db8 --- /dev/null +++ b/apps/ingestion/services/snapshot_import.py @@ -0,0 +1,310 @@ +from __future__ import annotations + +import hashlib +import json +import shutil +from dataclasses import dataclass +from datetime import date, datetime +from decimal import Decimal +from pathlib import Path +from typing import Any + +from django.db import transaction +from django.template.defaultfilters import slugify +from django.utils import timezone +from django.utils.dateparse import parse_date + +from apps.competitions.models import Competition, Season +from apps.ingestion.models import ImportFile, ImportRun +from apps.ingestion.snapshots import SnapshotSchemaValidator, SnapshotValidationError +from apps.players.models import Nationality, Player, Position, Role +from apps.stats.models import PlayerSeason, PlayerSeasonStats +from apps.teams.models import Team + + +@dataclass +class ImportSummary: + files_total: int = 0 + files_processed: int = 0 + rows_total: int = 0 + rows_upserted: int = 0 + rows_failed: int = 0 + + +def _safe_move(src: Path, destination_dir: Path) -> Path: + destination_dir.mkdir(parents=True, exist_ok=True) + candidate = destination_dir / src.name + if candidate.exists(): + ts = datetime.utcnow().strftime("%Y%m%d%H%M%S") + candidate = destination_dir / f"{src.stem}-{ts}{src.suffix}" + shutil.move(str(src), str(candidate)) + return candidate + + +def _file_checksum(path: Path) -> str: + digest = hashlib.sha256() + with 
path.open("rb") as handle: + for chunk in iter(lambda: handle.read(1024 * 1024), b""): + digest.update(chunk) + return digest.hexdigest() + + +def _normalize_decimal(value: float | int | str) -> Decimal: + return Decimal(str(value)).quantize(Decimal("0.01")) + + +def _parse_season_dates(label: str) -> tuple[date, date]: + if "-" in label: + first = label.split("-", 1)[0] + else: + first = label + year = int(first) + return date(year, 9, 1), date(year + 1, 7, 31) + + +def _resolve_nationality(value: str | None) -> Nationality | None: + if not value: + return None + token = value.strip() + if not token: + return None + if len(token) == 2: + code = token.upper() + obj, _ = Nationality.objects.get_or_create( + iso2_code=code, + defaults={"name": code}, + ) + return obj + return Nationality.objects.filter(name__iexact=token).first() + + +def _position_code(position_value: str) -> str: + token = position_value.strip().upper().replace(" ", "_") + return (token[:10] or "UNK") + + +def _role_code(role_value: str) -> str: + token = slugify(role_value).replace("-", "_") + return (token[:32] or "unknown") + + +def _player_season_source_uid(record: dict[str, Any], source_name: str, snapshot_date: date) -> str: + return ( + f"{source_name}:{snapshot_date.isoformat()}:" + f"{record['competition_external_id']}:{record['season']}:" + f"{record['team_external_id']}:{record['player_external_id']}" + ) + + +def _upsert_record(record: dict[str, Any], *, source_name: str, snapshot_date: date) -> None: + competition_slug = slugify(record["competition_name"]) or f"competition-{record['competition_external_id']}" + competition, _ = Competition.objects.update_or_create( + source_uid=record["competition_external_id"], + defaults={ + "name": record["competition_name"], + "slug": competition_slug, + "competition_type": Competition.CompetitionType.LEAGUE, + "is_active": True, + }, + ) + + start_date, end_date = _parse_season_dates(record["season"]) + season, _ = Season.objects.update_or_create( 
+ source_uid=f"season:{record['season']}", + defaults={ + "label": record["season"], + "start_date": start_date, + "end_date": end_date, + "is_current": False, + }, + ) + + team_slug = slugify(record["team_name"]) or f"team-{record['team_external_id']}" + team, _ = Team.objects.update_or_create( + source_uid=record["team_external_id"], + defaults={ + "name": record["team_name"], + "slug": team_slug, + "short_name": "", + }, + ) + + position, _ = Position.objects.get_or_create( + code=_position_code(record["position"]), + defaults={"name": record["position"]}, + ) + role = None + if record.get("role"): + role, _ = Role.objects.get_or_create( + code=_role_code(record["role"]), + defaults={"name": record["role"]}, + ) + + player, _ = Player.objects.update_or_create( + source_uid=record["player_external_id"], + defaults={ + "first_name": record["first_name"], + "last_name": record["last_name"], + "full_name": record["full_name"], + "birth_date": parse_date(record["birth_date"]), + "nationality": _resolve_nationality(record.get("nationality")), + "nominal_position": position, + "inferred_role": role, + "height_cm": record["height_cm"], + "weight_kg": record["weight_kg"], + "is_active": True, + }, + ) + + player_season, _ = PlayerSeason.objects.update_or_create( + source_uid=_player_season_source_uid(record, source_name=source_name, snapshot_date=snapshot_date), + defaults={ + "player": player, + "season": season, + "team": team, + "competition": competition, + "games_played": int(record["games_played"]), + "games_started": 0, + "minutes_played": int(round(float(record["minutes_per_game"]) * int(record["games_played"]))), + }, + ) + + PlayerSeasonStats.objects.update_or_create( + player_season=player_season, + defaults={ + "points": _normalize_decimal(record["points_per_game"]), + "rebounds": _normalize_decimal(record["rebounds_per_game"]), + "assists": _normalize_decimal(record["assists_per_game"]), + "steals": _normalize_decimal(record["steals_per_game"]), + "blocks": 
_normalize_decimal(record["blocks_per_game"]), + "turnovers": _normalize_decimal(record["turnovers_per_game"]), + "fg_pct": _normalize_decimal(record["fg_pct"]), + "three_pct": _normalize_decimal(record["three_pt_pct"]), + "ft_pct": _normalize_decimal(record["ft_pct"]), + }, + ) + + +class SnapshotImporter: + def __init__(self, *, incoming_dir: str, archive_dir: str, failed_dir: str): + self.incoming_dir = Path(incoming_dir) + self.archive_dir = Path(archive_dir) + self.failed_dir = Path(failed_dir) + + def _list_input_files(self) -> list[Path]: + if not self.incoming_dir.exists(): + return [] + return sorted(path for path in self.incoming_dir.iterdir() if path.is_file() and path.suffix.lower() == ".json") + + def run(self, *, triggered_by=None) -> ImportRun: + run = ImportRun.objects.create( + source="static_snapshot_json", + status=ImportRun.RunStatus.RUNNING, + triggered_by=triggered_by, + started_at=timezone.now(), + context={ + "incoming_dir": str(self.incoming_dir), + "archive_dir": str(self.archive_dir), + "failed_dir": str(self.failed_dir), + }, + ) + + summary = ImportSummary() + files = self._list_input_files() + summary.files_total = len(files) + + for path in files: + checksum = _file_checksum(path) + file_row = ImportFile.objects.create( + import_run=run, + relative_path=path.name, + status=ImportFile.FileStatus.PROCESSING, + checksum=checksum, + file_size_bytes=path.stat().st_size, + ) + + # Duplicate file content previously imported successfully. + already_imported = ImportFile.objects.filter( + checksum=checksum, + status=ImportFile.FileStatus.SUCCESS, + ).exclude(pk=file_row.pk).exists() + if already_imported: + file_row.status = ImportFile.FileStatus.SKIPPED + file_row.error_message = "Skipped duplicate checksum already imported successfully." 
+ file_row.processed_at = timezone.now() + file_row.save(update_fields=["status", "error_message", "processed_at"]) + _safe_move(path, self.archive_dir) + summary.files_processed += 1 + continue + + try: + payload = json.loads(path.read_text(encoding="utf-8")) + validated = SnapshotSchemaValidator.validate(payload) + + file_row.source_name = validated.source_name + file_row.snapshot_date = validated.snapshot_date + file_row.rows_total = len(validated.records) + + with transaction.atomic(): + for record in validated.records: + _upsert_record(record, source_name=validated.source_name, snapshot_date=validated.snapshot_date) + + file_row.status = ImportFile.FileStatus.SUCCESS + file_row.rows_upserted = len(validated.records) + file_row.payload_preview = { + "source_name": validated.source_name, + "snapshot_date": validated.snapshot_date.isoformat(), + "sample_record": validated.records[0], + } + _safe_move(path, self.archive_dir) + except (json.JSONDecodeError, SnapshotValidationError, ValueError) as exc: + file_row.status = ImportFile.FileStatus.FAILED + file_row.error_message = str(exc) + _safe_move(path, self.failed_dir) + except Exception as exc: # noqa: BLE001 + file_row.status = ImportFile.FileStatus.FAILED + file_row.error_message = f"Unhandled import error: {exc}" + _safe_move(path, self.failed_dir) + + file_row.processed_at = timezone.now() + file_row.save( + update_fields=[ + "source_name", + "snapshot_date", + "status", + "rows_total", + "rows_upserted", + "rows_failed", + "error_message", + "payload_preview", + "processed_at", + ] + ) + + summary.files_processed += 1 + summary.rows_total += file_row.rows_total + summary.rows_upserted += file_row.rows_upserted + summary.rows_failed += file_row.rows_failed + (1 if file_row.status == ImportFile.FileStatus.FAILED else 0) + + run.status = ImportRun.RunStatus.SUCCESS if summary.rows_failed == 0 else ImportRun.RunStatus.FAILED + run.files_total = summary.files_total + run.files_processed = summary.files_processed 
+ run.rows_total = summary.rows_total + run.rows_upserted = summary.rows_upserted + run.rows_failed = summary.rows_failed + run.finished_at = timezone.now() + if summary.rows_failed: + run.error_summary = f"{summary.rows_failed} file/row import error(s)." + run.save( + update_fields=[ + "status", + "files_total", + "files_processed", + "rows_total", + "rows_upserted", + "rows_failed", + "error_summary", + "finished_at", + ] + ) + return run diff --git a/apps/ingestion/snapshots/__init__.py b/apps/ingestion/snapshots/__init__.py new file mode 100644 index 0000000..8bda137 --- /dev/null +++ b/apps/ingestion/snapshots/__init__.py @@ -0,0 +1,3 @@ +from .schema import SnapshotSchemaValidator, SnapshotValidationError, SnapshotValidationResult + +__all__ = ["SnapshotSchemaValidator", "SnapshotValidationError", "SnapshotValidationResult"] diff --git a/apps/ingestion/snapshots/schema.py b/apps/ingestion/snapshots/schema.py new file mode 100644 index 0000000..07943c3 --- /dev/null +++ b/apps/ingestion/snapshots/schema.py @@ -0,0 +1,182 @@ +from __future__ import annotations + +from dataclasses import dataclass +from datetime import date +from typing import Any + +from django.utils.dateparse import parse_date + +REQUIRED_RECORD_FIELDS = { + "competition_external_id", + "competition_name", + "season", + "team_external_id", + "team_name", + "player_external_id", + "full_name", + "first_name", + "last_name", + "birth_date", + "nationality", + "height_cm", + "weight_kg", + "position", + "games_played", + "minutes_per_game", + "points_per_game", + "rebounds_per_game", + "assists_per_game", + "steals_per_game", + "blocks_per_game", + "turnovers_per_game", + "fg_pct", + "three_pt_pct", + "ft_pct", +} + +ALLOWED_TOP_LEVEL_FIELDS = { + "source_name", + "snapshot_date", + "records", + "source_metadata", + "raw_payload", +} + +ALLOWED_RECORD_FIELDS = REQUIRED_RECORD_FIELDS | { + "role", + "source_metadata", + "raw_payload", +} + + +@dataclass +class SnapshotValidationResult: + 
source_name: str + snapshot_date: date + records: list[dict[str, Any]] + + +class SnapshotValidationError(ValueError): + pass + + +class SnapshotSchemaValidator: + """Strict JSON schema validator for HoopScout v2 player-season snapshots.""" + + @staticmethod + def _require_string(value: Any, field: str) -> str: + if not isinstance(value, str) or not value.strip(): + raise SnapshotValidationError(f"{field} must be a non-empty string") + return value.strip() + + @staticmethod + def _require_non_negative_int(value: Any, field: str) -> int: + if isinstance(value, bool): + raise SnapshotValidationError(f"{field} must be a non-negative integer") + try: + parsed = int(value) + except (TypeError, ValueError) as exc: + raise SnapshotValidationError(f"{field} must be a non-negative integer") from exc + if parsed < 0: + raise SnapshotValidationError(f"{field} must be a non-negative integer") + return parsed + + @staticmethod + def _require_float(value: Any, field: str) -> float: + try: + parsed = float(value) + except (TypeError, ValueError) as exc: + raise SnapshotValidationError(f"{field} must be numeric") from exc + return parsed + + @classmethod + def _validate_record(cls, record: dict[str, Any], index: int) -> dict[str, Any]: + unknown = set(record.keys()) - ALLOWED_RECORD_FIELDS + if unknown: + raise SnapshotValidationError( + f"record[{index}] contains unknown fields: {', '.join(sorted(unknown))}" + ) + + missing = REQUIRED_RECORD_FIELDS - set(record.keys()) + if missing: + raise SnapshotValidationError( + f"record[{index}] missing required fields: {', '.join(sorted(missing))}" + ) + + normalized = dict(record) + for field in ( + "competition_external_id", + "competition_name", + "season", + "team_external_id", + "team_name", + "player_external_id", + "full_name", + "first_name", + "last_name", + "nationality", + "position", + ): + normalized[field] = cls._require_string(record.get(field), f"record[{index}].{field}") + + if record.get("role") is not None: + 
normalized["role"] = cls._require_string(record.get("role"), f"record[{index}].role") + + birth_date = parse_date(str(record.get("birth_date"))) + if not birth_date: + raise SnapshotValidationError(f"record[{index}].birth_date must be YYYY-MM-DD") + normalized["birth_date"] = birth_date.isoformat() + + normalized["height_cm"] = cls._require_non_negative_int(record.get("height_cm"), f"record[{index}].height_cm") + normalized["weight_kg"] = cls._require_non_negative_int(record.get("weight_kg"), f"record[{index}].weight_kg") + normalized["games_played"] = cls._require_non_negative_int(record.get("games_played"), f"record[{index}].games_played") + + for field in ( + "minutes_per_game", + "points_per_game", + "rebounds_per_game", + "assists_per_game", + "steals_per_game", + "blocks_per_game", + "turnovers_per_game", + "fg_pct", + "three_pt_pct", + "ft_pct", + ): + normalized[field] = cls._require_float(record.get(field), f"record[{index}].{field}") + + return normalized + + @classmethod + def validate(cls, payload: dict[str, Any]) -> SnapshotValidationResult: + if not isinstance(payload, dict): + raise SnapshotValidationError("Snapshot root must be an object") + + unknown = set(payload.keys()) - ALLOWED_TOP_LEVEL_FIELDS + if unknown: + raise SnapshotValidationError( + f"Snapshot contains unknown top-level fields: {', '.join(sorted(unknown))}" + ) + + source_name = cls._require_string(payload.get("source_name"), "source_name") + + snapshot_date_raw = payload.get("snapshot_date") + snapshot_date = parse_date(str(snapshot_date_raw)) + if not snapshot_date: + raise SnapshotValidationError("snapshot_date must be YYYY-MM-DD") + + records = payload.get("records") + if not isinstance(records, list) or not records: + raise SnapshotValidationError("records must be a non-empty array") + + normalized_records: list[dict[str, Any]] = [] + for index, record in enumerate(records): + if not isinstance(record, dict): + raise SnapshotValidationError(f"record[{index}] must be an object") + 
normalized_records.append(cls._validate_record(record, index=index)) + + return SnapshotValidationResult( + source_name=source_name, + snapshot_date=snapshot_date, + records=normalized_records, + ) diff --git a/config/settings/base.py b/config/settings/base.py index 72c19d7..e5cd367 100644 --- a/config/settings/base.py +++ b/config/settings/base.py @@ -142,10 +142,19 @@ LOGIN_URL = "users:login" LOGIN_REDIRECT_URL = "core:dashboard" LOGOUT_REDIRECT_URL = "core:home" -# HoopScout v2 snapshot storage (volume-backed directories). -SNAPSHOT_INCOMING_DIR = os.getenv("SNAPSHOT_INCOMING_DIR", str(BASE_DIR / "snapshots" / "incoming")) -SNAPSHOT_ARCHIVE_DIR = os.getenv("SNAPSHOT_ARCHIVE_DIR", str(BASE_DIR / "snapshots" / "archive")) -SNAPSHOT_FAILED_DIR = os.getenv("SNAPSHOT_FAILED_DIR", str(BASE_DIR / "snapshots" / "failed")) +# HoopScout v2 static dataset storage (volume-backed directories). +STATIC_DATASET_INCOMING_DIR = os.getenv( + "STATIC_DATASET_INCOMING_DIR", + os.getenv("SNAPSHOT_INCOMING_DIR", str(BASE_DIR / "snapshots" / "incoming")), +) +STATIC_DATASET_ARCHIVE_DIR = os.getenv( + "STATIC_DATASET_ARCHIVE_DIR", + os.getenv("SNAPSHOT_ARCHIVE_DIR", str(BASE_DIR / "snapshots" / "archive")), +) +STATIC_DATASET_FAILED_DIR = os.getenv( + "STATIC_DATASET_FAILED_DIR", + os.getenv("SNAPSHOT_FAILED_DIR", str(BASE_DIR / "snapshots" / "failed")), +) # Optional scheduler command settings for future v2 snapshot jobs. SCHEDULER_ENABLED = env_bool("SCHEDULER_ENABLED", False) diff --git a/entrypoint.sh b/entrypoint.sh index f4f793e..ffbbc1b 100755 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -8,9 +8,9 @@ done echo "PostgreSQL is available." 
-mkdir -p "${SNAPSHOT_INCOMING_DIR:-/app/snapshots/incoming}" \ - "${SNAPSHOT_ARCHIVE_DIR:-/app/snapshots/archive}" \ - "${SNAPSHOT_FAILED_DIR:-/app/snapshots/failed}" +mkdir -p "${STATIC_DATASET_INCOMING_DIR:-${SNAPSHOT_INCOMING_DIR:-/app/snapshots/incoming}}" \ + "${STATIC_DATASET_ARCHIVE_DIR:-${SNAPSHOT_ARCHIVE_DIR:-/app/snapshots/archive}}" \ + "${STATIC_DATASET_FAILED_DIR:-${SNAPSHOT_FAILED_DIR:-/app/snapshots/failed}}" if [ "${DJANGO_SETTINGS_MODULE:-}" = "config.settings.production" ] && [ "$1" = "gunicorn" ]; then echo "Running Django deployment checks..." diff --git a/tests/test_import_snapshots_command.py b/tests/test_import_snapshots_command.py new file mode 100644 index 0000000..67b279f --- /dev/null +++ b/tests/test_import_snapshots_command.py @@ -0,0 +1,189 @@ +from __future__ import annotations + +import json +from datetime import date +from pathlib import Path + +import pytest +from django.core.management import call_command + +from apps.competitions.models import Competition, Season +from apps.ingestion.models import ImportFile, ImportRun +from apps.players.models import Player +from apps.stats.models import PlayerSeason, PlayerSeasonStats +from apps.teams.models import Team + + +def _valid_payload() -> dict: + return { + "source_name": "official_site_feed", + "snapshot_date": "2026-03-13", + "records": [ + { + "competition_external_id": "comp-nba", + "competition_name": "NBA", + "season": "2025-2026", + "team_external_id": "team-lal", + "team_name": "Los Angeles Lakers", + "player_external_id": "player-23", + "full_name": "LeBron James", + "first_name": "LeBron", + "last_name": "James", + "birth_date": "1984-12-30", + "nationality": "US", + "height_cm": 206, + "weight_kg": 113, + "position": "SF", + "role": "Primary Creator", + "games_played": 60, + "minutes_per_game": 34.5, + "points_per_game": 25.4, + "rebounds_per_game": 7.2, + "assists_per_game": 8.1, + "steals_per_game": 1.3, + "blocks_per_game": 0.7, + "turnovers_per_game": 3.2, + "fg_pct": 
51.1, + "three_pt_pct": 38.4, + "ft_pct": 79.8, + } + ], + } + + +def _write_json(path: Path, payload: dict) -> None: + path.write_text(json.dumps(payload), encoding="utf-8") + + +@pytest.mark.django_db +def test_valid_snapshot_import(tmp_path, settings): + incoming = tmp_path / "incoming" + archive = tmp_path / "archive" + failed = tmp_path / "failed" + incoming.mkdir() + archive.mkdir() + failed.mkdir() + + payload = _valid_payload() + file_path = incoming / "nba-2026-03-13.json" + _write_json(file_path, payload) + + settings.STATIC_DATASET_INCOMING_DIR = str(incoming) + settings.STATIC_DATASET_ARCHIVE_DIR = str(archive) + settings.STATIC_DATASET_FAILED_DIR = str(failed) + + call_command("import_snapshots") + + run = ImportRun.objects.get() + assert run.status == ImportRun.RunStatus.SUCCESS + assert run.files_processed == 1 + assert run.rows_upserted == 1 + + import_file = ImportFile.objects.get(import_run=run) + assert import_file.status == ImportFile.FileStatus.SUCCESS + assert import_file.source_name == "official_site_feed" + assert import_file.snapshot_date == date(2026, 3, 13) + + assert (archive / "nba-2026-03-13.json").exists() + assert not (incoming / "nba-2026-03-13.json").exists() + + assert Competition.objects.filter(source_uid="comp-nba").exists() + assert Team.objects.filter(source_uid="team-lal").exists() + assert Player.objects.filter(source_uid="player-23").exists() + assert Season.objects.filter(source_uid="season:2025-2026").exists() + assert PlayerSeason.objects.count() == 1 + assert PlayerSeasonStats.objects.count() == 1 + + +@pytest.mark.django_db +def test_invalid_snapshot_rejected_and_moved_to_failed(tmp_path, settings): + incoming = tmp_path / "incoming" + archive = tmp_path / "archive" + failed = tmp_path / "failed" + incoming.mkdir() + archive.mkdir() + failed.mkdir() + + payload = _valid_payload() + del payload["records"][0]["points_per_game"] + file_path = incoming / "broken.json" + _write_json(file_path, payload) + + 
settings.STATIC_DATASET_INCOMING_DIR = str(incoming) + settings.STATIC_DATASET_ARCHIVE_DIR = str(archive) + settings.STATIC_DATASET_FAILED_DIR = str(failed) + + call_command("import_snapshots") + + run = ImportRun.objects.get() + assert run.status == ImportRun.RunStatus.FAILED + + import_file = ImportFile.objects.get(import_run=run) + assert import_file.status == ImportFile.FileStatus.FAILED + assert "missing required fields" in import_file.error_message + + assert (failed / "broken.json").exists() + assert not (archive / "broken.json").exists() + assert not Competition.objects.exists() + + +@pytest.mark.django_db +def test_idempotent_reimport_uses_checksum_and_skips_duplicate(tmp_path, settings): + incoming = tmp_path / "incoming" + archive = tmp_path / "archive" + failed = tmp_path / "failed" + incoming.mkdir() + archive.mkdir() + failed.mkdir() + + payload = _valid_payload() + _write_json(incoming / "first.json", payload) + + settings.STATIC_DATASET_INCOMING_DIR = str(incoming) + settings.STATIC_DATASET_ARCHIVE_DIR = str(archive) + settings.STATIC_DATASET_FAILED_DIR = str(failed) + + call_command("import_snapshots") + assert Competition.objects.count() == 1 + assert Player.objects.count() == 1 + + # Re-drop same content with different filename. 
+ _write_json(incoming / "first-duplicate.json", payload) + call_command("import_snapshots") + + assert Competition.objects.count() == 1 + assert Player.objects.count() == 1 + assert PlayerSeason.objects.count() == 1 + + duplicate_file = ImportFile.objects.filter(relative_path="first-duplicate.json").order_by("-id").first() + assert duplicate_file is not None + assert duplicate_file.status == ImportFile.FileStatus.SKIPPED + assert duplicate_file.checksum + assert "duplicate checksum" in duplicate_file.error_message.lower() + + assert (archive / "first-duplicate.json").exists() + + +@pytest.mark.django_db +def test_same_run_second_file_same_checksum_is_skipped(tmp_path, settings): + incoming = tmp_path / "incoming" + archive = tmp_path / "archive" + failed = tmp_path / "failed" + incoming.mkdir() + archive.mkdir() + failed.mkdir() + + payload = _valid_payload() + _write_json(incoming / "a.json", payload) + _write_json(incoming / "b.json", payload) + + settings.STATIC_DATASET_INCOMING_DIR = str(incoming) + settings.STATIC_DATASET_ARCHIVE_DIR = str(archive) + settings.STATIC_DATASET_FAILED_DIR = str(failed) + + call_command("import_snapshots") + + files = {row.relative_path: row for row in ImportFile.objects.order_by("relative_path")} + assert files["a.json"].status == ImportFile.FileStatus.SUCCESS + assert files["b.json"].status == ImportFile.FileStatus.SKIPPED + assert files["a.json"].checksum == files["b.json"].checksum