Add v2 snapshot schema validation and import_snapshots command

This commit is contained in:
Alfredo Di Stasio
2026-03-13 14:00:39 +01:00
parent 6aa66807e9
commit eacff3d25e
14 changed files with 844 additions and 16 deletions

View File

@ -31,10 +31,10 @@ POSTGRES_PORT=5432
LOCAL_UID=1000
LOCAL_GID=1000
# Snapshot storage (volume-backed directories)
SNAPSHOT_INCOMING_DIR=/app/snapshots/incoming
SNAPSHOT_ARCHIVE_DIR=/app/snapshots/archive
SNAPSHOT_FAILED_DIR=/app/snapshots/failed
# Static dataset storage (volume-backed directories)
STATIC_DATASET_INCOMING_DIR=/app/snapshots/incoming
STATIC_DATASET_ARCHIVE_DIR=/app/snapshots/archive
STATIC_DATASET_FAILED_DIR=/app/snapshots/failed
# Future optional scheduler loop settings (not enabled in base v2 runtime)
SCHEDULER_ENABLED=0

View File

@ -8,10 +8,9 @@ Current v2 foundation scope in this branch:
- nginx reverse proxy
- management-command-driven runtime operations
- static snapshot directories persisted via Docker named volumes
- strict JSON snapshot schema + import management command
Out of scope in this step:
- domain model redesign
- snapshot importer implementation
- extractor implementation
## Runtime Architecture (v2)
@ -81,7 +80,7 @@ Core groups:
- Django runtime/security vars
- PostgreSQL connection vars
- image tag vars (`APP_IMAGE_TAG`, `NGINX_IMAGE_TAG`)
- snapshot directory vars (`SNAPSHOT_*`)
- snapshot directory vars (`STATIC_DATASET_*`)
- optional future scheduler vars (`SCHEDULER_*`)
## Snapshot Storage Convention
@ -91,7 +90,84 @@ Snapshot files are expected under:
- archive: `/app/snapshots/archive`
- failed: `/app/snapshots/failed`
In this foundation step, directories are created and persisted but no importer/extractor is implemented yet.
Configured via environment:
- `STATIC_DATASET_INCOMING_DIR`
- `STATIC_DATASET_ARCHIVE_DIR`
- `STATIC_DATASET_FAILED_DIR`
## Snapshot JSON Schema (MVP)
Each file must be a JSON object:
```json
{
"source_name": "official_site_feed",
"snapshot_date": "2026-03-13",
"records": [
{
"competition_external_id": "comp-nba",
"competition_name": "NBA",
"season": "2025-2026",
"team_external_id": "team-lal",
"team_name": "Los Angeles Lakers",
"player_external_id": "player-23",
"full_name": "LeBron James",
"first_name": "LeBron",
"last_name": "James",
"birth_date": "1984-12-30",
"nationality": "US",
"height_cm": 206,
"weight_kg": 113,
"position": "SF",
"role": "Primary Creator",
"games_played": 60,
"minutes_per_game": 34.5,
"points_per_game": 25.4,
"rebounds_per_game": 7.2,
"assists_per_game": 8.1,
"steals_per_game": 1.3,
"blocks_per_game": 0.7,
"turnovers_per_game": 3.2,
"fg_pct": 51.1,
"three_pt_pct": 38.4,
"ft_pct": 79.8,
"source_metadata": {},
"raw_payload": {}
}
],
"source_metadata": {},
"raw_payload": {}
}
```
Validation is strict:
- unknown fields are rejected
- required fields must exist
- `snapshot_date` and `birth_date` must be `YYYY-MM-DD`
- numeric fields must be numeric
- invalid files are moved to failed directory
## Import Command
Run import:
```bash
docker compose exec web python manage.py import_snapshots
```
Command behavior:
- scans `STATIC_DATASET_INCOMING_DIR` for `.json` files
- validates strict schema
- computes SHA-256 checksum
- creates `ImportRun` + `ImportFile` records
- upserts relational entities (`Competition`, `Season`, `Team`, `Player`, `PlayerSeason`, `PlayerSeasonStats`)
- skips duplicate content using checksum
- moves valid files to archive
- moves invalid files to failed
Import history is visible in Django admin:
- `ImportRun`
- `ImportFile`
## Migration and Superuser Commands

View File

@ -8,6 +8,8 @@ class ImportFileInline(admin.TabularInline):
extra = 0
readonly_fields = (
"relative_path",
"source_name",
"snapshot_date",
"status",
"checksum",
"file_size_bytes",
@ -61,6 +63,8 @@ class ImportFileAdmin(admin.ModelAdmin):
"id",
"import_run",
"relative_path",
"source_name",
"snapshot_date",
"status",
"rows_total",
"rows_upserted",
@ -68,10 +72,12 @@ class ImportFileAdmin(admin.ModelAdmin):
"processed_at",
)
list_filter = ("status",)
search_fields = ("relative_path", "checksum", "error_message")
search_fields = ("relative_path", "source_name", "checksum", "error_message")
readonly_fields = (
"import_run",
"relative_path",
"source_name",
"snapshot_date",
"status",
"checksum",
"file_size_bytes",

View File

View File

@ -0,0 +1,23 @@
from django.conf import settings
from django.core.management.base import BaseCommand
from apps.ingestion.services.snapshot_import import SnapshotImporter
class Command(BaseCommand):
help = "Import static JSON snapshots from incoming directory into PostgreSQL."
def handle(self, *args, **options):
importer = SnapshotImporter(
incoming_dir=settings.STATIC_DATASET_INCOMING_DIR,
archive_dir=settings.STATIC_DATASET_ARCHIVE_DIR,
failed_dir=settings.STATIC_DATASET_FAILED_DIR,
)
run = importer.run()
self.stdout.write(
self.style.SUCCESS(
f"Import run {run.id} completed: status={run.status} "
f"files={run.files_processed}/{run.files_total} "
f"rows_upserted={run.rows_upserted} rows_failed={run.rows_failed}"
)
)

View File

@ -0,0 +1,27 @@
# Generated by Django 5.2.12 on 2026-03-13 12:59
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('ingestion', '0003_importrun_importfile_and_more'),
]
operations = [
migrations.AddField(
model_name='importfile',
name='snapshot_date',
field=models.DateField(blank=True, null=True),
),
migrations.AddField(
model_name='importfile',
name='source_name',
field=models.CharField(blank=True, max_length=120),
),
migrations.AddIndex(
model_name='importfile',
index=models.Index(fields=['source_name', 'snapshot_date'], name='ingestion_i_source__de6843_idx'),
),
]

View File

@ -57,6 +57,8 @@ class ImportFile(models.Model):
related_name="files",
)
relative_path = models.CharField(max_length=260)
source_name = models.CharField(max_length=120, blank=True)
snapshot_date = models.DateField(blank=True, null=True)
status = models.CharField(max_length=24, choices=FileStatus.choices, default=FileStatus.PENDING)
checksum = models.CharField(max_length=128, blank=True)
file_size_bytes = models.PositiveBigIntegerField(blank=True, null=True)
@ -79,6 +81,7 @@ class ImportFile(models.Model):
indexes = [
models.Index(fields=["import_run", "status"]),
models.Index(fields=["relative_path"]),
models.Index(fields=["source_name", "snapshot_date"]),
models.Index(fields=["processed_at"]),
]

View File

@ -0,0 +1,310 @@
from __future__ import annotations
import hashlib
import json
import shutil
from dataclasses import dataclass
from datetime import date, datetime
from decimal import Decimal
from pathlib import Path
from typing import Any
from django.db import transaction
from django.template.defaultfilters import slugify
from django.utils import timezone
from django.utils.dateparse import parse_date
from apps.competitions.models import Competition, Season
from apps.ingestion.models import ImportFile, ImportRun
from apps.ingestion.snapshots import SnapshotSchemaValidator, SnapshotValidationError
from apps.players.models import Nationality, Player, Position, Role
from apps.stats.models import PlayerSeason, PlayerSeasonStats
from apps.teams.models import Team
@dataclass
class ImportSummary:
files_total: int = 0
files_processed: int = 0
rows_total: int = 0
rows_upserted: int = 0
rows_failed: int = 0
def _safe_move(src: Path, destination_dir: Path) -> Path:
destination_dir.mkdir(parents=True, exist_ok=True)
candidate = destination_dir / src.name
if candidate.exists():
ts = datetime.utcnow().strftime("%Y%m%d%H%M%S")
candidate = destination_dir / f"{src.stem}-{ts}{src.suffix}"
shutil.move(str(src), str(candidate))
return candidate
def _file_checksum(path: Path) -> str:
digest = hashlib.sha256()
with path.open("rb") as handle:
for chunk in iter(lambda: handle.read(1024 * 1024), b""):
digest.update(chunk)
return digest.hexdigest()
def _normalize_decimal(value: float | int | str) -> Decimal:
return Decimal(str(value)).quantize(Decimal("0.01"))
def _parse_season_dates(label: str) -> tuple[date, date]:
if "-" in label:
first = label.split("-", 1)[0]
else:
first = label
year = int(first)
return date(year, 9, 1), date(year + 1, 7, 31)
def _resolve_nationality(value: str | None) -> Nationality | None:
if not value:
return None
token = value.strip()
if not token:
return None
if len(token) == 2:
code = token.upper()
obj, _ = Nationality.objects.get_or_create(
iso2_code=code,
defaults={"name": code},
)
return obj
return Nationality.objects.filter(name__iexact=token).first()
def _position_code(position_value: str) -> str:
token = position_value.strip().upper().replace(" ", "_")
return (token[:10] or "UNK")
def _role_code(role_value: str) -> str:
token = slugify(role_value).replace("-", "_")
return (token[:32] or "unknown")
def _player_season_source_uid(record: dict[str, Any], source_name: str, snapshot_date: date) -> str:
return (
f"{source_name}:{snapshot_date.isoformat()}:"
f"{record['competition_external_id']}:{record['season']}:"
f"{record['team_external_id']}:{record['player_external_id']}"
)
def _upsert_record(record: dict[str, Any], *, source_name: str, snapshot_date: date) -> None:
competition_slug = slugify(record["competition_name"]) or f"competition-{record['competition_external_id']}"
competition, _ = Competition.objects.update_or_create(
source_uid=record["competition_external_id"],
defaults={
"name": record["competition_name"],
"slug": competition_slug,
"competition_type": Competition.CompetitionType.LEAGUE,
"is_active": True,
},
)
start_date, end_date = _parse_season_dates(record["season"])
season, _ = Season.objects.update_or_create(
source_uid=f"season:{record['season']}",
defaults={
"label": record["season"],
"start_date": start_date,
"end_date": end_date,
"is_current": False,
},
)
team_slug = slugify(record["team_name"]) or f"team-{record['team_external_id']}"
team, _ = Team.objects.update_or_create(
source_uid=record["team_external_id"],
defaults={
"name": record["team_name"],
"slug": team_slug,
"short_name": "",
},
)
position, _ = Position.objects.get_or_create(
code=_position_code(record["position"]),
defaults={"name": record["position"]},
)
role = None
if record.get("role"):
role, _ = Role.objects.get_or_create(
code=_role_code(record["role"]),
defaults={"name": record["role"]},
)
player, _ = Player.objects.update_or_create(
source_uid=record["player_external_id"],
defaults={
"first_name": record["first_name"],
"last_name": record["last_name"],
"full_name": record["full_name"],
"birth_date": parse_date(record["birth_date"]),
"nationality": _resolve_nationality(record.get("nationality")),
"nominal_position": position,
"inferred_role": role,
"height_cm": record["height_cm"],
"weight_kg": record["weight_kg"],
"is_active": True,
},
)
player_season, _ = PlayerSeason.objects.update_or_create(
source_uid=_player_season_source_uid(record, source_name=source_name, snapshot_date=snapshot_date),
defaults={
"player": player,
"season": season,
"team": team,
"competition": competition,
"games_played": int(record["games_played"]),
"games_started": 0,
"minutes_played": int(round(float(record["minutes_per_game"]) * int(record["games_played"]))),
},
)
PlayerSeasonStats.objects.update_or_create(
player_season=player_season,
defaults={
"points": _normalize_decimal(record["points_per_game"]),
"rebounds": _normalize_decimal(record["rebounds_per_game"]),
"assists": _normalize_decimal(record["assists_per_game"]),
"steals": _normalize_decimal(record["steals_per_game"]),
"blocks": _normalize_decimal(record["blocks_per_game"]),
"turnovers": _normalize_decimal(record["turnovers_per_game"]),
"fg_pct": _normalize_decimal(record["fg_pct"]),
"three_pct": _normalize_decimal(record["three_pt_pct"]),
"ft_pct": _normalize_decimal(record["ft_pct"]),
},
)
class SnapshotImporter:
def __init__(self, *, incoming_dir: str, archive_dir: str, failed_dir: str):
self.incoming_dir = Path(incoming_dir)
self.archive_dir = Path(archive_dir)
self.failed_dir = Path(failed_dir)
def _list_input_files(self) -> list[Path]:
if not self.incoming_dir.exists():
return []
return sorted(path for path in self.incoming_dir.iterdir() if path.is_file() and path.suffix.lower() == ".json")
def run(self, *, triggered_by=None) -> ImportRun:
run = ImportRun.objects.create(
source="static_snapshot_json",
status=ImportRun.RunStatus.RUNNING,
triggered_by=triggered_by,
started_at=timezone.now(),
context={
"incoming_dir": str(self.incoming_dir),
"archive_dir": str(self.archive_dir),
"failed_dir": str(self.failed_dir),
},
)
summary = ImportSummary()
files = self._list_input_files()
summary.files_total = len(files)
for path in files:
checksum = _file_checksum(path)
file_row = ImportFile.objects.create(
import_run=run,
relative_path=path.name,
status=ImportFile.FileStatus.PROCESSING,
checksum=checksum,
file_size_bytes=path.stat().st_size,
)
# Duplicate file content previously imported successfully.
already_imported = ImportFile.objects.filter(
checksum=checksum,
status=ImportFile.FileStatus.SUCCESS,
).exclude(pk=file_row.pk).exists()
if already_imported:
file_row.status = ImportFile.FileStatus.SKIPPED
file_row.error_message = "Skipped duplicate checksum already imported successfully."
file_row.processed_at = timezone.now()
file_row.save(update_fields=["status", "error_message", "processed_at"])
_safe_move(path, self.archive_dir)
summary.files_processed += 1
continue
try:
payload = json.loads(path.read_text(encoding="utf-8"))
validated = SnapshotSchemaValidator.validate(payload)
file_row.source_name = validated.source_name
file_row.snapshot_date = validated.snapshot_date
file_row.rows_total = len(validated.records)
with transaction.atomic():
for record in validated.records:
_upsert_record(record, source_name=validated.source_name, snapshot_date=validated.snapshot_date)
file_row.status = ImportFile.FileStatus.SUCCESS
file_row.rows_upserted = len(validated.records)
file_row.payload_preview = {
"source_name": validated.source_name,
"snapshot_date": validated.snapshot_date.isoformat(),
"sample_record": validated.records[0],
}
_safe_move(path, self.archive_dir)
except (json.JSONDecodeError, SnapshotValidationError, ValueError) as exc:
file_row.status = ImportFile.FileStatus.FAILED
file_row.error_message = str(exc)
_safe_move(path, self.failed_dir)
except Exception as exc: # noqa: BLE001
file_row.status = ImportFile.FileStatus.FAILED
file_row.error_message = f"Unhandled import error: {exc}"
_safe_move(path, self.failed_dir)
file_row.processed_at = timezone.now()
file_row.save(
update_fields=[
"source_name",
"snapshot_date",
"status",
"rows_total",
"rows_upserted",
"rows_failed",
"error_message",
"payload_preview",
"processed_at",
]
)
summary.files_processed += 1
summary.rows_total += file_row.rows_total
summary.rows_upserted += file_row.rows_upserted
summary.rows_failed += file_row.rows_failed + (1 if file_row.status == ImportFile.FileStatus.FAILED else 0)
run.status = ImportRun.RunStatus.SUCCESS if summary.rows_failed == 0 else ImportRun.RunStatus.FAILED
run.files_total = summary.files_total
run.files_processed = summary.files_processed
run.rows_total = summary.rows_total
run.rows_upserted = summary.rows_upserted
run.rows_failed = summary.rows_failed
run.finished_at = timezone.now()
if summary.rows_failed:
run.error_summary = f"{summary.rows_failed} file/row import error(s)."
run.save(
update_fields=[
"status",
"files_total",
"files_processed",
"rows_total",
"rows_upserted",
"rows_failed",
"error_summary",
"finished_at",
]
)
return run

View File

@ -0,0 +1,3 @@
from .schema import SnapshotSchemaValidator, SnapshotValidationError, SnapshotValidationResult
__all__ = ["SnapshotSchemaValidator", "SnapshotValidationError", "SnapshotValidationResult"]

View File

@ -0,0 +1,182 @@
from __future__ import annotations
from dataclasses import dataclass
from datetime import date
from typing import Any
from django.utils.dateparse import parse_date
REQUIRED_RECORD_FIELDS = {
"competition_external_id",
"competition_name",
"season",
"team_external_id",
"team_name",
"player_external_id",
"full_name",
"first_name",
"last_name",
"birth_date",
"nationality",
"height_cm",
"weight_kg",
"position",
"games_played",
"minutes_per_game",
"points_per_game",
"rebounds_per_game",
"assists_per_game",
"steals_per_game",
"blocks_per_game",
"turnovers_per_game",
"fg_pct",
"three_pt_pct",
"ft_pct",
}
ALLOWED_TOP_LEVEL_FIELDS = {
"source_name",
"snapshot_date",
"records",
"source_metadata",
"raw_payload",
}
ALLOWED_RECORD_FIELDS = REQUIRED_RECORD_FIELDS | {
"role",
"source_metadata",
"raw_payload",
}
@dataclass
class SnapshotValidationResult:
source_name: str
snapshot_date: date
records: list[dict[str, Any]]
class SnapshotValidationError(ValueError):
pass
class SnapshotSchemaValidator:
"""Strict JSON schema validator for HoopScout v2 player-season snapshots."""
@staticmethod
def _require_string(value: Any, field: str) -> str:
if not isinstance(value, str) or not value.strip():
raise SnapshotValidationError(f"{field} must be a non-empty string")
return value.strip()
@staticmethod
def _require_non_negative_int(value: Any, field: str) -> int:
if isinstance(value, bool):
raise SnapshotValidationError(f"{field} must be a non-negative integer")
try:
parsed = int(value)
except (TypeError, ValueError) as exc:
raise SnapshotValidationError(f"{field} must be a non-negative integer") from exc
if parsed < 0:
raise SnapshotValidationError(f"{field} must be a non-negative integer")
return parsed
@staticmethod
def _require_float(value: Any, field: str) -> float:
try:
parsed = float(value)
except (TypeError, ValueError) as exc:
raise SnapshotValidationError(f"{field} must be numeric") from exc
return parsed
@classmethod
def _validate_record(cls, record: dict[str, Any], index: int) -> dict[str, Any]:
unknown = set(record.keys()) - ALLOWED_RECORD_FIELDS
if unknown:
raise SnapshotValidationError(
f"record[{index}] contains unknown fields: {', '.join(sorted(unknown))}"
)
missing = REQUIRED_RECORD_FIELDS - set(record.keys())
if missing:
raise SnapshotValidationError(
f"record[{index}] missing required fields: {', '.join(sorted(missing))}"
)
normalized = dict(record)
for field in (
"competition_external_id",
"competition_name",
"season",
"team_external_id",
"team_name",
"player_external_id",
"full_name",
"first_name",
"last_name",
"nationality",
"position",
):
normalized[field] = cls._require_string(record.get(field), f"record[{index}].{field}")
if record.get("role") is not None:
normalized["role"] = cls._require_string(record.get("role"), f"record[{index}].role")
birth_date = parse_date(str(record.get("birth_date")))
if not birth_date:
raise SnapshotValidationError(f"record[{index}].birth_date must be YYYY-MM-DD")
normalized["birth_date"] = birth_date.isoformat()
normalized["height_cm"] = cls._require_non_negative_int(record.get("height_cm"), f"record[{index}].height_cm")
normalized["weight_kg"] = cls._require_non_negative_int(record.get("weight_kg"), f"record[{index}].weight_kg")
normalized["games_played"] = cls._require_non_negative_int(record.get("games_played"), f"record[{index}].games_played")
for field in (
"minutes_per_game",
"points_per_game",
"rebounds_per_game",
"assists_per_game",
"steals_per_game",
"blocks_per_game",
"turnovers_per_game",
"fg_pct",
"three_pt_pct",
"ft_pct",
):
normalized[field] = cls._require_float(record.get(field), f"record[{index}].{field}")
return normalized
@classmethod
def validate(cls, payload: dict[str, Any]) -> SnapshotValidationResult:
if not isinstance(payload, dict):
raise SnapshotValidationError("Snapshot root must be an object")
unknown = set(payload.keys()) - ALLOWED_TOP_LEVEL_FIELDS
if unknown:
raise SnapshotValidationError(
f"Snapshot contains unknown top-level fields: {', '.join(sorted(unknown))}"
)
source_name = cls._require_string(payload.get("source_name"), "source_name")
snapshot_date_raw = payload.get("snapshot_date")
snapshot_date = parse_date(str(snapshot_date_raw))
if not snapshot_date:
raise SnapshotValidationError("snapshot_date must be YYYY-MM-DD")
records = payload.get("records")
if not isinstance(records, list) or not records:
raise SnapshotValidationError("records must be a non-empty array")
normalized_records: list[dict[str, Any]] = []
for index, record in enumerate(records):
if not isinstance(record, dict):
raise SnapshotValidationError(f"record[{index}] must be an object")
normalized_records.append(cls._validate_record(record, index=index))
return SnapshotValidationResult(
source_name=source_name,
snapshot_date=snapshot_date,
records=normalized_records,
)

View File

@ -142,10 +142,19 @@ LOGIN_URL = "users:login"
LOGIN_REDIRECT_URL = "core:dashboard"
LOGOUT_REDIRECT_URL = "core:home"
# HoopScout v2 snapshot storage (volume-backed directories).
SNAPSHOT_INCOMING_DIR = os.getenv("SNAPSHOT_INCOMING_DIR", str(BASE_DIR / "snapshots" / "incoming"))
SNAPSHOT_ARCHIVE_DIR = os.getenv("SNAPSHOT_ARCHIVE_DIR", str(BASE_DIR / "snapshots" / "archive"))
SNAPSHOT_FAILED_DIR = os.getenv("SNAPSHOT_FAILED_DIR", str(BASE_DIR / "snapshots" / "failed"))
# HoopScout v2 static dataset storage (volume-backed directories).
STATIC_DATASET_INCOMING_DIR = os.getenv(
"STATIC_DATASET_INCOMING_DIR",
os.getenv("SNAPSHOT_INCOMING_DIR", str(BASE_DIR / "snapshots" / "incoming")),
)
STATIC_DATASET_ARCHIVE_DIR = os.getenv(
"STATIC_DATASET_ARCHIVE_DIR",
os.getenv("SNAPSHOT_ARCHIVE_DIR", str(BASE_DIR / "snapshots" / "archive")),
)
STATIC_DATASET_FAILED_DIR = os.getenv(
"STATIC_DATASET_FAILED_DIR",
os.getenv("SNAPSHOT_FAILED_DIR", str(BASE_DIR / "snapshots" / "failed")),
)
# Optional scheduler command settings for future v2 snapshot jobs.
SCHEDULER_ENABLED = env_bool("SCHEDULER_ENABLED", False)

View File

@ -8,9 +8,9 @@ done
echo "PostgreSQL is available."
mkdir -p "${SNAPSHOT_INCOMING_DIR:-/app/snapshots/incoming}" \
"${SNAPSHOT_ARCHIVE_DIR:-/app/snapshots/archive}" \
"${SNAPSHOT_FAILED_DIR:-/app/snapshots/failed}"
mkdir -p "${STATIC_DATASET_INCOMING_DIR:-${SNAPSHOT_INCOMING_DIR:-/app/snapshots/incoming}}" \
"${STATIC_DATASET_ARCHIVE_DIR:-${SNAPSHOT_ARCHIVE_DIR:-/app/snapshots/archive}}" \
"${STATIC_DATASET_FAILED_DIR:-${SNAPSHOT_FAILED_DIR:-/app/snapshots/failed}}"
if [ "${DJANGO_SETTINGS_MODULE:-}" = "config.settings.production" ] && [ "$1" = "gunicorn" ]; then
echo "Running Django deployment checks..."

View File

@ -0,0 +1,189 @@
from __future__ import annotations
import json
from datetime import date
from pathlib import Path
import pytest
from django.core.management import call_command
from apps.competitions.models import Competition, Season
from apps.ingestion.models import ImportFile, ImportRun
from apps.players.models import Player
from apps.stats.models import PlayerSeason, PlayerSeasonStats
from apps.teams.models import Team
def _valid_payload() -> dict:
return {
"source_name": "official_site_feed",
"snapshot_date": "2026-03-13",
"records": [
{
"competition_external_id": "comp-nba",
"competition_name": "NBA",
"season": "2025-2026",
"team_external_id": "team-lal",
"team_name": "Los Angeles Lakers",
"player_external_id": "player-23",
"full_name": "LeBron James",
"first_name": "LeBron",
"last_name": "James",
"birth_date": "1984-12-30",
"nationality": "US",
"height_cm": 206,
"weight_kg": 113,
"position": "SF",
"role": "Primary Creator",
"games_played": 60,
"minutes_per_game": 34.5,
"points_per_game": 25.4,
"rebounds_per_game": 7.2,
"assists_per_game": 8.1,
"steals_per_game": 1.3,
"blocks_per_game": 0.7,
"turnovers_per_game": 3.2,
"fg_pct": 51.1,
"three_pt_pct": 38.4,
"ft_pct": 79.8,
}
],
}
def _write_json(path: Path, payload: dict) -> None:
path.write_text(json.dumps(payload), encoding="utf-8")
@pytest.mark.django_db
def test_valid_snapshot_import(tmp_path, settings):
incoming = tmp_path / "incoming"
archive = tmp_path / "archive"
failed = tmp_path / "failed"
incoming.mkdir()
archive.mkdir()
failed.mkdir()
payload = _valid_payload()
file_path = incoming / "nba-2026-03-13.json"
_write_json(file_path, payload)
settings.STATIC_DATASET_INCOMING_DIR = str(incoming)
settings.STATIC_DATASET_ARCHIVE_DIR = str(archive)
settings.STATIC_DATASET_FAILED_DIR = str(failed)
call_command("import_snapshots")
run = ImportRun.objects.get()
assert run.status == ImportRun.RunStatus.SUCCESS
assert run.files_processed == 1
assert run.rows_upserted == 1
import_file = ImportFile.objects.get(import_run=run)
assert import_file.status == ImportFile.FileStatus.SUCCESS
assert import_file.source_name == "official_site_feed"
assert import_file.snapshot_date == date(2026, 3, 13)
assert (archive / "nba-2026-03-13.json").exists()
assert not (incoming / "nba-2026-03-13.json").exists()
assert Competition.objects.filter(source_uid="comp-nba").exists()
assert Team.objects.filter(source_uid="team-lal").exists()
assert Player.objects.filter(source_uid="player-23").exists()
assert Season.objects.filter(source_uid="season:2025-2026").exists()
assert PlayerSeason.objects.count() == 1
assert PlayerSeasonStats.objects.count() == 1
@pytest.mark.django_db
def test_invalid_snapshot_rejected_and_moved_to_failed(tmp_path, settings):
incoming = tmp_path / "incoming"
archive = tmp_path / "archive"
failed = tmp_path / "failed"
incoming.mkdir()
archive.mkdir()
failed.mkdir()
payload = _valid_payload()
del payload["records"][0]["points_per_game"]
file_path = incoming / "broken.json"
_write_json(file_path, payload)
settings.STATIC_DATASET_INCOMING_DIR = str(incoming)
settings.STATIC_DATASET_ARCHIVE_DIR = str(archive)
settings.STATIC_DATASET_FAILED_DIR = str(failed)
call_command("import_snapshots")
run = ImportRun.objects.get()
assert run.status == ImportRun.RunStatus.FAILED
import_file = ImportFile.objects.get(import_run=run)
assert import_file.status == ImportFile.FileStatus.FAILED
assert "missing required fields" in import_file.error_message
assert (failed / "broken.json").exists()
assert not (archive / "broken.json").exists()
assert not Competition.objects.exists()
@pytest.mark.django_db
def test_idempotent_reimport_uses_checksum_and_skips_duplicate(tmp_path, settings):
incoming = tmp_path / "incoming"
archive = tmp_path / "archive"
failed = tmp_path / "failed"
incoming.mkdir()
archive.mkdir()
failed.mkdir()
payload = _valid_payload()
_write_json(incoming / "first.json", payload)
settings.STATIC_DATASET_INCOMING_DIR = str(incoming)
settings.STATIC_DATASET_ARCHIVE_DIR = str(archive)
settings.STATIC_DATASET_FAILED_DIR = str(failed)
call_command("import_snapshots")
assert Competition.objects.count() == 1
assert Player.objects.count() == 1
# Re-drop same content with different filename.
_write_json(incoming / "first-duplicate.json", payload)
call_command("import_snapshots")
assert Competition.objects.count() == 1
assert Player.objects.count() == 1
assert PlayerSeason.objects.count() == 1
duplicate_file = ImportFile.objects.filter(relative_path="first-duplicate.json").order_by("-id").first()
assert duplicate_file is not None
assert duplicate_file.status == ImportFile.FileStatus.SKIPPED
assert duplicate_file.checksum
assert "duplicate checksum" in duplicate_file.error_message.lower()
assert (archive / "first-duplicate.json").exists()
@pytest.mark.django_db
def test_same_run_second_file_same_checksum_is_skipped(tmp_path, settings):
incoming = tmp_path / "incoming"
archive = tmp_path / "archive"
failed = tmp_path / "failed"
incoming.mkdir()
archive.mkdir()
failed.mkdir()
payload = _valid_payload()
_write_json(incoming / "a.json", payload)
_write_json(incoming / "b.json", payload)
settings.STATIC_DATASET_INCOMING_DIR = str(incoming)
settings.STATIC_DATASET_ARCHIVE_DIR = str(archive)
settings.STATIC_DATASET_FAILED_DIR = str(failed)
call_command("import_snapshots")
files = {row.relative_path: row for row in ImportFile.objects.order_by("relative_path")}
assert files["a.json"].status == ImportFile.FileStatus.SUCCESS
assert files["b.json"].status == ImportFile.FileStatus.SKIPPED
assert files["a.json"].checksum == files["b.json"].checksum