Add v2 snapshot schema validation and import_snapshots command
This commit is contained in:
@ -31,10 +31,10 @@ POSTGRES_PORT=5432
|
||||
LOCAL_UID=1000
|
||||
LOCAL_GID=1000
|
||||
|
||||
# Snapshot storage (volume-backed directories)
|
||||
SNAPSHOT_INCOMING_DIR=/app/snapshots/incoming
|
||||
SNAPSHOT_ARCHIVE_DIR=/app/snapshots/archive
|
||||
SNAPSHOT_FAILED_DIR=/app/snapshots/failed
|
||||
# Static dataset storage (volume-backed directories)
|
||||
STATIC_DATASET_INCOMING_DIR=/app/snapshots/incoming
|
||||
STATIC_DATASET_ARCHIVE_DIR=/app/snapshots/archive
|
||||
STATIC_DATASET_FAILED_DIR=/app/snapshots/failed
|
||||
|
||||
# Future optional scheduler loop settings (not enabled in base v2 runtime)
|
||||
SCHEDULER_ENABLED=0
|
||||
|
||||
84
README.md
84
README.md
@ -8,10 +8,9 @@ Current v2 foundation scope in this branch:
|
||||
- nginx reverse proxy
|
||||
- management-command-driven runtime operations
|
||||
- static snapshot directories persisted via Docker named volumes
|
||||
- strict JSON snapshot schema + import management command
|
||||
|
||||
Out of scope in this step:
|
||||
- domain model redesign
|
||||
- snapshot importer implementation
|
||||
- extractor implementation
|
||||
|
||||
## Runtime Architecture (v2)
|
||||
@ -81,7 +80,7 @@ Core groups:
|
||||
- Django runtime/security vars
|
||||
- PostgreSQL connection vars
|
||||
- image tag vars (`APP_IMAGE_TAG`, `NGINX_IMAGE_TAG`)
|
||||
- snapshot directory vars (`SNAPSHOT_*`)
|
||||
- static dataset directory vars (`STATIC_DATASET_*`)
|
||||
- optional future scheduler vars (`SCHEDULER_*`)
|
||||
|
||||
## Snapshot Storage Convention
|
||||
@ -91,7 +90,84 @@ Snapshot files are expected under:
|
||||
- archive: `/app/snapshots/archive`
|
||||
- failed: `/app/snapshots/failed`
|
||||
|
||||
In this foundation step, directories are created and persisted but no importer/extractor is implemented yet.
|
||||
Configured via environment:
|
||||
- `STATIC_DATASET_INCOMING_DIR`
|
||||
- `STATIC_DATASET_ARCHIVE_DIR`
|
||||
- `STATIC_DATASET_FAILED_DIR`
|
||||
|
||||
## Snapshot JSON Schema (MVP)
|
||||
|
||||
Each file must be a JSON object:
|
||||
|
||||
```json
|
||||
{
|
||||
"source_name": "official_site_feed",
|
||||
"snapshot_date": "2026-03-13",
|
||||
"records": [
|
||||
{
|
||||
"competition_external_id": "comp-nba",
|
||||
"competition_name": "NBA",
|
||||
"season": "2025-2026",
|
||||
"team_external_id": "team-lal",
|
||||
"team_name": "Los Angeles Lakers",
|
||||
"player_external_id": "player-23",
|
||||
"full_name": "LeBron James",
|
||||
"first_name": "LeBron",
|
||||
"last_name": "James",
|
||||
"birth_date": "1984-12-30",
|
||||
"nationality": "US",
|
||||
"height_cm": 206,
|
||||
"weight_kg": 113,
|
||||
"position": "SF",
|
||||
"role": "Primary Creator",
|
||||
"games_played": 60,
|
||||
"minutes_per_game": 34.5,
|
||||
"points_per_game": 25.4,
|
||||
"rebounds_per_game": 7.2,
|
||||
"assists_per_game": 8.1,
|
||||
"steals_per_game": 1.3,
|
||||
"blocks_per_game": 0.7,
|
||||
"turnovers_per_game": 3.2,
|
||||
"fg_pct": 51.1,
|
||||
"three_pt_pct": 38.4,
|
||||
"ft_pct": 79.8,
|
||||
"source_metadata": {},
|
||||
"raw_payload": {}
|
||||
}
|
||||
],
|
||||
"source_metadata": {},
|
||||
"raw_payload": {}
|
||||
}
|
||||
```
|
||||
|
||||
Validation is strict:
|
||||
- unknown fields are rejected
|
||||
- required fields must exist
|
||||
- `snapshot_date` and `birth_date` must be `YYYY-MM-DD`
|
||||
- numeric fields must be numeric
|
||||
- invalid files are moved to failed directory
|
||||
|
||||
## Import Command
|
||||
|
||||
Run import:
|
||||
|
||||
```bash
|
||||
docker compose exec web python manage.py import_snapshots
|
||||
```
|
||||
|
||||
Command behavior:
|
||||
- scans `STATIC_DATASET_INCOMING_DIR` for `.json` files
|
||||
- validates strict schema
|
||||
- computes SHA-256 checksum
|
||||
- creates `ImportRun` + `ImportFile` records
|
||||
- upserts relational entities (`Competition`, `Season`, `Team`, `Player`, `PlayerSeason`, `PlayerSeasonStats`)
|
||||
- skips duplicate content using checksum
|
||||
- moves valid files to archive
|
||||
- moves invalid files to failed
|
||||
|
||||
Import history is visible in Django admin:
|
||||
- `ImportRun`
|
||||
- `ImportFile`
|
||||
|
||||
## Migration and Superuser Commands
|
||||
|
||||
|
||||
@ -8,6 +8,8 @@ class ImportFileInline(admin.TabularInline):
|
||||
extra = 0
|
||||
readonly_fields = (
|
||||
"relative_path",
|
||||
"source_name",
|
||||
"snapshot_date",
|
||||
"status",
|
||||
"checksum",
|
||||
"file_size_bytes",
|
||||
@ -61,6 +63,8 @@ class ImportFileAdmin(admin.ModelAdmin):
|
||||
"id",
|
||||
"import_run",
|
||||
"relative_path",
|
||||
"source_name",
|
||||
"snapshot_date",
|
||||
"status",
|
||||
"rows_total",
|
||||
"rows_upserted",
|
||||
@ -68,10 +72,12 @@ class ImportFileAdmin(admin.ModelAdmin):
|
||||
"processed_at",
|
||||
)
|
||||
list_filter = ("status",)
|
||||
search_fields = ("relative_path", "checksum", "error_message")
|
||||
search_fields = ("relative_path", "source_name", "checksum", "error_message")
|
||||
readonly_fields = (
|
||||
"import_run",
|
||||
"relative_path",
|
||||
"source_name",
|
||||
"snapshot_date",
|
||||
"status",
|
||||
"checksum",
|
||||
"file_size_bytes",
|
||||
|
||||
0
apps/ingestion/management/__init__.py
Normal file
0
apps/ingestion/management/__init__.py
Normal file
0
apps/ingestion/management/commands/__init__.py
Normal file
0
apps/ingestion/management/commands/__init__.py
Normal file
23
apps/ingestion/management/commands/import_snapshots.py
Normal file
23
apps/ingestion/management/commands/import_snapshots.py
Normal file
@ -0,0 +1,23 @@
|
||||
from django.conf import settings
|
||||
from django.core.management.base import BaseCommand
|
||||
|
||||
from apps.ingestion.services.snapshot_import import SnapshotImporter
|
||||
|
||||
|
||||
class Command(BaseCommand):
    """Management command entry point for the static snapshot importer."""

    help = "Import static JSON snapshots from incoming directory into PostgreSQL."

    def handle(self, *args, **options):
        """Run one import pass over the configured static dataset directories."""
        run = SnapshotImporter(
            incoming_dir=settings.STATIC_DATASET_INCOMING_DIR,
            archive_dir=settings.STATIC_DATASET_ARCHIVE_DIR,
            failed_dir=settings.STATIC_DATASET_FAILED_DIR,
        ).run()
        # Summarize the persisted ImportRun row for the operator.
        summary = (
            f"Import run {run.id} completed: status={run.status} "
            f"files={run.files_processed}/{run.files_total} "
            f"rows_upserted={run.rows_upserted} rows_failed={run.rows_failed}"
        )
        self.stdout.write(self.style.SUCCESS(summary))
|
||||
@ -0,0 +1,27 @@
|
||||
# Generated by Django 5.2.12 on 2026-03-13 12:59
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    # Auto-generated migration: adds ImportFile.snapshot_date and
    # ImportFile.source_name plus a composite index supporting lookups
    # by feed name and snapshot date.

    dependencies = [
        ('ingestion', '0003_importrun_importfile_and_more'),
    ]

    operations = [
        migrations.AddField(
            model_name='importfile',
            name='snapshot_date',
            field=models.DateField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name='importfile',
            name='source_name',
            field=models.CharField(blank=True, max_length=120),
        ),
        migrations.AddIndex(
            model_name='importfile',
            index=models.Index(fields=['source_name', 'snapshot_date'], name='ingestion_i_source__de6843_idx'),
        ),
    ]
|
||||
@ -57,6 +57,8 @@ class ImportFile(models.Model):
|
||||
related_name="files",
|
||||
)
|
||||
relative_path = models.CharField(max_length=260)
|
||||
source_name = models.CharField(max_length=120, blank=True)
|
||||
snapshot_date = models.DateField(blank=True, null=True)
|
||||
status = models.CharField(max_length=24, choices=FileStatus.choices, default=FileStatus.PENDING)
|
||||
checksum = models.CharField(max_length=128, blank=True)
|
||||
file_size_bytes = models.PositiveBigIntegerField(blank=True, null=True)
|
||||
@ -79,6 +81,7 @@ class ImportFile(models.Model):
|
||||
indexes = [
|
||||
models.Index(fields=["import_run", "status"]),
|
||||
models.Index(fields=["relative_path"]),
|
||||
models.Index(fields=["source_name", "snapshot_date"]),
|
||||
models.Index(fields=["processed_at"]),
|
||||
]
|
||||
|
||||
|
||||
310
apps/ingestion/services/snapshot_import.py
Normal file
310
apps/ingestion/services/snapshot_import.py
Normal file
@ -0,0 +1,310 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import shutil
|
||||
from dataclasses import dataclass
|
||||
from datetime import date, datetime
|
||||
from decimal import Decimal
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from django.db import transaction
|
||||
from django.template.defaultfilters import slugify
|
||||
from django.utils import timezone
|
||||
from django.utils.dateparse import parse_date
|
||||
|
||||
from apps.competitions.models import Competition, Season
|
||||
from apps.ingestion.models import ImportFile, ImportRun
|
||||
from apps.ingestion.snapshots import SnapshotSchemaValidator, SnapshotValidationError
|
||||
from apps.players.models import Nationality, Player, Position, Role
|
||||
from apps.stats.models import PlayerSeason, PlayerSeasonStats
|
||||
from apps.teams.models import Team
|
||||
|
||||
|
||||
@dataclass
class ImportSummary:
    """Mutable per-run counters accumulated while processing snapshot files."""

    # Number of .json files found in the incoming directory at run start.
    files_total: int = 0
    # Files handled so far (success, skipped duplicate, or failed).
    files_processed: int = 0
    # Sum of record counts across validated files.
    rows_total: int = 0
    rows_upserted: int = 0
    # Failed rows plus one per file that failed outright.
    rows_failed: int = 0
|
||||
|
||||
|
||||
def _safe_move(src: Path, destination_dir: Path) -> Path:
|
||||
destination_dir.mkdir(parents=True, exist_ok=True)
|
||||
candidate = destination_dir / src.name
|
||||
if candidate.exists():
|
||||
ts = datetime.utcnow().strftime("%Y%m%d%H%M%S")
|
||||
candidate = destination_dir / f"{src.stem}-{ts}{src.suffix}"
|
||||
shutil.move(str(src), str(candidate))
|
||||
return candidate
|
||||
|
||||
|
||||
def _file_checksum(path: Path) -> str:
|
||||
digest = hashlib.sha256()
|
||||
with path.open("rb") as handle:
|
||||
for chunk in iter(lambda: handle.read(1024 * 1024), b""):
|
||||
digest.update(chunk)
|
||||
return digest.hexdigest()
|
||||
|
||||
|
||||
def _normalize_decimal(value: float | int | str) -> Decimal:
|
||||
return Decimal(str(value)).quantize(Decimal("0.01"))
|
||||
|
||||
|
||||
def _parse_season_dates(label: str) -> tuple[date, date]:
|
||||
if "-" in label:
|
||||
first = label.split("-", 1)[0]
|
||||
else:
|
||||
first = label
|
||||
year = int(first)
|
||||
return date(year, 9, 1), date(year + 1, 7, 31)
|
||||
|
||||
|
||||
def _resolve_nationality(value: str | None) -> Nationality | None:
    """Map a snapshot nationality string to a Nationality row.

    Two-letter tokens are treated as ISO alpha-2 codes and created on demand;
    longer tokens are matched case-insensitively against existing names.
    Returns ``None`` for blank input or an unmatched full name.
    """
    if not value:
        return None
    token = value.strip()
    if not token:
        return None
    if len(token) != 2:
        return Nationality.objects.filter(name__iexact=token).first()
    code = token.upper()
    nationality, _created = Nationality.objects.get_or_create(
        iso2_code=code,
        defaults={"name": code},
    )
    return nationality
|
||||
|
||||
|
||||
def _position_code(position_value: str) -> str:
|
||||
token = position_value.strip().upper().replace(" ", "_")
|
||||
return (token[:10] or "UNK")
|
||||
|
||||
|
||||
def _role_code(role_value: str) -> str:
    """Normalize a free-form role label into a short slug-style code.

    The label is slugified, hyphens converted to underscores, and the result
    capped at 32 characters; blank input falls back to ``"unknown"``.
    """
    normalized = slugify(role_value).replace("-", "_")
    truncated = normalized[:32]
    return truncated if truncated else "unknown"
|
||||
|
||||
|
||||
def _player_season_source_uid(record: dict[str, Any], source_name: str, snapshot_date: date) -> str:
|
||||
return (
|
||||
f"{source_name}:{snapshot_date.isoformat()}:"
|
||||
f"{record['competition_external_id']}:{record['season']}:"
|
||||
f"{record['team_external_id']}:{record['player_external_id']}"
|
||||
)
|
||||
|
||||
|
||||
def _upsert_record(record: dict[str, Any], *, source_name: str, snapshot_date: date) -> None:
    """Upsert one validated snapshot record into the relational model.

    Creates or updates, in dependency order: Competition, Season, Team,
    Position/Role lookup rows, Player, then PlayerSeason and its
    PlayerSeasonStats.  Assumes *record* has already passed
    SnapshotSchemaValidator, so required keys exist, strings are non-empty
    and numeric fields are numeric.
    """
    # Slug falls back to an id-derived value when the name slugifies to "".
    competition_slug = slugify(record["competition_name"]) or f"competition-{record['competition_external_id']}"
    competition, _ = Competition.objects.update_or_create(
        source_uid=record["competition_external_id"],
        defaults={
            "name": record["competition_name"],
            "slug": competition_slug,
            "competition_type": Competition.CompetitionType.LEAGUE,
            "is_active": True,
        },
    )

    # Season rows are keyed purely on the label, shared across competitions.
    start_date, end_date = _parse_season_dates(record["season"])
    season, _ = Season.objects.update_or_create(
        source_uid=f"season:{record['season']}",
        defaults={
            "label": record["season"],
            "start_date": start_date,
            "end_date": end_date,
            "is_current": False,
        },
    )

    team_slug = slugify(record["team_name"]) or f"team-{record['team_external_id']}"
    team, _ = Team.objects.update_or_create(
        source_uid=record["team_external_id"],
        defaults={
            "name": record["team_name"],
            "slug": team_slug,
            "short_name": "",
        },
    )

    # Position is required by the schema; role is optional and may be absent.
    position, _ = Position.objects.get_or_create(
        code=_position_code(record["position"]),
        defaults={"name": record["position"]},
    )
    role = None
    if record.get("role"):
        role, _ = Role.objects.get_or_create(
            code=_role_code(record["role"]),
            defaults={"name": record["role"]},
        )

    player, _ = Player.objects.update_or_create(
        source_uid=record["player_external_id"],
        defaults={
            "first_name": record["first_name"],
            "last_name": record["last_name"],
            "full_name": record["full_name"],
            "birth_date": parse_date(record["birth_date"]),
            "nationality": _resolve_nationality(record.get("nationality")),
            "nominal_position": position,
            "inferred_role": role,
            "height_cm": record["height_cm"],
            "weight_kg": record["weight_kg"],
            "is_active": True,
        },
    )

    player_season, _ = PlayerSeason.objects.update_or_create(
        source_uid=_player_season_source_uid(record, source_name=source_name, snapshot_date=snapshot_date),
        defaults={
            "player": player,
            "season": season,
            "team": team,
            "competition": competition,
            "games_played": int(record["games_played"]),
            "games_started": 0,
            # Snapshot only carries per-game minutes; derive the season total.
            "minutes_played": int(round(float(record["minutes_per_game"]) * int(record["games_played"]))),
        },
    )

    # One stats row per PlayerSeason; per-game values stored as 2dp decimals.
    PlayerSeasonStats.objects.update_or_create(
        player_season=player_season,
        defaults={
            "points": _normalize_decimal(record["points_per_game"]),
            "rebounds": _normalize_decimal(record["rebounds_per_game"]),
            "assists": _normalize_decimal(record["assists_per_game"]),
            "steals": _normalize_decimal(record["steals_per_game"]),
            "blocks": _normalize_decimal(record["blocks_per_game"]),
            "turnovers": _normalize_decimal(record["turnovers_per_game"]),
            "fg_pct": _normalize_decimal(record["fg_pct"]),
            "three_pct": _normalize_decimal(record["three_pt_pct"]),
            "ft_pct": _normalize_decimal(record["ft_pct"]),
        },
    )
|
||||
|
||||
|
||||
class SnapshotImporter:
    """Imports strict-schema JSON snapshots from a drop directory.

    Each :meth:`run` scans ``incoming_dir`` for ``*.json`` files, validates
    them, upserts their records inside one transaction per file, and moves
    each file to ``archive_dir`` (success or duplicate) or ``failed_dir``
    (validation/import error).  Outcomes are persisted as one ImportRun row
    plus one ImportFile row per file.
    """

    def __init__(self, *, incoming_dir: str, archive_dir: str, failed_dir: str):
        self.incoming_dir = Path(incoming_dir)
        self.archive_dir = Path(archive_dir)
        self.failed_dir = Path(failed_dir)

    def _list_input_files(self) -> list[Path]:
        """Return the .json files in the incoming directory, sorted for determinism."""
        # A missing incoming directory simply means there is no work.
        if not self.incoming_dir.exists():
            return []
        return sorted(path for path in self.incoming_dir.iterdir() if path.is_file() and path.suffix.lower() == ".json")

    def run(self, *, triggered_by=None) -> ImportRun:
        """Process all pending snapshot files and return the finished ImportRun.

        *triggered_by* is stored on the run (e.g. the admin user that kicked
        it off); ``None`` for unattended/management-command invocations.
        """
        run = ImportRun.objects.create(
            source="static_snapshot_json",
            status=ImportRun.RunStatus.RUNNING,
            triggered_by=triggered_by,
            started_at=timezone.now(),
            context={
                "incoming_dir": str(self.incoming_dir),
                "archive_dir": str(self.archive_dir),
                "failed_dir": str(self.failed_dir),
            },
        )

        summary = ImportSummary()
        files = self._list_input_files()
        summary.files_total = len(files)

        for path in files:
            # Checksum is computed before parsing so duplicates are detected
            # even when the file content is not valid JSON.
            checksum = _file_checksum(path)
            file_row = ImportFile.objects.create(
                import_run=run,
                relative_path=path.name,
                status=ImportFile.FileStatus.PROCESSING,
                checksum=checksum,
                file_size_bytes=path.stat().st_size,
            )

            # Duplicate file content previously imported successfully
            # (in this run or any earlier one) is skipped and archived.
            already_imported = ImportFile.objects.filter(
                checksum=checksum,
                status=ImportFile.FileStatus.SUCCESS,
            ).exclude(pk=file_row.pk).exists()
            if already_imported:
                file_row.status = ImportFile.FileStatus.SKIPPED
                file_row.error_message = "Skipped duplicate checksum already imported successfully."
                file_row.processed_at = timezone.now()
                file_row.save(update_fields=["status", "error_message", "processed_at"])
                _safe_move(path, self.archive_dir)
                summary.files_processed += 1
                continue

            try:
                payload = json.loads(path.read_text(encoding="utf-8"))
                validated = SnapshotSchemaValidator.validate(payload)

                file_row.source_name = validated.source_name
                file_row.snapshot_date = validated.snapshot_date
                file_row.rows_total = len(validated.records)

                # All-or-nothing per file: any record failure rolls back the
                # whole file's upserts.
                with transaction.atomic():
                    for record in validated.records:
                        _upsert_record(record, source_name=validated.source_name, snapshot_date=validated.snapshot_date)

                file_row.status = ImportFile.FileStatus.SUCCESS
                file_row.rows_upserted = len(validated.records)
                file_row.payload_preview = {
                    "source_name": validated.source_name,
                    "snapshot_date": validated.snapshot_date.isoformat(),
                    "sample_record": validated.records[0],
                }
                _safe_move(path, self.archive_dir)
            except (json.JSONDecodeError, SnapshotValidationError, ValueError) as exc:
                # Expected failure modes: malformed JSON or schema violations.
                file_row.status = ImportFile.FileStatus.FAILED
                file_row.error_message = str(exc)
                _safe_move(path, self.failed_dir)
            except Exception as exc:  # noqa: BLE001 — quarantine the file rather than abort the run
                file_row.status = ImportFile.FileStatus.FAILED
                file_row.error_message = f"Unhandled import error: {exc}"
                _safe_move(path, self.failed_dir)

            file_row.processed_at = timezone.now()
            file_row.save(
                update_fields=[
                    "source_name",
                    "snapshot_date",
                    "status",
                    "rows_total",
                    "rows_upserted",
                    "rows_failed",
                    "error_message",
                    "payload_preview",
                    "processed_at",
                ]
            )

            summary.files_processed += 1
            summary.rows_total += file_row.rows_total
            summary.rows_upserted += file_row.rows_upserted
            # A failed file counts as one additional failure on top of any
            # row-level failures recorded on the ImportFile.
            summary.rows_failed += file_row.rows_failed + (1 if file_row.status == ImportFile.FileStatus.FAILED else 0)

        # The run is marked FAILED if any file (or row) failed; skipped
        # duplicates do not affect the run status.
        run.status = ImportRun.RunStatus.SUCCESS if summary.rows_failed == 0 else ImportRun.RunStatus.FAILED
        run.files_total = summary.files_total
        run.files_processed = summary.files_processed
        run.rows_total = summary.rows_total
        run.rows_upserted = summary.rows_upserted
        run.rows_failed = summary.rows_failed
        run.finished_at = timezone.now()
        if summary.rows_failed:
            run.error_summary = f"{summary.rows_failed} file/row import error(s)."
        run.save(
            update_fields=[
                "status",
                "files_total",
                "files_processed",
                "rows_total",
                "rows_upserted",
                "rows_failed",
                "error_summary",
                "finished_at",
            ]
        )
        return run
|
||||
3
apps/ingestion/snapshots/__init__.py
Normal file
3
apps/ingestion/snapshots/__init__.py
Normal file
@ -0,0 +1,3 @@
|
||||
from .schema import SnapshotSchemaValidator, SnapshotValidationError, SnapshotValidationResult
|
||||
|
||||
__all__ = ["SnapshotSchemaValidator", "SnapshotValidationError", "SnapshotValidationResult"]
|
||||
182
apps/ingestion/snapshots/schema.py
Normal file
182
apps/ingestion/snapshots/schema.py
Normal file
@ -0,0 +1,182 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from datetime import date
|
||||
from typing import Any
|
||||
|
||||
from django.utils.dateparse import parse_date
|
||||
|
||||
# Record-level fields that must be present on every snapshot record.
REQUIRED_RECORD_FIELDS = {
    "competition_external_id",
    "competition_name",
    "season",
    "team_external_id",
    "team_name",
    "player_external_id",
    "full_name",
    "first_name",
    "last_name",
    "birth_date",
    "nationality",
    "height_cm",
    "weight_kg",
    "position",
    "games_played",
    "minutes_per_game",
    "points_per_game",
    "rebounds_per_game",
    "assists_per_game",
    "steals_per_game",
    "blocks_per_game",
    "turnovers_per_game",
    "fg_pct",
    "three_pt_pct",
    "ft_pct",
}

# Only these keys may appear at the top level of a snapshot document;
# anything else is rejected by the strict validator.
ALLOWED_TOP_LEVEL_FIELDS = {
    "source_name",
    "snapshot_date",
    "records",
    "source_metadata",
    "raw_payload",
}

# Records may additionally carry an optional role and pass-through metadata.
ALLOWED_RECORD_FIELDS = REQUIRED_RECORD_FIELDS | {
    "role",
    "source_metadata",
    "raw_payload",
}
|
||||
|
||||
|
||||
@dataclass
class SnapshotValidationResult:
    """Outcome of a successful validation: normalized header plus records."""

    # Feed identifier from the snapshot header, stripped of whitespace.
    source_name: str
    # Parsed from the ISO "snapshot_date" header field.
    snapshot_date: date
    # Records with normalized string/int/float values, in input order.
    records: list[dict[str, Any]]
|
||||
|
||||
|
||||
class SnapshotValidationError(ValueError):
    """Raised when a snapshot document violates the strict MVP schema."""

    pass
|
||||
|
||||
|
||||
class SnapshotSchemaValidator:
    """Strict JSON schema validator for HoopScout v2 player-season snapshots.

    Unknown fields (top-level or per record) are rejected, all required
    fields must be present, dates must be ISO ``YYYY-MM-DD``, and numeric
    fields must be real numbers.  :meth:`validate` returns normalized data
    or raises :class:`SnapshotValidationError` on the first violation.
    """

    @staticmethod
    def _require_string(value: Any, field: str) -> str:
        """Return *value* stripped, or raise if it is not a non-empty string."""
        if not isinstance(value, str) or not value.strip():
            raise SnapshotValidationError(f"{field} must be a non-empty string")
        return value.strip()

    @staticmethod
    def _require_non_negative_int(value: Any, field: str) -> int:
        """Return *value* coerced to an int >= 0; booleans are rejected."""
        # bool is a subclass of int, so int(True) would silently yield 1.
        if isinstance(value, bool):
            raise SnapshotValidationError(f"{field} must be a non-negative integer")
        try:
            parsed = int(value)
        except (TypeError, ValueError) as exc:
            raise SnapshotValidationError(f"{field} must be a non-negative integer") from exc
        if parsed < 0:
            raise SnapshotValidationError(f"{field} must be a non-negative integer")
        return parsed

    @staticmethod
    def _require_float(value: Any, field: str) -> float:
        """Return *value* coerced to float; booleans are rejected."""
        # Fix: float(True) == 1.0 would silently pass a boolean as a stat
        # value; reject it explicitly, mirroring _require_non_negative_int.
        if isinstance(value, bool):
            raise SnapshotValidationError(f"{field} must be numeric")
        try:
            parsed = float(value)
        except (TypeError, ValueError) as exc:
            raise SnapshotValidationError(f"{field} must be numeric") from exc
        return parsed

    @classmethod
    def _validate_record(cls, record: dict[str, Any], index: int) -> dict[str, Any]:
        """Validate and normalize one record; *index* is used in error messages."""
        unknown = set(record.keys()) - ALLOWED_RECORD_FIELDS
        if unknown:
            raise SnapshotValidationError(
                f"record[{index}] contains unknown fields: {', '.join(sorted(unknown))}"
            )

        missing = REQUIRED_RECORD_FIELDS - set(record.keys())
        if missing:
            raise SnapshotValidationError(
                f"record[{index}] missing required fields: {', '.join(sorted(missing))}"
            )

        normalized = dict(record)
        # Required identifiers and names: stripped, non-empty strings.
        for field in (
            "competition_external_id",
            "competition_name",
            "season",
            "team_external_id",
            "team_name",
            "player_external_id",
            "full_name",
            "first_name",
            "last_name",
            "nationality",
            "position",
        ):
            normalized[field] = cls._require_string(record.get(field), f"record[{index}].{field}")

        # Role is optional, but when present it must still be a non-empty string.
        if record.get("role") is not None:
            normalized["role"] = cls._require_string(record.get("role"), f"record[{index}].role")

        birth_date = parse_date(str(record.get("birth_date")))
        if not birth_date:
            raise SnapshotValidationError(f"record[{index}].birth_date must be YYYY-MM-DD")
        normalized["birth_date"] = birth_date.isoformat()

        normalized["height_cm"] = cls._require_non_negative_int(record.get("height_cm"), f"record[{index}].height_cm")
        normalized["weight_kg"] = cls._require_non_negative_int(record.get("weight_kg"), f"record[{index}].weight_kg")
        normalized["games_played"] = cls._require_non_negative_int(record.get("games_played"), f"record[{index}].games_played")

        # Per-game statistics and shooting percentages: any real number.
        for field in (
            "minutes_per_game",
            "points_per_game",
            "rebounds_per_game",
            "assists_per_game",
            "steals_per_game",
            "blocks_per_game",
            "turnovers_per_game",
            "fg_pct",
            "three_pt_pct",
            "ft_pct",
        ):
            normalized[field] = cls._require_float(record.get(field), f"record[{index}].{field}")

        return normalized

    @classmethod
    def validate(cls, payload: dict[str, Any]) -> SnapshotValidationResult:
        """Validate a full snapshot document.

        Returns a :class:`SnapshotValidationResult` with normalized records.
        Raises :class:`SnapshotValidationError` on the first violation.
        """
        if not isinstance(payload, dict):
            raise SnapshotValidationError("Snapshot root must be an object")

        unknown = set(payload.keys()) - ALLOWED_TOP_LEVEL_FIELDS
        if unknown:
            raise SnapshotValidationError(
                f"Snapshot contains unknown top-level fields: {', '.join(sorted(unknown))}"
            )

        source_name = cls._require_string(payload.get("source_name"), "source_name")

        snapshot_date_raw = payload.get("snapshot_date")
        snapshot_date = parse_date(str(snapshot_date_raw))
        if not snapshot_date:
            raise SnapshotValidationError("snapshot_date must be YYYY-MM-DD")

        records = payload.get("records")
        if not isinstance(records, list) or not records:
            raise SnapshotValidationError("records must be a non-empty array")

        normalized_records: list[dict[str, Any]] = []
        for index, record in enumerate(records):
            if not isinstance(record, dict):
                raise SnapshotValidationError(f"record[{index}] must be an object")
            normalized_records.append(cls._validate_record(record, index=index))

        return SnapshotValidationResult(
            source_name=source_name,
            snapshot_date=snapshot_date,
            records=normalized_records,
        )
|
||||
@ -142,10 +142,19 @@ LOGIN_URL = "users:login"
|
||||
LOGIN_REDIRECT_URL = "core:dashboard"
|
||||
LOGOUT_REDIRECT_URL = "core:home"
|
||||
|
||||
# HoopScout v2 snapshot storage (volume-backed directories).
|
||||
SNAPSHOT_INCOMING_DIR = os.getenv("SNAPSHOT_INCOMING_DIR", str(BASE_DIR / "snapshots" / "incoming"))
|
||||
SNAPSHOT_ARCHIVE_DIR = os.getenv("SNAPSHOT_ARCHIVE_DIR", str(BASE_DIR / "snapshots" / "archive"))
|
||||
SNAPSHOT_FAILED_DIR = os.getenv("SNAPSHOT_FAILED_DIR", str(BASE_DIR / "snapshots" / "failed"))
|
||||
# HoopScout v2 static dataset storage (volume-backed directories).
|
||||
STATIC_DATASET_INCOMING_DIR = os.getenv(
|
||||
"STATIC_DATASET_INCOMING_DIR",
|
||||
os.getenv("SNAPSHOT_INCOMING_DIR", str(BASE_DIR / "snapshots" / "incoming")),
|
||||
)
|
||||
STATIC_DATASET_ARCHIVE_DIR = os.getenv(
|
||||
"STATIC_DATASET_ARCHIVE_DIR",
|
||||
os.getenv("SNAPSHOT_ARCHIVE_DIR", str(BASE_DIR / "snapshots" / "archive")),
|
||||
)
|
||||
STATIC_DATASET_FAILED_DIR = os.getenv(
|
||||
"STATIC_DATASET_FAILED_DIR",
|
||||
os.getenv("SNAPSHOT_FAILED_DIR", str(BASE_DIR / "snapshots" / "failed")),
|
||||
)
|
||||
|
||||
# Optional scheduler command settings for future v2 snapshot jobs.
|
||||
SCHEDULER_ENABLED = env_bool("SCHEDULER_ENABLED", False)
|
||||
|
||||
@ -8,9 +8,9 @@ done
|
||||
|
||||
echo "PostgreSQL is available."
|
||||
|
||||
mkdir -p "${SNAPSHOT_INCOMING_DIR:-/app/snapshots/incoming}" \
|
||||
"${SNAPSHOT_ARCHIVE_DIR:-/app/snapshots/archive}" \
|
||||
"${SNAPSHOT_FAILED_DIR:-/app/snapshots/failed}"
|
||||
mkdir -p "${STATIC_DATASET_INCOMING_DIR:-${SNAPSHOT_INCOMING_DIR:-/app/snapshots/incoming}}" \
|
||||
"${STATIC_DATASET_ARCHIVE_DIR:-${SNAPSHOT_ARCHIVE_DIR:-/app/snapshots/archive}}" \
|
||||
"${STATIC_DATASET_FAILED_DIR:-${SNAPSHOT_FAILED_DIR:-/app/snapshots/failed}}"
|
||||
|
||||
if [ "${DJANGO_SETTINGS_MODULE:-}" = "config.settings.production" ] && [ "$1" = "gunicorn" ]; then
|
||||
echo "Running Django deployment checks..."
|
||||
|
||||
189
tests/test_import_snapshots_command.py
Normal file
189
tests/test_import_snapshots_command.py
Normal file
@ -0,0 +1,189 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from datetime import date
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from django.core.management import call_command
|
||||
|
||||
from apps.competitions.models import Competition, Season
|
||||
from apps.ingestion.models import ImportFile, ImportRun
|
||||
from apps.players.models import Player
|
||||
from apps.stats.models import PlayerSeason, PlayerSeasonStats
|
||||
from apps.teams.models import Team
|
||||
|
||||
|
||||
def _valid_payload() -> dict:
|
||||
return {
|
||||
"source_name": "official_site_feed",
|
||||
"snapshot_date": "2026-03-13",
|
||||
"records": [
|
||||
{
|
||||
"competition_external_id": "comp-nba",
|
||||
"competition_name": "NBA",
|
||||
"season": "2025-2026",
|
||||
"team_external_id": "team-lal",
|
||||
"team_name": "Los Angeles Lakers",
|
||||
"player_external_id": "player-23",
|
||||
"full_name": "LeBron James",
|
||||
"first_name": "LeBron",
|
||||
"last_name": "James",
|
||||
"birth_date": "1984-12-30",
|
||||
"nationality": "US",
|
||||
"height_cm": 206,
|
||||
"weight_kg": 113,
|
||||
"position": "SF",
|
||||
"role": "Primary Creator",
|
||||
"games_played": 60,
|
||||
"minutes_per_game": 34.5,
|
||||
"points_per_game": 25.4,
|
||||
"rebounds_per_game": 7.2,
|
||||
"assists_per_game": 8.1,
|
||||
"steals_per_game": 1.3,
|
||||
"blocks_per_game": 0.7,
|
||||
"turnovers_per_game": 3.2,
|
||||
"fg_pct": 51.1,
|
||||
"three_pt_pct": 38.4,
|
||||
"ft_pct": 79.8,
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def _write_json(path: Path, payload: dict) -> None:
|
||||
path.write_text(json.dumps(payload), encoding="utf-8")
|
||||
|
||||
|
||||
@pytest.mark.django_db
def test_valid_snapshot_import(tmp_path, settings):
    """Happy path: a valid file is imported, archived, and upserts all entities."""
    incoming = tmp_path / "incoming"
    archive = tmp_path / "archive"
    failed = tmp_path / "failed"
    incoming.mkdir()
    archive.mkdir()
    failed.mkdir()

    payload = _valid_payload()
    file_path = incoming / "nba-2026-03-13.json"
    _write_json(file_path, payload)

    # Point the importer at the per-test directories.
    settings.STATIC_DATASET_INCOMING_DIR = str(incoming)
    settings.STATIC_DATASET_ARCHIVE_DIR = str(archive)
    settings.STATIC_DATASET_FAILED_DIR = str(failed)

    call_command("import_snapshots")

    run = ImportRun.objects.get()
    assert run.status == ImportRun.RunStatus.SUCCESS
    assert run.files_processed == 1
    assert run.rows_upserted == 1

    # Per-file metadata extracted from the validated header.
    import_file = ImportFile.objects.get(import_run=run)
    assert import_file.status == ImportFile.FileStatus.SUCCESS
    assert import_file.source_name == "official_site_feed"
    assert import_file.snapshot_date == date(2026, 3, 13)

    # The file must be moved out of incoming into archive.
    assert (archive / "nba-2026-03-13.json").exists()
    assert not (incoming / "nba-2026-03-13.json").exists()

    # All relational entities were upserted from the single record.
    assert Competition.objects.filter(source_uid="comp-nba").exists()
    assert Team.objects.filter(source_uid="team-lal").exists()
    assert Player.objects.filter(source_uid="player-23").exists()
    assert Season.objects.filter(source_uid="season:2025-2026").exists()
    assert PlayerSeason.objects.count() == 1
    assert PlayerSeasonStats.objects.count() == 1
|
||||
|
||||
|
||||
@pytest.mark.django_db
def test_invalid_snapshot_rejected_and_moved_to_failed(tmp_path, settings):
    """A record missing a required field fails the run and lands in failed/."""
    incoming = tmp_path / "incoming"
    archive = tmp_path / "archive"
    failed = tmp_path / "failed"
    incoming.mkdir()
    archive.mkdir()
    failed.mkdir()

    # Drop a required numeric field to trigger strict-schema rejection.
    payload = _valid_payload()
    del payload["records"][0]["points_per_game"]
    file_path = incoming / "broken.json"
    _write_json(file_path, payload)

    settings.STATIC_DATASET_INCOMING_DIR = str(incoming)
    settings.STATIC_DATASET_ARCHIVE_DIR = str(archive)
    settings.STATIC_DATASET_FAILED_DIR = str(failed)

    call_command("import_snapshots")

    run = ImportRun.objects.get()
    assert run.status == ImportRun.RunStatus.FAILED

    import_file = ImportFile.objects.get(import_run=run)
    assert import_file.status == ImportFile.FileStatus.FAILED
    assert "missing required fields" in import_file.error_message

    # The broken file is quarantined, and nothing was upserted.
    assert (failed / "broken.json").exists()
    assert not (archive / "broken.json").exists()
    assert not Competition.objects.exists()
|
||||
|
||||
|
||||
@pytest.mark.django_db
def test_idempotent_reimport_uses_checksum_and_skips_duplicate(tmp_path, settings):
    """Identical content under a new filename is skipped via checksum matching."""
    incoming = tmp_path / "incoming"
    archive = tmp_path / "archive"
    failed = tmp_path / "failed"
    incoming.mkdir()
    archive.mkdir()
    failed.mkdir()

    payload = _valid_payload()
    _write_json(incoming / "first.json", payload)

    settings.STATIC_DATASET_INCOMING_DIR = str(incoming)
    settings.STATIC_DATASET_ARCHIVE_DIR = str(archive)
    settings.STATIC_DATASET_FAILED_DIR = str(failed)

    call_command("import_snapshots")
    assert Competition.objects.count() == 1
    assert Player.objects.count() == 1

    # Re-drop same content with different filename.
    _write_json(incoming / "first-duplicate.json", payload)
    call_command("import_snapshots")

    # No additional entities were created by the duplicate import.
    assert Competition.objects.count() == 1
    assert Player.objects.count() == 1
    assert PlayerSeason.objects.count() == 1

    duplicate_file = ImportFile.objects.filter(relative_path="first-duplicate.json").order_by("-id").first()
    assert duplicate_file is not None
    assert duplicate_file.status == ImportFile.FileStatus.SKIPPED
    assert duplicate_file.checksum
    assert "duplicate checksum" in duplicate_file.error_message.lower()

    # Skipped duplicates are still archived, not left in incoming.
    assert (archive / "first-duplicate.json").exists()
|
||||
|
||||
|
||||
@pytest.mark.django_db
def test_same_run_second_file_same_checksum_is_skipped(tmp_path, settings):
    """Within one run, the second file with identical content is skipped."""
    incoming = tmp_path / "incoming"
    archive = tmp_path / "archive"
    failed = tmp_path / "failed"
    incoming.mkdir()
    archive.mkdir()
    failed.mkdir()

    # Two files with identical content; sorted order processes a.json first.
    payload = _valid_payload()
    _write_json(incoming / "a.json", payload)
    _write_json(incoming / "b.json", payload)

    settings.STATIC_DATASET_INCOMING_DIR = str(incoming)
    settings.STATIC_DATASET_ARCHIVE_DIR = str(archive)
    settings.STATIC_DATASET_FAILED_DIR = str(failed)

    call_command("import_snapshots")

    files = {row.relative_path: row for row in ImportFile.objects.order_by("relative_path")}
    assert files["a.json"].status == ImportFile.FileStatus.SUCCESS
    assert files["b.json"].status == ImportFile.FileStatus.SKIPPED
    assert files["a.json"].checksum == files["b.json"].checksum
|
||||
Reference in New Issue
Block a user