Add v2 snapshot schema validation and import_snapshots command
This commit is contained in:
@ -31,10 +31,10 @@ POSTGRES_PORT=5432
|
|||||||
LOCAL_UID=1000
|
LOCAL_UID=1000
|
||||||
LOCAL_GID=1000
|
LOCAL_GID=1000
|
||||||
|
|
||||||
# Snapshot storage (volume-backed directories)
|
# Static dataset storage (volume-backed directories)
|
||||||
SNAPSHOT_INCOMING_DIR=/app/snapshots/incoming
|
STATIC_DATASET_INCOMING_DIR=/app/snapshots/incoming
|
||||||
SNAPSHOT_ARCHIVE_DIR=/app/snapshots/archive
|
STATIC_DATASET_ARCHIVE_DIR=/app/snapshots/archive
|
||||||
SNAPSHOT_FAILED_DIR=/app/snapshots/failed
|
STATIC_DATASET_FAILED_DIR=/app/snapshots/failed
|
||||||
|
|
||||||
# Future optional scheduler loop settings (not enabled in base v2 runtime)
|
# Future optional scheduler loop settings (not enabled in base v2 runtime)
|
||||||
SCHEDULER_ENABLED=0
|
SCHEDULER_ENABLED=0
|
||||||
|
|||||||
84
README.md
84
README.md
@ -8,10 +8,9 @@ Current v2 foundation scope in this branch:
|
|||||||
- nginx reverse proxy
|
- nginx reverse proxy
|
||||||
- management-command-driven runtime operations
|
- management-command-driven runtime operations
|
||||||
- static snapshot directories persisted via Docker named volumes
|
- static snapshot directories persisted via Docker named volumes
|
||||||
|
- strict JSON snapshot schema + import management command
|
||||||
|
|
||||||
Out of scope in this step:
|
Out of scope in this step:
|
||||||
- domain model redesign
|
|
||||||
- snapshot importer implementation
|
|
||||||
- extractor implementation
|
- extractor implementation
|
||||||
|
|
||||||
## Runtime Architecture (v2)
|
## Runtime Architecture (v2)
|
||||||
@ -81,7 +80,7 @@ Core groups:
|
|||||||
- Django runtime/security vars
|
- Django runtime/security vars
|
||||||
- PostgreSQL connection vars
|
- PostgreSQL connection vars
|
||||||
- image tag vars (`APP_IMAGE_TAG`, `NGINX_IMAGE_TAG`)
|
- image tag vars (`APP_IMAGE_TAG`, `NGINX_IMAGE_TAG`)
|
||||||
- snapshot directory vars (`SNAPSHOT_*`)
|
- snapshot directory vars (`STATIC_DATASET_*`)
|
||||||
- optional future scheduler vars (`SCHEDULER_*`)
|
- optional future scheduler vars (`SCHEDULER_*`)
|
||||||
|
|
||||||
## Snapshot Storage Convention
|
## Snapshot Storage Convention
|
||||||
@ -91,7 +90,84 @@ Snapshot files are expected under:
|
|||||||
- archive: `/app/snapshots/archive`
|
- archive: `/app/snapshots/archive`
|
||||||
- failed: `/app/snapshots/failed`
|
- failed: `/app/snapshots/failed`
|
||||||
|
|
||||||
In this foundation step, directories are created and persisted but no importer/extractor is implemented yet.
|
Configured via environment:
|
||||||
|
- `STATIC_DATASET_INCOMING_DIR`
|
||||||
|
- `STATIC_DATASET_ARCHIVE_DIR`
|
||||||
|
- `STATIC_DATASET_FAILED_DIR`
|
||||||
|
|
||||||
|
## Snapshot JSON Schema (MVP)
|
||||||
|
|
||||||
|
Each file must be a JSON object:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"source_name": "official_site_feed",
|
||||||
|
"snapshot_date": "2026-03-13",
|
||||||
|
"records": [
|
||||||
|
{
|
||||||
|
"competition_external_id": "comp-nba",
|
||||||
|
"competition_name": "NBA",
|
||||||
|
"season": "2025-2026",
|
||||||
|
"team_external_id": "team-lal",
|
||||||
|
"team_name": "Los Angeles Lakers",
|
||||||
|
"player_external_id": "player-23",
|
||||||
|
"full_name": "LeBron James",
|
||||||
|
"first_name": "LeBron",
|
||||||
|
"last_name": "James",
|
||||||
|
"birth_date": "1984-12-30",
|
||||||
|
"nationality": "US",
|
||||||
|
"height_cm": 206,
|
||||||
|
"weight_kg": 113,
|
||||||
|
"position": "SF",
|
||||||
|
"role": "Primary Creator",
|
||||||
|
"games_played": 60,
|
||||||
|
"minutes_per_game": 34.5,
|
||||||
|
"points_per_game": 25.4,
|
||||||
|
"rebounds_per_game": 7.2,
|
||||||
|
"assists_per_game": 8.1,
|
||||||
|
"steals_per_game": 1.3,
|
||||||
|
"blocks_per_game": 0.7,
|
||||||
|
"turnovers_per_game": 3.2,
|
||||||
|
"fg_pct": 51.1,
|
||||||
|
"three_pt_pct": 38.4,
|
||||||
|
"ft_pct": 79.8,
|
||||||
|
"source_metadata": {},
|
||||||
|
"raw_payload": {}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source_metadata": {},
|
||||||
|
"raw_payload": {}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Validation is strict:
|
||||||
|
- unknown fields are rejected
|
||||||
|
- required fields must exist
|
||||||
|
- `snapshot_date` and `birth_date` must be `YYYY-MM-DD`
|
||||||
|
- numeric fields must be numeric
|
||||||
|
- invalid files are moved to failed directory
|
||||||
|
|
||||||
|
## Import Command
|
||||||
|
|
||||||
|
Run import:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose exec web python manage.py import_snapshots
|
||||||
|
```
|
||||||
|
|
||||||
|
Command behavior:
|
||||||
|
- scans `STATIC_DATASET_INCOMING_DIR` for `.json` files
|
||||||
|
- validates strict schema
|
||||||
|
- computes SHA-256 checksum
|
||||||
|
- creates `ImportRun` + `ImportFile` records
|
||||||
|
- upserts relational entities (`Competition`, `Season`, `Team`, `Player`, `PlayerSeason`, `PlayerSeasonStats`)
|
||||||
|
- skips duplicate content using checksum
|
||||||
|
- moves valid files to archive
|
||||||
|
- moves invalid files to failed
|
||||||
|
|
||||||
|
Import history is visible in Django admin:
|
||||||
|
- `ImportRun`
|
||||||
|
- `ImportFile`
|
||||||
|
|
||||||
## Migration and Superuser Commands
|
## Migration and Superuser Commands
|
||||||
|
|
||||||
|
|||||||
@ -8,6 +8,8 @@ class ImportFileInline(admin.TabularInline):
|
|||||||
extra = 0
|
extra = 0
|
||||||
readonly_fields = (
|
readonly_fields = (
|
||||||
"relative_path",
|
"relative_path",
|
||||||
|
"source_name",
|
||||||
|
"snapshot_date",
|
||||||
"status",
|
"status",
|
||||||
"checksum",
|
"checksum",
|
||||||
"file_size_bytes",
|
"file_size_bytes",
|
||||||
@ -61,6 +63,8 @@ class ImportFileAdmin(admin.ModelAdmin):
|
|||||||
"id",
|
"id",
|
||||||
"import_run",
|
"import_run",
|
||||||
"relative_path",
|
"relative_path",
|
||||||
|
"source_name",
|
||||||
|
"snapshot_date",
|
||||||
"status",
|
"status",
|
||||||
"rows_total",
|
"rows_total",
|
||||||
"rows_upserted",
|
"rows_upserted",
|
||||||
@ -68,10 +72,12 @@ class ImportFileAdmin(admin.ModelAdmin):
|
|||||||
"processed_at",
|
"processed_at",
|
||||||
)
|
)
|
||||||
list_filter = ("status",)
|
list_filter = ("status",)
|
||||||
search_fields = ("relative_path", "checksum", "error_message")
|
search_fields = ("relative_path", "source_name", "checksum", "error_message")
|
||||||
readonly_fields = (
|
readonly_fields = (
|
||||||
"import_run",
|
"import_run",
|
||||||
"relative_path",
|
"relative_path",
|
||||||
|
"source_name",
|
||||||
|
"snapshot_date",
|
||||||
"status",
|
"status",
|
||||||
"checksum",
|
"checksum",
|
||||||
"file_size_bytes",
|
"file_size_bytes",
|
||||||
|
|||||||
0
apps/ingestion/management/__init__.py
Normal file
0
apps/ingestion/management/__init__.py
Normal file
0
apps/ingestion/management/commands/__init__.py
Normal file
0
apps/ingestion/management/commands/__init__.py
Normal file
23
apps/ingestion/management/commands/import_snapshots.py
Normal file
23
apps/ingestion/management/commands/import_snapshots.py
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
from django.conf import settings
|
||||||
|
from django.core.management.base import BaseCommand
|
||||||
|
|
||||||
|
from apps.ingestion.services.snapshot_import import SnapshotImporter
|
||||||
|
|
||||||
|
|
||||||
|
class Command(BaseCommand):
|
||||||
|
help = "Import static JSON snapshots from incoming directory into PostgreSQL."
|
||||||
|
|
||||||
|
def handle(self, *args, **options):
|
||||||
|
importer = SnapshotImporter(
|
||||||
|
incoming_dir=settings.STATIC_DATASET_INCOMING_DIR,
|
||||||
|
archive_dir=settings.STATIC_DATASET_ARCHIVE_DIR,
|
||||||
|
failed_dir=settings.STATIC_DATASET_FAILED_DIR,
|
||||||
|
)
|
||||||
|
run = importer.run()
|
||||||
|
self.stdout.write(
|
||||||
|
self.style.SUCCESS(
|
||||||
|
f"Import run {run.id} completed: status={run.status} "
|
||||||
|
f"files={run.files_processed}/{run.files_total} "
|
||||||
|
f"rows_upserted={run.rows_upserted} rows_failed={run.rows_failed}"
|
||||||
|
)
|
||||||
|
)
|
||||||
@ -0,0 +1,27 @@
|
|||||||
|
# Generated by Django 5.2.12 on 2026-03-13 12:59
|
||||||
|
|
||||||
|
from django.db import migrations, models
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
|
||||||
|
|
||||||
|
dependencies = [
|
||||||
|
('ingestion', '0003_importrun_importfile_and_more'),
|
||||||
|
]
|
||||||
|
|
||||||
|
operations = [
|
||||||
|
migrations.AddField(
|
||||||
|
model_name='importfile',
|
||||||
|
name='snapshot_date',
|
||||||
|
field=models.DateField(blank=True, null=True),
|
||||||
|
),
|
||||||
|
migrations.AddField(
|
||||||
|
model_name='importfile',
|
||||||
|
name='source_name',
|
||||||
|
field=models.CharField(blank=True, max_length=120),
|
||||||
|
),
|
||||||
|
migrations.AddIndex(
|
||||||
|
model_name='importfile',
|
||||||
|
index=models.Index(fields=['source_name', 'snapshot_date'], name='ingestion_i_source__de6843_idx'),
|
||||||
|
),
|
||||||
|
]
|
||||||
@ -57,6 +57,8 @@ class ImportFile(models.Model):
|
|||||||
related_name="files",
|
related_name="files",
|
||||||
)
|
)
|
||||||
relative_path = models.CharField(max_length=260)
|
relative_path = models.CharField(max_length=260)
|
||||||
|
source_name = models.CharField(max_length=120, blank=True)
|
||||||
|
snapshot_date = models.DateField(blank=True, null=True)
|
||||||
status = models.CharField(max_length=24, choices=FileStatus.choices, default=FileStatus.PENDING)
|
status = models.CharField(max_length=24, choices=FileStatus.choices, default=FileStatus.PENDING)
|
||||||
checksum = models.CharField(max_length=128, blank=True)
|
checksum = models.CharField(max_length=128, blank=True)
|
||||||
file_size_bytes = models.PositiveBigIntegerField(blank=True, null=True)
|
file_size_bytes = models.PositiveBigIntegerField(blank=True, null=True)
|
||||||
@ -79,6 +81,7 @@ class ImportFile(models.Model):
|
|||||||
indexes = [
|
indexes = [
|
||||||
models.Index(fields=["import_run", "status"]),
|
models.Index(fields=["import_run", "status"]),
|
||||||
models.Index(fields=["relative_path"]),
|
models.Index(fields=["relative_path"]),
|
||||||
|
models.Index(fields=["source_name", "snapshot_date"]),
|
||||||
models.Index(fields=["processed_at"]),
|
models.Index(fields=["processed_at"]),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
310
apps/ingestion/services/snapshot_import.py
Normal file
310
apps/ingestion/services/snapshot_import.py
Normal file
@ -0,0 +1,310 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
import shutil
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from datetime import date, datetime
|
||||||
|
from decimal import Decimal
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from django.db import transaction
|
||||||
|
from django.template.defaultfilters import slugify
|
||||||
|
from django.utils import timezone
|
||||||
|
from django.utils.dateparse import parse_date
|
||||||
|
|
||||||
|
from apps.competitions.models import Competition, Season
|
||||||
|
from apps.ingestion.models import ImportFile, ImportRun
|
||||||
|
from apps.ingestion.snapshots import SnapshotSchemaValidator, SnapshotValidationError
|
||||||
|
from apps.players.models import Nationality, Player, Position, Role
|
||||||
|
from apps.stats.models import PlayerSeason, PlayerSeasonStats
|
||||||
|
from apps.teams.models import Team
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class ImportSummary:
|
||||||
|
files_total: int = 0
|
||||||
|
files_processed: int = 0
|
||||||
|
rows_total: int = 0
|
||||||
|
rows_upserted: int = 0
|
||||||
|
rows_failed: int = 0
|
||||||
|
|
||||||
|
|
||||||
|
def _safe_move(src: Path, destination_dir: Path) -> Path:
|
||||||
|
destination_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
candidate = destination_dir / src.name
|
||||||
|
if candidate.exists():
|
||||||
|
ts = datetime.utcnow().strftime("%Y%m%d%H%M%S")
|
||||||
|
candidate = destination_dir / f"{src.stem}-{ts}{src.suffix}"
|
||||||
|
shutil.move(str(src), str(candidate))
|
||||||
|
return candidate
|
||||||
|
|
||||||
|
|
||||||
|
def _file_checksum(path: Path) -> str:
|
||||||
|
digest = hashlib.sha256()
|
||||||
|
with path.open("rb") as handle:
|
||||||
|
for chunk in iter(lambda: handle.read(1024 * 1024), b""):
|
||||||
|
digest.update(chunk)
|
||||||
|
return digest.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_decimal(value: float | int | str) -> Decimal:
|
||||||
|
return Decimal(str(value)).quantize(Decimal("0.01"))
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_season_dates(label: str) -> tuple[date, date]:
|
||||||
|
if "-" in label:
|
||||||
|
first = label.split("-", 1)[0]
|
||||||
|
else:
|
||||||
|
first = label
|
||||||
|
year = int(first)
|
||||||
|
return date(year, 9, 1), date(year + 1, 7, 31)
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_nationality(value: str | None) -> Nationality | None:
|
||||||
|
if not value:
|
||||||
|
return None
|
||||||
|
token = value.strip()
|
||||||
|
if not token:
|
||||||
|
return None
|
||||||
|
if len(token) == 2:
|
||||||
|
code = token.upper()
|
||||||
|
obj, _ = Nationality.objects.get_or_create(
|
||||||
|
iso2_code=code,
|
||||||
|
defaults={"name": code},
|
||||||
|
)
|
||||||
|
return obj
|
||||||
|
return Nationality.objects.filter(name__iexact=token).first()
|
||||||
|
|
||||||
|
|
||||||
|
def _position_code(position_value: str) -> str:
|
||||||
|
token = position_value.strip().upper().replace(" ", "_")
|
||||||
|
return (token[:10] or "UNK")
|
||||||
|
|
||||||
|
|
||||||
|
def _role_code(role_value: str) -> str:
|
||||||
|
token = slugify(role_value).replace("-", "_")
|
||||||
|
return (token[:32] or "unknown")
|
||||||
|
|
||||||
|
|
||||||
|
def _player_season_source_uid(record: dict[str, Any], source_name: str, snapshot_date: date) -> str:
|
||||||
|
return (
|
||||||
|
f"{source_name}:{snapshot_date.isoformat()}:"
|
||||||
|
f"{record['competition_external_id']}:{record['season']}:"
|
||||||
|
f"{record['team_external_id']}:{record['player_external_id']}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _upsert_record(record: dict[str, Any], *, source_name: str, snapshot_date: date) -> None:
|
||||||
|
competition_slug = slugify(record["competition_name"]) or f"competition-{record['competition_external_id']}"
|
||||||
|
competition, _ = Competition.objects.update_or_create(
|
||||||
|
source_uid=record["competition_external_id"],
|
||||||
|
defaults={
|
||||||
|
"name": record["competition_name"],
|
||||||
|
"slug": competition_slug,
|
||||||
|
"competition_type": Competition.CompetitionType.LEAGUE,
|
||||||
|
"is_active": True,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
start_date, end_date = _parse_season_dates(record["season"])
|
||||||
|
season, _ = Season.objects.update_or_create(
|
||||||
|
source_uid=f"season:{record['season']}",
|
||||||
|
defaults={
|
||||||
|
"label": record["season"],
|
||||||
|
"start_date": start_date,
|
||||||
|
"end_date": end_date,
|
||||||
|
"is_current": False,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
team_slug = slugify(record["team_name"]) or f"team-{record['team_external_id']}"
|
||||||
|
team, _ = Team.objects.update_or_create(
|
||||||
|
source_uid=record["team_external_id"],
|
||||||
|
defaults={
|
||||||
|
"name": record["team_name"],
|
||||||
|
"slug": team_slug,
|
||||||
|
"short_name": "",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
position, _ = Position.objects.get_or_create(
|
||||||
|
code=_position_code(record["position"]),
|
||||||
|
defaults={"name": record["position"]},
|
||||||
|
)
|
||||||
|
role = None
|
||||||
|
if record.get("role"):
|
||||||
|
role, _ = Role.objects.get_or_create(
|
||||||
|
code=_role_code(record["role"]),
|
||||||
|
defaults={"name": record["role"]},
|
||||||
|
)
|
||||||
|
|
||||||
|
player, _ = Player.objects.update_or_create(
|
||||||
|
source_uid=record["player_external_id"],
|
||||||
|
defaults={
|
||||||
|
"first_name": record["first_name"],
|
||||||
|
"last_name": record["last_name"],
|
||||||
|
"full_name": record["full_name"],
|
||||||
|
"birth_date": parse_date(record["birth_date"]),
|
||||||
|
"nationality": _resolve_nationality(record.get("nationality")),
|
||||||
|
"nominal_position": position,
|
||||||
|
"inferred_role": role,
|
||||||
|
"height_cm": record["height_cm"],
|
||||||
|
"weight_kg": record["weight_kg"],
|
||||||
|
"is_active": True,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
player_season, _ = PlayerSeason.objects.update_or_create(
|
||||||
|
source_uid=_player_season_source_uid(record, source_name=source_name, snapshot_date=snapshot_date),
|
||||||
|
defaults={
|
||||||
|
"player": player,
|
||||||
|
"season": season,
|
||||||
|
"team": team,
|
||||||
|
"competition": competition,
|
||||||
|
"games_played": int(record["games_played"]),
|
||||||
|
"games_started": 0,
|
||||||
|
"minutes_played": int(round(float(record["minutes_per_game"]) * int(record["games_played"]))),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
PlayerSeasonStats.objects.update_or_create(
|
||||||
|
player_season=player_season,
|
||||||
|
defaults={
|
||||||
|
"points": _normalize_decimal(record["points_per_game"]),
|
||||||
|
"rebounds": _normalize_decimal(record["rebounds_per_game"]),
|
||||||
|
"assists": _normalize_decimal(record["assists_per_game"]),
|
||||||
|
"steals": _normalize_decimal(record["steals_per_game"]),
|
||||||
|
"blocks": _normalize_decimal(record["blocks_per_game"]),
|
||||||
|
"turnovers": _normalize_decimal(record["turnovers_per_game"]),
|
||||||
|
"fg_pct": _normalize_decimal(record["fg_pct"]),
|
||||||
|
"three_pct": _normalize_decimal(record["three_pt_pct"]),
|
||||||
|
"ft_pct": _normalize_decimal(record["ft_pct"]),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class SnapshotImporter:
|
||||||
|
def __init__(self, *, incoming_dir: str, archive_dir: str, failed_dir: str):
|
||||||
|
self.incoming_dir = Path(incoming_dir)
|
||||||
|
self.archive_dir = Path(archive_dir)
|
||||||
|
self.failed_dir = Path(failed_dir)
|
||||||
|
|
||||||
|
def _list_input_files(self) -> list[Path]:
|
||||||
|
if not self.incoming_dir.exists():
|
||||||
|
return []
|
||||||
|
return sorted(path for path in self.incoming_dir.iterdir() if path.is_file() and path.suffix.lower() == ".json")
|
||||||
|
|
||||||
|
def run(self, *, triggered_by=None) -> ImportRun:
|
||||||
|
run = ImportRun.objects.create(
|
||||||
|
source="static_snapshot_json",
|
||||||
|
status=ImportRun.RunStatus.RUNNING,
|
||||||
|
triggered_by=triggered_by,
|
||||||
|
started_at=timezone.now(),
|
||||||
|
context={
|
||||||
|
"incoming_dir": str(self.incoming_dir),
|
||||||
|
"archive_dir": str(self.archive_dir),
|
||||||
|
"failed_dir": str(self.failed_dir),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
summary = ImportSummary()
|
||||||
|
files = self._list_input_files()
|
||||||
|
summary.files_total = len(files)
|
||||||
|
|
||||||
|
for path in files:
|
||||||
|
checksum = _file_checksum(path)
|
||||||
|
file_row = ImportFile.objects.create(
|
||||||
|
import_run=run,
|
||||||
|
relative_path=path.name,
|
||||||
|
status=ImportFile.FileStatus.PROCESSING,
|
||||||
|
checksum=checksum,
|
||||||
|
file_size_bytes=path.stat().st_size,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Duplicate file content previously imported successfully.
|
||||||
|
already_imported = ImportFile.objects.filter(
|
||||||
|
checksum=checksum,
|
||||||
|
status=ImportFile.FileStatus.SUCCESS,
|
||||||
|
).exclude(pk=file_row.pk).exists()
|
||||||
|
if already_imported:
|
||||||
|
file_row.status = ImportFile.FileStatus.SKIPPED
|
||||||
|
file_row.error_message = "Skipped duplicate checksum already imported successfully."
|
||||||
|
file_row.processed_at = timezone.now()
|
||||||
|
file_row.save(update_fields=["status", "error_message", "processed_at"])
|
||||||
|
_safe_move(path, self.archive_dir)
|
||||||
|
summary.files_processed += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
try:
|
||||||
|
payload = json.loads(path.read_text(encoding="utf-8"))
|
||||||
|
validated = SnapshotSchemaValidator.validate(payload)
|
||||||
|
|
||||||
|
file_row.source_name = validated.source_name
|
||||||
|
file_row.snapshot_date = validated.snapshot_date
|
||||||
|
file_row.rows_total = len(validated.records)
|
||||||
|
|
||||||
|
with transaction.atomic():
|
||||||
|
for record in validated.records:
|
||||||
|
_upsert_record(record, source_name=validated.source_name, snapshot_date=validated.snapshot_date)
|
||||||
|
|
||||||
|
file_row.status = ImportFile.FileStatus.SUCCESS
|
||||||
|
file_row.rows_upserted = len(validated.records)
|
||||||
|
file_row.payload_preview = {
|
||||||
|
"source_name": validated.source_name,
|
||||||
|
"snapshot_date": validated.snapshot_date.isoformat(),
|
||||||
|
"sample_record": validated.records[0],
|
||||||
|
}
|
||||||
|
_safe_move(path, self.archive_dir)
|
||||||
|
except (json.JSONDecodeError, SnapshotValidationError, ValueError) as exc:
|
||||||
|
file_row.status = ImportFile.FileStatus.FAILED
|
||||||
|
file_row.error_message = str(exc)
|
||||||
|
_safe_move(path, self.failed_dir)
|
||||||
|
except Exception as exc: # noqa: BLE001
|
||||||
|
file_row.status = ImportFile.FileStatus.FAILED
|
||||||
|
file_row.error_message = f"Unhandled import error: {exc}"
|
||||||
|
_safe_move(path, self.failed_dir)
|
||||||
|
|
||||||
|
file_row.processed_at = timezone.now()
|
||||||
|
file_row.save(
|
||||||
|
update_fields=[
|
||||||
|
"source_name",
|
||||||
|
"snapshot_date",
|
||||||
|
"status",
|
||||||
|
"rows_total",
|
||||||
|
"rows_upserted",
|
||||||
|
"rows_failed",
|
||||||
|
"error_message",
|
||||||
|
"payload_preview",
|
||||||
|
"processed_at",
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
summary.files_processed += 1
|
||||||
|
summary.rows_total += file_row.rows_total
|
||||||
|
summary.rows_upserted += file_row.rows_upserted
|
||||||
|
summary.rows_failed += file_row.rows_failed + (1 if file_row.status == ImportFile.FileStatus.FAILED else 0)
|
||||||
|
|
||||||
|
run.status = ImportRun.RunStatus.SUCCESS if summary.rows_failed == 0 else ImportRun.RunStatus.FAILED
|
||||||
|
run.files_total = summary.files_total
|
||||||
|
run.files_processed = summary.files_processed
|
||||||
|
run.rows_total = summary.rows_total
|
||||||
|
run.rows_upserted = summary.rows_upserted
|
||||||
|
run.rows_failed = summary.rows_failed
|
||||||
|
run.finished_at = timezone.now()
|
||||||
|
if summary.rows_failed:
|
||||||
|
run.error_summary = f"{summary.rows_failed} file/row import error(s)."
|
||||||
|
run.save(
|
||||||
|
update_fields=[
|
||||||
|
"status",
|
||||||
|
"files_total",
|
||||||
|
"files_processed",
|
||||||
|
"rows_total",
|
||||||
|
"rows_upserted",
|
||||||
|
"rows_failed",
|
||||||
|
"error_summary",
|
||||||
|
"finished_at",
|
||||||
|
]
|
||||||
|
)
|
||||||
|
return run
|
||||||
3
apps/ingestion/snapshots/__init__.py
Normal file
3
apps/ingestion/snapshots/__init__.py
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
from .schema import SnapshotSchemaValidator, SnapshotValidationError, SnapshotValidationResult
|
||||||
|
|
||||||
|
__all__ = ["SnapshotSchemaValidator", "SnapshotValidationError", "SnapshotValidationResult"]
|
||||||
182
apps/ingestion/snapshots/schema.py
Normal file
182
apps/ingestion/snapshots/schema.py
Normal file
@ -0,0 +1,182 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from datetime import date
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from django.utils.dateparse import parse_date
|
||||||
|
|
||||||
|
REQUIRED_RECORD_FIELDS = {
|
||||||
|
"competition_external_id",
|
||||||
|
"competition_name",
|
||||||
|
"season",
|
||||||
|
"team_external_id",
|
||||||
|
"team_name",
|
||||||
|
"player_external_id",
|
||||||
|
"full_name",
|
||||||
|
"first_name",
|
||||||
|
"last_name",
|
||||||
|
"birth_date",
|
||||||
|
"nationality",
|
||||||
|
"height_cm",
|
||||||
|
"weight_kg",
|
||||||
|
"position",
|
||||||
|
"games_played",
|
||||||
|
"minutes_per_game",
|
||||||
|
"points_per_game",
|
||||||
|
"rebounds_per_game",
|
||||||
|
"assists_per_game",
|
||||||
|
"steals_per_game",
|
||||||
|
"blocks_per_game",
|
||||||
|
"turnovers_per_game",
|
||||||
|
"fg_pct",
|
||||||
|
"three_pt_pct",
|
||||||
|
"ft_pct",
|
||||||
|
}
|
||||||
|
|
||||||
|
ALLOWED_TOP_LEVEL_FIELDS = {
|
||||||
|
"source_name",
|
||||||
|
"snapshot_date",
|
||||||
|
"records",
|
||||||
|
"source_metadata",
|
||||||
|
"raw_payload",
|
||||||
|
}
|
||||||
|
|
||||||
|
ALLOWED_RECORD_FIELDS = REQUIRED_RECORD_FIELDS | {
|
||||||
|
"role",
|
||||||
|
"source_metadata",
|
||||||
|
"raw_payload",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class SnapshotValidationResult:
|
||||||
|
source_name: str
|
||||||
|
snapshot_date: date
|
||||||
|
records: list[dict[str, Any]]
|
||||||
|
|
||||||
|
|
||||||
|
class SnapshotValidationError(ValueError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class SnapshotSchemaValidator:
|
||||||
|
"""Strict JSON schema validator for HoopScout v2 player-season snapshots."""
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _require_string(value: Any, field: str) -> str:
|
||||||
|
if not isinstance(value, str) or not value.strip():
|
||||||
|
raise SnapshotValidationError(f"{field} must be a non-empty string")
|
||||||
|
return value.strip()
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _require_non_negative_int(value: Any, field: str) -> int:
|
||||||
|
if isinstance(value, bool):
|
||||||
|
raise SnapshotValidationError(f"{field} must be a non-negative integer")
|
||||||
|
try:
|
||||||
|
parsed = int(value)
|
||||||
|
except (TypeError, ValueError) as exc:
|
||||||
|
raise SnapshotValidationError(f"{field} must be a non-negative integer") from exc
|
||||||
|
if parsed < 0:
|
||||||
|
raise SnapshotValidationError(f"{field} must be a non-negative integer")
|
||||||
|
return parsed
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _require_float(value: Any, field: str) -> float:
|
||||||
|
try:
|
||||||
|
parsed = float(value)
|
||||||
|
except (TypeError, ValueError) as exc:
|
||||||
|
raise SnapshotValidationError(f"{field} must be numeric") from exc
|
||||||
|
return parsed
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def _validate_record(cls, record: dict[str, Any], index: int) -> dict[str, Any]:
|
||||||
|
unknown = set(record.keys()) - ALLOWED_RECORD_FIELDS
|
||||||
|
if unknown:
|
||||||
|
raise SnapshotValidationError(
|
||||||
|
f"record[{index}] contains unknown fields: {', '.join(sorted(unknown))}"
|
||||||
|
)
|
||||||
|
|
||||||
|
missing = REQUIRED_RECORD_FIELDS - set(record.keys())
|
||||||
|
if missing:
|
||||||
|
raise SnapshotValidationError(
|
||||||
|
f"record[{index}] missing required fields: {', '.join(sorted(missing))}"
|
||||||
|
)
|
||||||
|
|
||||||
|
normalized = dict(record)
|
||||||
|
for field in (
|
||||||
|
"competition_external_id",
|
||||||
|
"competition_name",
|
||||||
|
"season",
|
||||||
|
"team_external_id",
|
||||||
|
"team_name",
|
||||||
|
"player_external_id",
|
||||||
|
"full_name",
|
||||||
|
"first_name",
|
||||||
|
"last_name",
|
||||||
|
"nationality",
|
||||||
|
"position",
|
||||||
|
):
|
||||||
|
normalized[field] = cls._require_string(record.get(field), f"record[{index}].{field}")
|
||||||
|
|
||||||
|
if record.get("role") is not None:
|
||||||
|
normalized["role"] = cls._require_string(record.get("role"), f"record[{index}].role")
|
||||||
|
|
||||||
|
birth_date = parse_date(str(record.get("birth_date")))
|
||||||
|
if not birth_date:
|
||||||
|
raise SnapshotValidationError(f"record[{index}].birth_date must be YYYY-MM-DD")
|
||||||
|
normalized["birth_date"] = birth_date.isoformat()
|
||||||
|
|
||||||
|
normalized["height_cm"] = cls._require_non_negative_int(record.get("height_cm"), f"record[{index}].height_cm")
|
||||||
|
normalized["weight_kg"] = cls._require_non_negative_int(record.get("weight_kg"), f"record[{index}].weight_kg")
|
||||||
|
normalized["games_played"] = cls._require_non_negative_int(record.get("games_played"), f"record[{index}].games_played")
|
||||||
|
|
||||||
|
for field in (
|
||||||
|
"minutes_per_game",
|
||||||
|
"points_per_game",
|
||||||
|
"rebounds_per_game",
|
||||||
|
"assists_per_game",
|
||||||
|
"steals_per_game",
|
||||||
|
"blocks_per_game",
|
||||||
|
"turnovers_per_game",
|
||||||
|
"fg_pct",
|
||||||
|
"three_pt_pct",
|
||||||
|
"ft_pct",
|
||||||
|
):
|
||||||
|
normalized[field] = cls._require_float(record.get(field), f"record[{index}].{field}")
|
||||||
|
|
||||||
|
return normalized
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def validate(cls, payload: dict[str, Any]) -> SnapshotValidationResult:
|
||||||
|
if not isinstance(payload, dict):
|
||||||
|
raise SnapshotValidationError("Snapshot root must be an object")
|
||||||
|
|
||||||
|
unknown = set(payload.keys()) - ALLOWED_TOP_LEVEL_FIELDS
|
||||||
|
if unknown:
|
||||||
|
raise SnapshotValidationError(
|
||||||
|
f"Snapshot contains unknown top-level fields: {', '.join(sorted(unknown))}"
|
||||||
|
)
|
||||||
|
|
||||||
|
source_name = cls._require_string(payload.get("source_name"), "source_name")
|
||||||
|
|
||||||
|
snapshot_date_raw = payload.get("snapshot_date")
|
||||||
|
snapshot_date = parse_date(str(snapshot_date_raw))
|
||||||
|
if not snapshot_date:
|
||||||
|
raise SnapshotValidationError("snapshot_date must be YYYY-MM-DD")
|
||||||
|
|
||||||
|
records = payload.get("records")
|
||||||
|
if not isinstance(records, list) or not records:
|
||||||
|
raise SnapshotValidationError("records must be a non-empty array")
|
||||||
|
|
||||||
|
normalized_records: list[dict[str, Any]] = []
|
||||||
|
for index, record in enumerate(records):
|
||||||
|
if not isinstance(record, dict):
|
||||||
|
raise SnapshotValidationError(f"record[{index}] must be an object")
|
||||||
|
normalized_records.append(cls._validate_record(record, index=index))
|
||||||
|
|
||||||
|
return SnapshotValidationResult(
|
||||||
|
source_name=source_name,
|
||||||
|
snapshot_date=snapshot_date,
|
||||||
|
records=normalized_records,
|
||||||
|
)
|
||||||
@ -142,10 +142,19 @@ LOGIN_URL = "users:login"
|
|||||||
LOGIN_REDIRECT_URL = "core:dashboard"
|
LOGIN_REDIRECT_URL = "core:dashboard"
|
||||||
LOGOUT_REDIRECT_URL = "core:home"
|
LOGOUT_REDIRECT_URL = "core:home"
|
||||||
|
|
||||||
# HoopScout v2 snapshot storage (volume-backed directories).
|
# HoopScout v2 static dataset storage (volume-backed directories).
|
||||||
SNAPSHOT_INCOMING_DIR = os.getenv("SNAPSHOT_INCOMING_DIR", str(BASE_DIR / "snapshots" / "incoming"))
|
STATIC_DATASET_INCOMING_DIR = os.getenv(
|
||||||
SNAPSHOT_ARCHIVE_DIR = os.getenv("SNAPSHOT_ARCHIVE_DIR", str(BASE_DIR / "snapshots" / "archive"))
|
"STATIC_DATASET_INCOMING_DIR",
|
||||||
SNAPSHOT_FAILED_DIR = os.getenv("SNAPSHOT_FAILED_DIR", str(BASE_DIR / "snapshots" / "failed"))
|
os.getenv("SNAPSHOT_INCOMING_DIR", str(BASE_DIR / "snapshots" / "incoming")),
|
||||||
|
)
|
||||||
|
STATIC_DATASET_ARCHIVE_DIR = os.getenv(
|
||||||
|
"STATIC_DATASET_ARCHIVE_DIR",
|
||||||
|
os.getenv("SNAPSHOT_ARCHIVE_DIR", str(BASE_DIR / "snapshots" / "archive")),
|
||||||
|
)
|
||||||
|
STATIC_DATASET_FAILED_DIR = os.getenv(
|
||||||
|
"STATIC_DATASET_FAILED_DIR",
|
||||||
|
os.getenv("SNAPSHOT_FAILED_DIR", str(BASE_DIR / "snapshots" / "failed")),
|
||||||
|
)
|
||||||
|
|
||||||
# Optional scheduler command settings for future v2 snapshot jobs.
|
# Optional scheduler command settings for future v2 snapshot jobs.
|
||||||
SCHEDULER_ENABLED = env_bool("SCHEDULER_ENABLED", False)
|
SCHEDULER_ENABLED = env_bool("SCHEDULER_ENABLED", False)
|
||||||
|
|||||||
@ -8,9 +8,9 @@ done
|
|||||||
|
|
||||||
echo "PostgreSQL is available."

# Ensure the volume-backed dataset directories exist before the app starts.
# New STATIC_DATASET_* variables take precedence; legacy SNAPSHOT_* variables
# remain honored as fallbacks, then the baked-in /app/snapshots defaults.
mkdir -p "${STATIC_DATASET_INCOMING_DIR:-${SNAPSHOT_INCOMING_DIR:-/app/snapshots/incoming}}" \
    "${STATIC_DATASET_ARCHIVE_DIR:-${SNAPSHOT_ARCHIVE_DIR:-/app/snapshots/archive}}" \
    "${STATIC_DATASET_FAILED_DIR:-${SNAPSHOT_FAILED_DIR:-/app/snapshots/failed}}"
|
||||||
|
|
||||||
if [ "${DJANGO_SETTINGS_MODULE:-}" = "config.settings.production" ] && [ "$1" = "gunicorn" ]; then
|
if [ "${DJANGO_SETTINGS_MODULE:-}" = "config.settings.production" ] && [ "$1" = "gunicorn" ]; then
|
||||||
echo "Running Django deployment checks..."
|
echo "Running Django deployment checks..."
|
||||||
|
|||||||
189
tests/test_import_snapshots_command.py
Normal file
189
tests/test_import_snapshots_command.py
Normal file
@ -0,0 +1,189 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
from datetime import date
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from django.core.management import call_command
|
||||||
|
|
||||||
|
from apps.competitions.models import Competition, Season
|
||||||
|
from apps.ingestion.models import ImportFile, ImportRun
|
||||||
|
from apps.players.models import Player
|
||||||
|
from apps.stats.models import PlayerSeason, PlayerSeasonStats
|
||||||
|
from apps.teams.models import Team
|
||||||
|
|
||||||
|
|
||||||
|
def _valid_payload() -> dict:
|
||||||
|
return {
|
||||||
|
"source_name": "official_site_feed",
|
||||||
|
"snapshot_date": "2026-03-13",
|
||||||
|
"records": [
|
||||||
|
{
|
||||||
|
"competition_external_id": "comp-nba",
|
||||||
|
"competition_name": "NBA",
|
||||||
|
"season": "2025-2026",
|
||||||
|
"team_external_id": "team-lal",
|
||||||
|
"team_name": "Los Angeles Lakers",
|
||||||
|
"player_external_id": "player-23",
|
||||||
|
"full_name": "LeBron James",
|
||||||
|
"first_name": "LeBron",
|
||||||
|
"last_name": "James",
|
||||||
|
"birth_date": "1984-12-30",
|
||||||
|
"nationality": "US",
|
||||||
|
"height_cm": 206,
|
||||||
|
"weight_kg": 113,
|
||||||
|
"position": "SF",
|
||||||
|
"role": "Primary Creator",
|
||||||
|
"games_played": 60,
|
||||||
|
"minutes_per_game": 34.5,
|
||||||
|
"points_per_game": 25.4,
|
||||||
|
"rebounds_per_game": 7.2,
|
||||||
|
"assists_per_game": 8.1,
|
||||||
|
"steals_per_game": 1.3,
|
||||||
|
"blocks_per_game": 0.7,
|
||||||
|
"turnovers_per_game": 3.2,
|
||||||
|
"fg_pct": 51.1,
|
||||||
|
"three_pt_pct": 38.4,
|
||||||
|
"ft_pct": 79.8,
|
||||||
|
}
|
||||||
|
],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _write_json(path: Path, payload: dict) -> None:
|
||||||
|
path.write_text(json.dumps(payload), encoding="utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.django_db
def test_valid_snapshot_import(tmp_path, settings):
    """A well-formed snapshot file is imported, archived, and fully persisted."""
    dirs = {name: tmp_path / name for name in ("incoming", "archive", "failed")}
    for directory in dirs.values():
        directory.mkdir()

    _write_json(dirs["incoming"] / "nba-2026-03-13.json", _valid_payload())

    settings.STATIC_DATASET_INCOMING_DIR = str(dirs["incoming"])
    settings.STATIC_DATASET_ARCHIVE_DIR = str(dirs["archive"])
    settings.STATIC_DATASET_FAILED_DIR = str(dirs["failed"])

    call_command("import_snapshots")

    run = ImportRun.objects.get()
    assert run.status == ImportRun.RunStatus.SUCCESS
    assert run.files_processed == 1
    assert run.rows_upserted == 1

    import_file = ImportFile.objects.get(import_run=run)
    assert import_file.status == ImportFile.FileStatus.SUCCESS
    assert import_file.source_name == "official_site_feed"
    assert import_file.snapshot_date == date(2026, 3, 13)

    # On success the file is moved out of incoming/ into the archive.
    assert (dirs["archive"] / "nba-2026-03-13.json").exists()
    assert not (dirs["incoming"] / "nba-2026-03-13.json").exists()

    # Every domain entity from the single record was upserted.
    assert Competition.objects.filter(source_uid="comp-nba").exists()
    assert Team.objects.filter(source_uid="team-lal").exists()
    assert Player.objects.filter(source_uid="player-23").exists()
    assert Season.objects.filter(source_uid="season:2025-2026").exists()
    assert PlayerSeason.objects.count() == 1
    assert PlayerSeasonStats.objects.count() == 1
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.django_db
def test_invalid_snapshot_rejected_and_moved_to_failed(tmp_path, settings):
    """A snapshot missing a required field is rejected and quarantined."""
    dirs = {name: tmp_path / name for name in ("incoming", "archive", "failed")}
    for directory in dirs.values():
        directory.mkdir()

    broken_payload = _valid_payload()
    del broken_payload["records"][0]["points_per_game"]
    _write_json(dirs["incoming"] / "broken.json", broken_payload)

    settings.STATIC_DATASET_INCOMING_DIR = str(dirs["incoming"])
    settings.STATIC_DATASET_ARCHIVE_DIR = str(dirs["archive"])
    settings.STATIC_DATASET_FAILED_DIR = str(dirs["failed"])

    call_command("import_snapshots")

    run = ImportRun.objects.get()
    assert run.status == ImportRun.RunStatus.FAILED

    import_file = ImportFile.objects.get(import_run=run)
    assert import_file.status == ImportFile.FileStatus.FAILED
    assert "missing required fields" in import_file.error_message

    # Rejected files land in failed/, never in the archive, and no domain
    # rows are created from them.
    assert (dirs["failed"] / "broken.json").exists()
    assert not (dirs["archive"] / "broken.json").exists()
    assert not Competition.objects.exists()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.django_db
def test_idempotent_reimport_uses_checksum_and_skips_duplicate(tmp_path, settings):
    """Re-dropping identical content under a new filename is skipped via checksum."""
    dirs = {name: tmp_path / name for name in ("incoming", "archive", "failed")}
    for directory in dirs.values():
        directory.mkdir()

    payload = _valid_payload()
    _write_json(dirs["incoming"] / "first.json", payload)

    settings.STATIC_DATASET_INCOMING_DIR = str(dirs["incoming"])
    settings.STATIC_DATASET_ARCHIVE_DIR = str(dirs["archive"])
    settings.STATIC_DATASET_FAILED_DIR = str(dirs["failed"])

    call_command("import_snapshots")
    assert Competition.objects.count() == 1
    assert Player.objects.count() == 1

    # Re-drop same content with a different filename.
    _write_json(dirs["incoming"] / "first-duplicate.json", payload)
    call_command("import_snapshots")

    # No new domain rows were created by the duplicate.
    assert Competition.objects.count() == 1
    assert Player.objects.count() == 1
    assert PlayerSeason.objects.count() == 1

    duplicate_file = (
        ImportFile.objects.filter(relative_path="first-duplicate.json")
        .order_by("-id")
        .first()
    )
    assert duplicate_file is not None
    assert duplicate_file.status == ImportFile.FileStatus.SKIPPED
    assert duplicate_file.checksum
    assert "duplicate checksum" in duplicate_file.error_message.lower()

    # Even skipped files are moved into the archive.
    assert (dirs["archive"] / "first-duplicate.json").exists()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.django_db
def test_same_run_second_file_same_checksum_is_skipped(tmp_path, settings):
    """Within one run, a second file with an identical checksum is skipped."""
    dirs = {name: tmp_path / name for name in ("incoming", "archive", "failed")}
    for directory in dirs.values():
        directory.mkdir()

    payload = _valid_payload()
    _write_json(dirs["incoming"] / "a.json", payload)
    _write_json(dirs["incoming"] / "b.json", payload)

    settings.STATIC_DATASET_INCOMING_DIR = str(dirs["incoming"])
    settings.STATIC_DATASET_ARCHIVE_DIR = str(dirs["archive"])
    settings.STATIC_DATASET_FAILED_DIR = str(dirs["failed"])

    call_command("import_snapshots")

    # NOTE(review): assumes the command processes files in sorted name order,
    # so a.json is imported first and b.json hits the checksum skip — confirm.
    files = {row.relative_path: row for row in ImportFile.objects.order_by("relative_path")}
    assert files["a.json"].status == ImportFile.FileStatus.SUCCESS
    assert files["b.json"].status == ImportFile.FileStatus.SKIPPED
    assert files["a.json"].checksum == files["b.json"].checksum
|
||||||
Reference in New Issue
Block a user