Add v2 snapshot schema validation and import_snapshots command
This commit is contained in:
182
apps/ingestion/snapshots/schema.py
Normal file
182
apps/ingestion/snapshots/schema.py
Normal file
@@ -0,0 +1,182 @@
|
||||
from __future__ import annotations

import math
from dataclasses import dataclass
from datetime import date
from typing import Any

from django.utils.dateparse import parse_date
|
||||
|
||||
# Keys that every snapshot record must contain; _validate_record rejects a
# record as soon as any of these is absent.
REQUIRED_RECORD_FIELDS = {
    # Competition / season / team identity.
    "competition_external_id",
    "competition_name",
    "season",
    "team_external_id",
    "team_name",
    # Player identity and profile.
    "player_external_id",
    "full_name",
    "first_name",
    "last_name",
    "birth_date",
    "nationality",
    "height_cm",
    "weight_kg",
    "position",
    # Season statistics.
    "games_played",
    "minutes_per_game",
    "points_per_game",
    "rebounds_per_game",
    "assists_per_game",
    "steals_per_game",
    "blocks_per_game",
    "turnovers_per_game",
    "fg_pct",
    "three_pt_pct",
    "ft_pct",
}
|
||||
|
||||
# The only keys permitted on the snapshot root object; any other key makes
# SnapshotSchemaValidator.validate raise.
ALLOWED_TOP_LEVEL_FIELDS = {
    "source_name",
    "snapshot_date",
    "records",
    "source_metadata",
    "raw_payload",
}
|
||||
|
||||
# Every key a record may carry: the required fields plus these optional ones.
ALLOWED_RECORD_FIELDS = REQUIRED_RECORD_FIELDS | {
    "role",
    "source_metadata",
    "raw_payload",
}
|
||||
|
||||
|
||||
@dataclass
class SnapshotValidationResult:
    """Outcome of a successful ``SnapshotSchemaValidator.validate`` call."""

    # Stripped, non-empty name of the snapshot's source.
    source_name: str
    # Parsed value of the payload's ``snapshot_date`` field.
    snapshot_date: date
    # One normalized dict per input record, in input order.
    records: list[dict[str, Any]]
|
||||
|
||||
|
||||
class SnapshotValidationError(ValueError):
    """Raised when a snapshot payload or one of its records fails validation.

    Subclasses ``ValueError`` so callers that already catch ``ValueError``
    keep working.
    """
|
||||
|
||||
|
||||
class SnapshotSchemaValidator:
    """Strict JSON schema validator for HoopScout v2 player-season snapshots.

    Every validation failure is reported as ``SnapshotValidationError``; no
    other exception type is expected to escape for malformed input.
    """

    # Record fields that must be non-empty strings (stored stripped).
    _STRING_FIELDS = (
        "competition_external_id",
        "competition_name",
        "season",
        "team_external_id",
        "team_name",
        "player_external_id",
        "full_name",
        "first_name",
        "last_name",
        "nationality",
        "position",
    )

    # Record fields that must be non-negative integers.
    _INT_FIELDS = (
        "height_cm",
        "weight_kg",
        "games_played",
    )

    # Record fields that must be finite numbers (stored as float).
    _FLOAT_FIELDS = (
        "minutes_per_game",
        "points_per_game",
        "rebounds_per_game",
        "assists_per_game",
        "steals_per_game",
        "blocks_per_game",
        "turnovers_per_game",
        "fg_pct",
        "three_pt_pct",
        "ft_pct",
    )

    @staticmethod
    def _require_string(value: Any, field: str) -> str:
        """Return ``value`` stripped, or raise if it is not a non-blank string.

        Raises:
            SnapshotValidationError: ``value`` is not a ``str`` or contains
                only whitespace.
        """
        if not isinstance(value, str) or not value.strip():
            raise SnapshotValidationError(f"{field} must be a non-empty string")
        return value.strip()

    @staticmethod
    def _require_non_negative_int(value: Any, field: str) -> int:
        """Coerce ``value`` to an ``int`` >= 0.

        Integers, integral floats (``198.0``) and integer strings (``"198"``)
        are accepted. Booleans and fractional or non-finite floats are
        rejected.

        Raises:
            SnapshotValidationError: ``value`` cannot be interpreted as a
                non-negative integer.
        """
        message = f"{field} must be a non-negative integer"
        # bool is an int subclass; True/False must not pass as 1/0.
        if isinstance(value, bool):
            raise SnapshotValidationError(message)
        # int() would silently truncate 198.7 to 198; is_integer() is also
        # False for NaN and +/-Infinity, so those are rejected here too.
        if isinstance(value, float) and not value.is_integer():
            raise SnapshotValidationError(message)
        try:
            parsed = int(value)
        except (TypeError, ValueError, OverflowError) as exc:
            raise SnapshotValidationError(message) from exc
        if parsed < 0:
            raise SnapshotValidationError(message)
        return parsed

    @staticmethod
    def _require_float(value: Any, field: str) -> float:
        """Coerce ``value`` to a finite ``float``.

        Numbers and numeric strings are accepted. Booleans, NaN and
        +/-Infinity are rejected.

        Raises:
            SnapshotValidationError: ``value`` is not numeric or is not
                finite.
        """
        message = f"{field} must be numeric"
        # Keep bool handling consistent with _require_non_negative_int.
        if isinstance(value, bool):
            raise SnapshotValidationError(message)
        try:
            # OverflowError: an int too large for a float, e.g. 10**400.
            parsed = float(value)
        except (TypeError, ValueError, OverflowError) as exc:
            raise SnapshotValidationError(message) from exc
        if not math.isfinite(parsed):
            raise SnapshotValidationError(f"{field} must be a finite number")
        return parsed

    @staticmethod
    def _require_date(value: Any, field: str) -> date:
        """Parse ``value`` as a calendar date.

        ``parse_date`` returns ``None`` for badly formatted input but raises
        a plain ``ValueError`` for well-formatted impossible dates such as
        ``2024-02-30``; both cases are reported the same way here.

        Raises:
            SnapshotValidationError: ``value`` is not a valid date.
        """
        message = f"{field} must be YYYY-MM-DD"
        try:
            parsed = parse_date(str(value))
        except ValueError as exc:
            raise SnapshotValidationError(message) from exc
        if not parsed:
            raise SnapshotValidationError(message)
        return parsed

    @classmethod
    def _validate_record(cls, record: dict[str, Any], index: int) -> dict[str, Any]:
        """Validate one record and return a normalized copy of it.

        Args:
            record: The raw record object.
            index: Position of the record in ``records``; used only in error
                messages.

        Returns:
            A shallow copy of ``record`` with string fields stripped,
            ``birth_date`` as an ISO ``YYYY-MM-DD`` string, integer fields
            as ``int`` and statistic fields as ``float``.

        Raises:
            SnapshotValidationError: The record has unknown or missing
                fields, or a field has an invalid value.
        """
        keys = set(record.keys())

        unknown = keys - ALLOWED_RECORD_FIELDS
        if unknown:
            raise SnapshotValidationError(
                f"record[{index}] contains unknown fields: {', '.join(sorted(unknown))}"
            )

        missing = REQUIRED_RECORD_FIELDS - keys
        if missing:
            raise SnapshotValidationError(
                f"record[{index}] missing required fields: {', '.join(sorted(missing))}"
            )

        # Optional keys (source_metadata, raw_payload, a null role) are
        # carried over untouched by the copy.
        normalized = dict(record)

        for field in cls._STRING_FIELDS:
            normalized[field] = cls._require_string(
                record.get(field), f"record[{index}].{field}"
            )

        # role is optional; it is validated only when present and non-null.
        if record.get("role") is not None:
            normalized["role"] = cls._require_string(
                record.get("role"), f"record[{index}].role"
            )

        birth_date = cls._require_date(
            record.get("birth_date"), f"record[{index}].birth_date"
        )
        normalized["birth_date"] = birth_date.isoformat()

        for field in cls._INT_FIELDS:
            normalized[field] = cls._require_non_negative_int(
                record.get(field), f"record[{index}].{field}"
            )

        for field in cls._FLOAT_FIELDS:
            normalized[field] = cls._require_float(
                record.get(field), f"record[{index}].{field}"
            )

        return normalized

    @classmethod
    def validate(cls, payload: dict[str, Any]) -> SnapshotValidationResult:
        """Validate a whole snapshot payload.

        Args:
            payload: The decoded snapshot root object.

        Returns:
            A ``SnapshotValidationResult`` holding the stripped source name,
            the parsed snapshot date and the normalized records.

        Raises:
            SnapshotValidationError: The root is not an object, has unknown
                top-level fields, or any field or record is invalid.
        """
        if not isinstance(payload, dict):
            raise SnapshotValidationError("Snapshot root must be an object")

        unknown = set(payload.keys()) - ALLOWED_TOP_LEVEL_FIELDS
        if unknown:
            raise SnapshotValidationError(
                f"Snapshot contains unknown top-level fields: {', '.join(sorted(unknown))}"
            )

        source_name = cls._require_string(payload.get("source_name"), "source_name")
        snapshot_date = cls._require_date(payload.get("snapshot_date"), "snapshot_date")

        records = payload.get("records")
        if not isinstance(records, list) or not records:
            raise SnapshotValidationError("records must be a non-empty array")

        normalized_records: list[dict[str, Any]] = []
        for index, record in enumerate(records):
            if not isinstance(record, dict):
                raise SnapshotValidationError(f"record[{index}] must be an object")
            normalized_records.append(cls._validate_record(record, index=index))

        return SnapshotValidationResult(
            source_name=source_name,
            snapshot_date=snapshot_date,
            records=normalized_records,
        )
|
||||
Reference in New Issue
Block a user