diff --git a/README.md b/README.md index 4e86025..4682c67 100644 --- a/README.md +++ b/README.md @@ -156,8 +156,14 @@ Each file must be a JSON object: Validation is strict: - unknown fields are rejected -- required fields must exist -- `snapshot_date` and `birth_date` must be `YYYY-MM-DD` +- required fields must exist: + - `competition_external_id`, `competition_name`, `season` + - `team_external_id`, `team_name` + - `player_external_id`, `full_name` + - core stats (`games_played`, `minutes_per_game`, `points_per_game`, `rebounds_per_game`, `assists_per_game`, `steals_per_game`, `blocks_per_game`, `turnovers_per_game`, `fg_pct`, `three_pt_pct`, `ft_pct`) +- optional player bio/physical fields: + - `first_name`, `last_name`, `birth_date`, `nationality`, `height_cm`, `weight_kg`, `position`, `role` +- when `birth_date` is provided it must be `YYYY-MM-DD` - numeric fields must be numeric - invalid files are moved to failed directory @@ -304,6 +310,7 @@ Notes: - season is configured by `EXTRACTOR_LBA_SEASON_LABEL` - parser supports payload keys: `records`, `data`, `players`, `items` - normalization supports nested `player` and `team` objects with common stat aliases (`gp/mpg/ppg/rpg/apg/spg/bpg/tov`) +- public-source player bio/physical fields are often incomplete; extractor allows them to be missing and emits `null` for optional fields - no live HTTP calls in tests; tests use fixtures/mocked responses only ### BCL extractor assumptions and limitations (MVP) @@ -316,6 +323,7 @@ Notes: - season is configured by `EXTRACTOR_BCL_SEASON_LABEL` - parser supports payload keys: `records`, `data`, `players`, `items` - normalization supports nested `player` and `team` objects with common stat aliases (`gp/mpg/ppg/rpg/apg/spg/bpg/tov`) +- public-source player bio/physical fields are often incomplete; extractor allows them to be missing and emits `null` for optional fields - no live HTTP calls in tests; tests use fixtures/mocked responses only ## Migration and Superuser Commands diff --git a/apps/ingestion/extractors/bcl.py b/apps/ingestion/extractors/bcl.py index 442d291..bd50094 100644 --- a/apps/ingestion/extractors/bcl.py +++ b/apps/ingestion/extractors/bcl.py @@ -16,6 +16,28 @@ def _first_non_empty(record: dict[str, Any], *keys: str) -> Any: return None +ESSENTIAL_FIELDS = { + "competition_external_id", + "competition_name", + "season", + "team_external_id", + "team_name", + "player_external_id", + "full_name", + "games_played", + "minutes_per_game", + "points_per_game", + "rebounds_per_game", + "assists_per_game", + "steals_per_game", + "blocks_per_game", + "turnovers_per_game", + "fg_pct", + "three_pt_pct", + "ft_pct", +} + + class BCLSnapshotExtractor(BaseSnapshotExtractor): """ Basketball Champions League MVP extractor. @@ -122,7 +144,7 @@ class BCLSnapshotExtractor(BaseSnapshotExtractor): "ft_pct": _first_non_empty(source_record, "ft_pct", "ft_percentage"), } - missing = [key for key, value in normalized.items() if key != "role" and value in (None, "")] + missing = [key for key in ESSENTIAL_FIELDS if normalized.get(key) in (None, "")] if missing: raise ExtractorNormalizationError(f"bcl row missing required fields: {', '.join(sorted(missing))}") diff --git a/apps/ingestion/extractors/lba.py b/apps/ingestion/extractors/lba.py index d2536b0..97347a5 100644 --- a/apps/ingestion/extractors/lba.py +++ b/apps/ingestion/extractors/lba.py @@ -16,6 +16,28 @@ def _first_non_empty(record: dict[str, Any], *keys: str) -> Any: return None +ESSENTIAL_FIELDS = { + "competition_external_id", + "competition_name", + "season", + "team_external_id", + "team_name", + "player_external_id", + "full_name", + "games_played", + "minutes_per_game", + "points_per_game", + "rebounds_per_game", + "assists_per_game", + "steals_per_game", + "blocks_per_game", + "turnovers_per_game", + "fg_pct", + "three_pt_pct", + "ft_pct", +} + + class LBASnapshotExtractor(BaseSnapshotExtractor): """ LBA (Lega Basket Serie A) MVP extractor. @@ -122,7 +144,7 @@ class LBASnapshotExtractor(BaseSnapshotExtractor): "ft_pct": _first_non_empty(source_record, "ft_pct", "ft_percentage"), } - missing = [key for key, value in normalized.items() if key != "role" and value in (None, "")] + missing = [key for key in ESSENTIAL_FIELDS if normalized.get(key) in (None, "")] if missing: raise ExtractorNormalizationError(f"lba row missing required fields: {', '.join(sorted(missing))}") diff --git a/apps/ingestion/services/snapshot_import.py b/apps/ingestion/services/snapshot_import.py index d596013..8c860cb 100644 --- a/apps/ingestion/services/snapshot_import.py +++ b/apps/ingestion/services/snapshot_import.py @@ -62,6 +62,21 @@ def _parse_season_dates(label: str) -> tuple[date, date]: return date(year, 9, 1), date(year + 1, 7, 31) +def _parse_optional_birth_date(value: str | None) -> date | None: + if value in (None, ""): + return None + return parse_date(value) + + +def _split_name_parts(full_name: str) -> tuple[str, str]: + parts = full_name.strip().split(maxsplit=1) + if not parts: + return "", "" + if len(parts) == 1: + return parts[0], "" + return parts[0], parts[1] + + def _resolve_nationality(value: str | None) -> Nationality | None: if not value: return None @@ -152,10 +167,13 @@ def _upsert_record(record: dict[str, Any], *, source_name: str, snapshot_date: d }, ) - position, _ = Position.objects.get_or_create( - code=_position_code(record["position"]), - defaults={"name": record["position"]}, - ) + position_value = record.get("position") + position = None + if position_value: + position, _ = Position.objects.get_or_create( + code=_position_code(position_value), + defaults={"name": position_value}, + ) role = None if record.get("role"): role, _ = Role.objects.get_or_create( @@ -163,19 +181,24 @@ def _upsert_record(record: dict[str, Any], *, source_name: str, snapshot_date: d defaults={"name": record["role"]}, ) + first_name = record.get("first_name") or "" + last_name = record.get("last_name") or "" + if not first_name and not last_name: + first_name, last_name = _split_name_parts(record["full_name"]) + player, _ = Player.objects.update_or_create( source_name=source_key, source_uid=record["player_external_id"], defaults={ - "first_name": record["first_name"], - "last_name": record["last_name"], + "first_name": first_name, + "last_name": last_name, "full_name": record["full_name"], - "birth_date": parse_date(record["birth_date"]), + "birth_date": _parse_optional_birth_date(record.get("birth_date")), "nationality": _resolve_nationality(record.get("nationality")), "nominal_position": position, "inferred_role": role, - "height_cm": record["height_cm"], - "weight_kg": record["weight_kg"], + "height_cm": record.get("height_cm"), + "weight_kg": record.get("weight_kg"), "is_active": True, }, ) diff --git a/apps/ingestion/snapshots/schema.py b/apps/ingestion/snapshots/schema.py index 07943c3..a1e22f2 100644 --- a/apps/ingestion/snapshots/schema.py +++ b/apps/ingestion/snapshots/schema.py @@ -14,13 +14,6 @@ REQUIRED_RECORD_FIELDS = { "team_name", "player_external_id", "full_name", - "first_name", - "last_name", - "birth_date", - "nationality", - "height_cm", - "weight_kg", - "position", "games_played", "minutes_per_game", "points_per_game", @@ -34,6 +27,16 @@ REQUIRED_RECORD_FIELDS = { "ft_pct", } +OPTIONAL_RECORD_FIELDS = { + "first_name", + "last_name", + "birth_date", + "nationality", + "height_cm", + "weight_kg", + "position", +} + ALLOWED_TOP_LEVEL_FIELDS = { "source_name", "snapshot_date", @@ -42,7 +45,7 @@ ALLOWED_TOP_LEVEL_FIELDS = { "raw_payload", } -ALLOWED_RECORD_FIELDS = REQUIRED_RECORD_FIELDS | { +ALLOWED_RECORD_FIELDS = REQUIRED_RECORD_FIELDS | OPTIONAL_RECORD_FIELDS | { "role", "source_metadata", "raw_payload", @@ -69,6 +72,15 @@ class SnapshotSchemaValidator: raise SnapshotValidationError(f"{field} must be a non-empty string") return value.strip() + @staticmethod + def _optional_string(value: Any, field: str) -> str | None: + if value in (None, ""): + return None + if not isinstance(value, str): + raise SnapshotValidationError(f"{field} must be a string when provided") + stripped = value.strip() + return stripped or None + @staticmethod def _require_non_negative_int(value: Any, field: str) -> int: if isinstance(value, bool): @@ -81,6 +93,12 @@ class SnapshotSchemaValidator: raise SnapshotValidationError(f"{field} must be a non-negative integer") return parsed + @classmethod + def _optional_non_negative_int(cls, value: Any, field: str) -> int | None: + if value in (None, ""): + return None + return cls._require_non_negative_int(value, field) + @staticmethod def _require_float(value: Any, field: str) -> float: try: @@ -112,23 +130,26 @@ class SnapshotSchemaValidator: "team_name", "player_external_id", "full_name", - "first_name", - "last_name", - "nationality", - "position", ): normalized[field] = cls._require_string(record.get(field), f"record[{index}].{field}") + for field in ("first_name", "last_name", "nationality", "position"): + normalized[field] = cls._optional_string(record.get(field), f"record[{index}].{field}") + if record.get("role") is not None: normalized["role"] = cls._require_string(record.get("role"), f"record[{index}].role") - birth_date = parse_date(str(record.get("birth_date"))) - if not birth_date: - raise SnapshotValidationError(f"record[{index}].birth_date must be YYYY-MM-DD") - normalized["birth_date"] = birth_date.isoformat() + birth_date_raw = record.get("birth_date") + if birth_date_raw in (None, ""): + normalized["birth_date"] = None + else: + birth_date = parse_date(str(birth_date_raw)) + if not birth_date: + raise SnapshotValidationError(f"record[{index}].birth_date must be YYYY-MM-DD") + normalized["birth_date"] = birth_date.isoformat() - normalized["height_cm"] = cls._require_non_negative_int(record.get("height_cm"), f"record[{index}].height_cm") - normalized["weight_kg"] = cls._require_non_negative_int(record.get("weight_kg"), f"record[{index}].weight_kg") + normalized["height_cm"] = cls._optional_non_negative_int(record.get("height_cm"), f"record[{index}].height_cm") + normalized["weight_kg"] = cls._optional_non_negative_int(record.get("weight_kg"), f"record[{index}].weight_kg") normalized["games_played"] = cls._require_non_negative_int(record.get("games_played"), f"record[{index}].games_played") for field in ( diff --git a/tests/fixtures/bcl/bcl_players_stats_partial_public.json b/tests/fixtures/bcl/bcl_players_stats_partial_public.json new file mode 100644 index 0000000..a352d00 --- /dev/null +++ b/tests/fixtures/bcl/bcl_players_stats_partial_public.json @@ -0,0 +1,25 @@ +{ + "data": [ + { + "player": { + "id": "bcl-player-99", + "name": "Alex Novak" + }, + "team": { + "id": "bcl-team-tenerife", + "name": "Lenovo Tenerife" + }, + "gp": 10, + "mpg": 27.2, + "ppg": 14.8, + "rpg": 4.1, + "apg": 3.3, + "spg": 1.2, + "bpg": 0.4, + "tov": 2.0, + "fg_pct": 47.3, + "three_pct": 38.0, + "ft_pct": 79.1 + } + ] +} diff --git a/tests/fixtures/lba/lba_players_stats_partial_public.json b/tests/fixtures/lba/lba_players_stats_partial_public.json new file mode 100644 index 0000000..f436b1b --- /dev/null +++ b/tests/fixtures/lba/lba_players_stats_partial_public.json @@ -0,0 +1,25 @@ +{ + "data": [ + { + "player": { + "id": "p-002", + "name": "Andrea Bianchi" + }, + "team": { + "id": "team-olimpia-milano", + "name": "Olimpia Milano" + }, + "gp": 18, + "mpg": 24.7, + "ppg": 12.3, + "rpg": 2.9, + "apg": 4.2, + "spg": 1.1, + "bpg": 0.1, + "tov": 1.8, + "fg_pct": 45.0, + "three_pct": 35.4, + "ft_pct": 82.7 + } + ] +} diff --git a/tests/test_bcl_extractor.py b/tests/test_bcl_extractor.py index 5130d35..b5795fe 100644 --- a/tests/test_bcl_extractor.py +++ b/tests/test_bcl_extractor.py @@ -51,6 +51,37 @@ def test_bcl_extractor_normalizes_fixture_payload(tmp_path, settings): assert row["three_pt_pct"] == 37.2 +@pytest.mark.django_db +def test_bcl_extractor_accepts_partial_public_player_bio_fields(tmp_path, settings): + settings.EXTRACTOR_BCL_STATS_URL = "https://www.championsleague.basketball/public/stats.json" + settings.EXTRACTOR_BCL_SEASON_LABEL = "2025-2026" + settings.EXTRACTOR_BCL_COMPETITION_EXTERNAL_ID = "bcl" + settings.EXTRACTOR_BCL_COMPETITION_NAME = "Basketball Champions League" + + fixture_payload = _load_fixture("bcl/bcl_players_stats_partial_public.json") + + class FakeClient: + def get_json(self, *_args, **_kwargs): + return fixture_payload + + extractor = BCLSnapshotExtractor(http_client=FakeClient()) + output_path = tmp_path / "bcl-partial.json" + result = extractor.run(output_path=output_path, snapshot_date=date(2026, 3, 13)) + + assert result.records_count == 1 + payload = json.loads(output_path.read_text(encoding="utf-8")) + row = payload["records"][0] + assert row["full_name"] == "Alex Novak" + assert row["first_name"] is None + assert row["last_name"] is None + assert row["birth_date"] is None + assert row["nationality"] is None + assert row["height_cm"] is None + assert row["weight_kg"] is None + assert row["position"] is None + assert row["games_played"] == 10 + + @pytest.mark.django_db def test_bcl_extractor_registry_selection(settings): settings.EXTRACTOR_BCL_STATS_URL = "https://www.championsleague.basketball/public/stats.json" diff --git a/tests/test_import_snapshots_command.py b/tests/test_import_snapshots_command.py index ce47446..83b4125 100644 --- a/tests/test_import_snapshots_command.py +++ b/tests/test_import_snapshots_command.py @@ -103,6 +103,86 @@ def test_valid_snapshot_import(tmp_path, settings): assert PlayerSeasonStats.objects.count() == 1 +@pytest.mark.django_db +def test_snapshot_import_succeeds_with_optional_bio_and_physical_fields_missing(tmp_path, settings): + incoming = tmp_path / "incoming" + archive = tmp_path / "archive" + failed = tmp_path / "failed" + incoming.mkdir() + archive.mkdir() + failed.mkdir() + + payload = _valid_payload() + for optional_field in ("first_name", "last_name", "birth_date", "nationality", "height_cm", "weight_kg", "position", "role"): + payload["records"][0].pop(optional_field, None) + + file_path = incoming / "optional-missing.json" + _write_json(file_path, payload) + + settings.STATIC_DATASET_INCOMING_DIR = str(incoming) + settings.STATIC_DATASET_ARCHIVE_DIR = str(archive) + settings.STATIC_DATASET_FAILED_DIR = str(failed) + + call_command("import_snapshots") + + run = ImportRun.objects.get() + assert run.status == ImportRun.RunStatus.SUCCESS + player = Player.objects.get(source_uid="player-23") + assert player.first_name == "LeBron" + assert player.last_name == "James" + assert player.birth_date is None + assert player.nationality is None + assert player.nominal_position is None + assert player.height_cm is None + assert player.weight_kg is None + assert PlayerSeasonStats.objects.count() == 1 + + +@pytest.mark.django_db +@pytest.mark.parametrize( + ("source_name", "competition_id", "competition_name"), + [ + ("lba", "lba-serie-a", "Lega Basket Serie A"), + ("bcl", "bcl", "Basketball Champions League"), + ], +) +def test_partial_public_source_snapshot_imports_for_lba_and_bcl( + tmp_path, + settings, + source_name, + competition_id, + competition_name, +): + incoming = tmp_path / "incoming" + archive = tmp_path / "archive" + failed = tmp_path / "failed" + incoming.mkdir() + archive.mkdir() + failed.mkdir() + + payload = _valid_payload() + payload["source_name"] = source_name + row = payload["records"][0] + row["competition_external_id"] = competition_id + row["competition_name"] = competition_name + for optional_field in ("first_name", "last_name", "birth_date", "nationality", "height_cm", "weight_kg", "position", "role"): + row.pop(optional_field, None) + + _write_json(incoming / f"{source_name}.json", payload) + + settings.STATIC_DATASET_INCOMING_DIR = str(incoming) + settings.STATIC_DATASET_ARCHIVE_DIR = str(archive) + settings.STATIC_DATASET_FAILED_DIR = str(failed) + + call_command("import_snapshots") + + run = ImportRun.objects.get() + assert run.status == ImportRun.RunStatus.SUCCESS + assert Competition.objects.filter(source_uid=competition_id, name=competition_name).exists() + assert Player.objects.filter(source_uid="player-23").exists() + assert PlayerSeasonStats.objects.count() == 1 + + @pytest.mark.django_db def test_invalid_snapshot_rejected_and_moved_to_failed(tmp_path, settings): incoming = tmp_path / "incoming" diff --git a/tests/test_lba_extractor.py b/tests/test_lba_extractor.py index aadc278..dece794 100644 --- a/tests/test_lba_extractor.py +++ b/tests/test_lba_extractor.py @@ -8,6 +8,7 @@ import pytest from django.core.management import call_command from apps.ingestion.extractors.lba import LBASnapshotExtractor +from apps.ingestion.extractors.base import ExtractorNormalizationError from apps.ingestion.extractors.registry import create_extractor @@ -51,6 +52,56 @@ def test_lba_extractor_normalizes_fixture_payload(tmp_path, settings): assert row["three_pt_pct"] == 36.5 +@pytest.mark.django_db +def test_lba_extractor_accepts_partial_public_player_bio_fields(tmp_path, settings): + settings.EXTRACTOR_LBA_STATS_URL = "https://www.legabasket.it/public/stats.json" + settings.EXTRACTOR_LBA_SEASON_LABEL = "2025-2026" + settings.EXTRACTOR_LBA_COMPETITION_EXTERNAL_ID = "lba-serie-a" + settings.EXTRACTOR_LBA_COMPETITION_NAME = "Lega Basket Serie A" + + fixture_payload = _load_fixture("lba/lba_players_stats_partial_public.json") + + class FakeClient: + def get_json(self, *_args, **_kwargs): + return fixture_payload + + extractor = LBASnapshotExtractor(http_client=FakeClient()) + output_path = tmp_path / "lba-partial.json" + result = extractor.run(output_path=output_path, snapshot_date=date(2026, 3, 13)) + + assert result.records_count == 1 + payload = json.loads(output_path.read_text(encoding="utf-8")) + row = payload["records"][0] + assert row["full_name"] == "Andrea Bianchi" + assert row["first_name"] is None + assert row["last_name"] is None + assert row["birth_date"] is None + assert row["nationality"] is None + assert row["height_cm"] is None + assert row["weight_kg"] is None + assert row["position"] is None + assert row["games_played"] == 18 + + +@pytest.mark.django_db +def test_lba_extractor_still_fails_when_required_stats_are_missing(settings): + settings.EXTRACTOR_LBA_STATS_URL = "https://www.legabasket.it/public/stats.json" + settings.EXTRACTOR_LBA_SEASON_LABEL = "2025-2026" + settings.EXTRACTOR_LBA_COMPETITION_EXTERNAL_ID = "lba-serie-a" + settings.EXTRACTOR_LBA_COMPETITION_NAME = "Lega Basket Serie A" + + fixture_payload = _load_fixture("lba/lba_players_stats_partial_public.json") + fixture_payload["data"][0].pop("ppg") + + class FakeClient: + def get_json(self, *_args, **_kwargs): + return fixture_payload + + extractor = LBASnapshotExtractor(http_client=FakeClient()) + with pytest.raises(ExtractorNormalizationError): + extractor.run(write_output=False, snapshot_date=date(2026, 3, 13)) + + @pytest.mark.django_db def test_lba_extractor_registry_selection(settings): settings.EXTRACTOR_LBA_STATS_URL = "https://www.legabasket.it/public/stats.json"