diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1b803bb..d105e9f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -63,6 +63,15 @@ git checkout -b feature/your-feature-name 3. Implement with focused commits and tests. 4. Open PR: `feature/*` -> `develop`. +## Running Tests (v2) + +Runtime images are intentionally lean and may not ship `pytest`. +Use the development compose stack and install dev dependencies before running tests: + +```bash +docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm web sh -lc "export PYTHONUSERBASE=/tmp/pyuser && python -m pip install --user -r requirements/dev.txt && python -m pytest -q" +``` + ## PR Checklist - [ ] Target branch is correct diff --git a/README.md b/README.md index 4e86025..901f852 100644 --- a/README.md +++ b/README.md @@ -156,11 +156,23 @@ Each file must be a JSON object: Validation is strict: - unknown fields are rejected -- required fields must exist -- `snapshot_date` and `birth_date` must be `YYYY-MM-DD` +- required fields must exist: + - `competition_external_id`, `competition_name`, `season` + - `team_external_id`, `team_name` + - `player_external_id`, `full_name` + - core stats (`games_played`, `minutes_per_game`, `points_per_game`, `rebounds_per_game`, `assists_per_game`, `steals_per_game`, `blocks_per_game`, `turnovers_per_game`, `fg_pct`, `three_pt_pct`, `ft_pct`) +- optional player bio/physical fields: + - `first_name`, `last_name`, `birth_date`, `nationality`, `height_cm`, `weight_kg`, `position`, `role` +- when `birth_date` is provided it must be `YYYY-MM-DD` - numeric fields must be numeric - invalid files are moved to failed directory +Importer enrichment note: +- `full_name` is source truth for identity display +- `first_name` / `last_name` are optional and may be absent in public snapshots +- when both are missing, importer may derive them from `full_name` as a best-effort enrichment step +- this enrichment is convenience-only and does not override source truth semantics + ## Import Command Run import: @@ -278,6 +290,7 @@ Notes: - extraction is intentionally low-frequency and uses retries conservatively - only public pages/endpoints should be targeted - emitted snapshots must match the same schema consumed by `import_snapshots` +- `public_json_snapshot` uses the same required-vs-optional field contract as `SnapshotSchemaValidator` (no stricter extractor-only required bio/physical fields) - optional scheduler container runs `scripts/scheduler.sh` loop using: - image: `registry.younerd.org/hoopscout/scheduler:${APP_IMAGE_TAG:-latest}` - command: `/app/scripts/scheduler.sh` @@ -304,6 +317,7 @@ Notes: - season is configured by `EXTRACTOR_LBA_SEASON_LABEL` - parser supports payload keys: `records`, `data`, `players`, `items` - normalization supports nested `player` and `team` objects with common stat aliases (`gp/mpg/ppg/rpg/apg/spg/bpg/tov`) +- public-source player bio/physical fields are often incomplete; extractor allows them to be missing and emits `null` for optional fields - no live HTTP calls in tests; tests use fixtures/mocked responses only ### BCL extractor assumptions and limitations (MVP) @@ -316,8 +330,19 @@ Notes: - season is configured by `EXTRACTOR_BCL_SEASON_LABEL` - parser supports payload keys: `records`, `data`, `players`, `items` - normalization supports nested `player` and `team` objects with common stat aliases (`gp/mpg/ppg/rpg/apg/spg/bpg/tov`) +- public-source player bio/physical fields are often incomplete; extractor allows them to be missing and emits `null` for optional fields - no live HTTP calls in tests; tests use fixtures/mocked responses only +## Testing + +- runtime `web` image stays lean and may not include `pytest` tooling +- run tests with the development compose stack (or a dedicated test image/profile) and install dev dependencies first +- local example (one-off): + +```bash +docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm web sh -lc "export PYTHONUSERBASE=/tmp/pyuser && python -m pip install --user -r requirements/dev.txt && python -m pytest -q" +``` + ## Migration and Superuser Commands ```bash diff --git a/apps/ingestion/extractors/bcl.py b/apps/ingestion/extractors/bcl.py index 442d291..f91b101 100644 --- a/apps/ingestion/extractors/bcl.py +++ b/apps/ingestion/extractors/bcl.py @@ -16,6 +16,38 @@ def _first_non_empty(record: dict[str, Any], *keys: str) -> Any: return None +def _first_non_empty_text(record: dict[str, Any], *keys: str) -> str | None: + for key in keys: + value = record.get(key) + if isinstance(value, str): + stripped = value.strip() + if stripped: + return stripped + return None + + +ESSENTIAL_FIELDS = { + "competition_external_id", + "competition_name", + "season", + "team_external_id", + "team_name", + "player_external_id", + "full_name", + "games_played", + "minutes_per_game", + "points_per_game", + "rebounds_per_game", + "assists_per_game", + "steals_per_game", + "blocks_per_game", + "turnovers_per_game", + "fg_pct", + "three_pt_pct", + "ft_pct", +} + + class BCLSnapshotExtractor(BaseSnapshotExtractor): """ Basketball Champions League MVP extractor. @@ -86,7 +118,9 @@ class BCLSnapshotExtractor(BaseSnapshotExtractor): team_external_id = _first_non_empty(source_record, "team_external_id", "team_id") or _first_non_empty( team_obj, "id", "team_id" ) - team_name = _first_non_empty(source_record, "team_name", "team") or _first_non_empty(team_obj, "name") + team_name = _first_non_empty_text(source_record, "team_name", "team") or _first_non_empty_text( + team_obj, "name" + ) normalized = { "competition_external_id": self.competition_external_id, @@ -122,7 +156,7 @@ class BCLSnapshotExtractor(BaseSnapshotExtractor): "ft_pct": _first_non_empty(source_record, "ft_pct", "ft_percentage"), } - missing = [key for key, value in normalized.items() if key != "role" and value in (None, "")] + missing = [key for key in ESSENTIAL_FIELDS if normalized.get(key) in (None, "")] if missing: raise ExtractorNormalizationError(f"bcl row missing required fields: {', '.join(sorted(missing))}") diff --git a/apps/ingestion/extractors/lba.py b/apps/ingestion/extractors/lba.py index d2536b0..d62abdb 100644 --- a/apps/ingestion/extractors/lba.py +++ b/apps/ingestion/extractors/lba.py @@ -16,6 +16,38 @@ def _first_non_empty(record: dict[str, Any], *keys: str) -> Any: return None +def _first_non_empty_text(record: dict[str, Any], *keys: str) -> str | None: + for key in keys: + value = record.get(key) + if isinstance(value, str): + stripped = value.strip() + if stripped: + return stripped + return None + + +ESSENTIAL_FIELDS = { + "competition_external_id", + "competition_name", + "season", + "team_external_id", + "team_name", + "player_external_id", + "full_name", + "games_played", + "minutes_per_game", + "points_per_game", + "rebounds_per_game", + "assists_per_game", + "steals_per_game", + "blocks_per_game", + "turnovers_per_game", + "fg_pct", + "three_pt_pct", + "ft_pct", +} + + class LBASnapshotExtractor(BaseSnapshotExtractor): """ LBA (Lega Basket Serie A) MVP extractor. @@ -86,7 +118,9 @@ class LBASnapshotExtractor(BaseSnapshotExtractor): team_external_id = _first_non_empty(source_record, "team_external_id", "team_id") or _first_non_empty( team_obj, "id", "team_id" ) - team_name = _first_non_empty(source_record, "team_name", "team") or _first_non_empty(team_obj, "name") + team_name = _first_non_empty_text(source_record, "team_name", "team") or _first_non_empty_text( + team_obj, "name" + ) normalized = { "competition_external_id": self.competition_external_id, @@ -122,7 +156,7 @@ class LBASnapshotExtractor(BaseSnapshotExtractor): "ft_pct": _first_non_empty(source_record, "ft_pct", "ft_percentage"), } - missing = [key for key, value in normalized.items() if key != "role" and value in (None, "")] + missing = [key for key in ESSENTIAL_FIELDS if normalized.get(key) in (None, "")] if missing: raise ExtractorNormalizationError(f"lba row missing required fields: {', '.join(sorted(missing))}") diff --git a/apps/ingestion/extractors/public_json.py b/apps/ingestion/extractors/public_json.py index acb973d..4241d0b 100644 --- a/apps/ingestion/extractors/public_json.py +++ b/apps/ingestion/extractors/public_json.py @@ -4,6 +4,8 @@ from typing import Any from django.conf import settings +from apps.ingestion.snapshots.schema import REQUIRED_RECORD_FIELDS + from .base import ( BaseSnapshotExtractor, ExtractorConfigError, @@ -113,7 +115,7 @@ class PublicJsonSnapshotExtractor(BaseSnapshotExtractor): "ft_pct": _first_non_empty(source_record, "ft_pct"), } - missing = [key for key, value in normalized.items() if key != "role" and value in (None, "")] + missing = [key for key in REQUIRED_RECORD_FIELDS if normalized.get(key) in (None, "")] if missing: raise ExtractorNormalizationError( f"public_json_snapshot row missing required fields: {', '.join(sorted(missing))}" diff --git a/apps/ingestion/services/snapshot_import.py b/apps/ingestion/services/snapshot_import.py index d596013..8c860cb 100644 --- a/apps/ingestion/services/snapshot_import.py +++ b/apps/ingestion/services/snapshot_import.py @@ -62,6 +62,21 @@ def _parse_season_dates(label: str) -> tuple[date, date]: return date(year, 9, 1), date(year + 1, 7, 31) +def _parse_optional_birth_date(value: str | None) -> date | None: + if value in (None, ""): + return None + return parse_date(value) + + +def _split_name_parts(full_name: str) -> tuple[str, str]: + parts = full_name.strip().split(maxsplit=1) + if not parts: + return "", "" + if len(parts) == 1: + return parts[0], "" + return parts[0], parts[1] + + def _resolve_nationality(value: str | None) -> Nationality | None: if not value: return None @@ -152,10 +167,13 @@ def _upsert_record(record: dict[str, Any], *, source_name: str, snapshot_date: d }, ) - position, _ = Position.objects.get_or_create( - code=_position_code(record["position"]), - defaults={"name": record["position"]}, - ) + position_value = record.get("position") + position = None + if position_value: + position, _ = Position.objects.get_or_create( + code=_position_code(position_value), + defaults={"name": position_value}, + ) role = None if record.get("role"): role, _ = Role.objects.get_or_create( @@ -163,19 +181,24 @@ def _upsert_record(record: dict[str, Any], *, source_name: str, snapshot_date: d defaults={"name": record["role"]}, ) + first_name = record.get("first_name") or "" + last_name = record.get("last_name") or "" + if not first_name and not last_name: + first_name, last_name = _split_name_parts(record["full_name"]) + player, _ = Player.objects.update_or_create( source_name=source_key, source_uid=record["player_external_id"], defaults={ - "first_name": record["first_name"], - "last_name": record["last_name"], + "first_name": first_name, + "last_name": last_name, "full_name": record["full_name"], - "birth_date": parse_date(record["birth_date"]), + "birth_date": _parse_optional_birth_date(record.get("birth_date")), "nationality": _resolve_nationality(record.get("nationality")), "nominal_position": position, "inferred_role": role, - "height_cm": record["height_cm"], - "weight_kg": record["weight_kg"], + "height_cm": record.get("height_cm"), + "weight_kg": record.get("weight_kg"), "is_active": True, }, ) diff --git a/apps/ingestion/snapshots/schema.py b/apps/ingestion/snapshots/schema.py index 07943c3..a1e22f2 100644 --- a/apps/ingestion/snapshots/schema.py +++ b/apps/ingestion/snapshots/schema.py @@ -14,13 +14,6 @@ REQUIRED_RECORD_FIELDS = { "team_name", "player_external_id", "full_name", - "first_name", - "last_name", - "birth_date", - "nationality", - "height_cm", - "weight_kg", - "position", "games_played", "minutes_per_game", "points_per_game", @@ -34,6 +27,16 @@ REQUIRED_RECORD_FIELDS = { "ft_pct", } +OPTIONAL_RECORD_FIELDS = { + "first_name", + "last_name", + "birth_date", + "nationality", + "height_cm", + "weight_kg", + "position", +} + ALLOWED_TOP_LEVEL_FIELDS = { "source_name", "snapshot_date", @@ -42,7 +45,7 @@ ALLOWED_TOP_LEVEL_FIELDS = { "raw_payload", } -ALLOWED_RECORD_FIELDS = REQUIRED_RECORD_FIELDS | { +ALLOWED_RECORD_FIELDS = REQUIRED_RECORD_FIELDS | OPTIONAL_RECORD_FIELDS | { "role", "source_metadata", "raw_payload", @@ -69,6 +72,15 @@ class SnapshotSchemaValidator: raise SnapshotValidationError(f"{field} must be a non-empty string") return value.strip() + @staticmethod + def _optional_string(value: Any, field: str) -> str | None: + if value in (None, ""): + return None + if not isinstance(value, str): + raise SnapshotValidationError(f"{field} must be a string when provided") + stripped = value.strip() + return stripped or None + @staticmethod def _require_non_negative_int(value: Any, field: str) -> int: if isinstance(value, bool): @@ -81,6 +93,12 @@ class SnapshotSchemaValidator: raise SnapshotValidationError(f"{field} must be a non-negative integer") return parsed + @classmethod + def _optional_non_negative_int(cls, value: Any, field: str) -> int | None: + if value in (None, ""): + return None + return cls._require_non_negative_int(value, field) + @staticmethod def _require_float(value: Any, field: str) -> float: try: @@ -112,23 +130,26 @@ class SnapshotSchemaValidator: "team_name", "player_external_id", "full_name", - "first_name", - "last_name", - "nationality", - "position", ): normalized[field] = cls._require_string(record.get(field), f"record[{index}].{field}") + for field in ("first_name", "last_name", "nationality", "position"): + normalized[field] = cls._optional_string(record.get(field), f"record[{index}].{field}") + if record.get("role") is not None: normalized["role"] = cls._require_string(record.get("role"), f"record[{index}].role") - birth_date = parse_date(str(record.get("birth_date"))) - if not birth_date: - raise SnapshotValidationError(f"record[{index}].birth_date must be YYYY-MM-DD") - normalized["birth_date"] = birth_date.isoformat() + birth_date_raw = record.get("birth_date") + if birth_date_raw in (None, ""): + normalized["birth_date"] = None + else: + birth_date = parse_date(str(birth_date_raw)) + if not birth_date: + raise SnapshotValidationError(f"record[{index}].birth_date must be YYYY-MM-DD") + normalized["birth_date"] = birth_date.isoformat() - normalized["height_cm"] = cls._require_non_negative_int(record.get("height_cm"), f"record[{index}].height_cm") - normalized["weight_kg"] = cls._require_non_negative_int(record.get("weight_kg"), f"record[{index}].weight_kg") + normalized["height_cm"] = cls._optional_non_negative_int(record.get("height_cm"), f"record[{index}].height_cm") + normalized["weight_kg"] = cls._optional_non_negative_int(record.get("weight_kg"), f"record[{index}].weight_kg") normalized["games_played"] = cls._require_non_negative_int(record.get("games_played"), f"record[{index}].games_played") for field in ( diff --git a/tests/fixtures/bcl/bcl_players_stats_partial_public.json b/tests/fixtures/bcl/bcl_players_stats_partial_public.json new file mode 100644 index 0000000..a352d00 --- /dev/null +++ b/tests/fixtures/bcl/bcl_players_stats_partial_public.json @@ -0,0 +1,25 @@ +{ + "data": [ + { + "player": { + "id": "bcl-player-99", + "name": "Alex Novak" + }, + "team": { + "id": "bcl-team-tenerife", + "name": "Lenovo Tenerife" + }, + "gp": 10, + "mpg": 27.2, + "ppg": 14.8, + "rpg": 4.1, + "apg": 3.3, + "spg": 1.2, + "bpg": 0.4, + "tov": 2.0, + "fg_pct": 47.3, + "three_pct": 38.0, + "ft_pct": 79.1 + } + ] +} diff --git a/tests/fixtures/lba/lba_players_stats_partial_public.json b/tests/fixtures/lba/lba_players_stats_partial_public.json new file mode 100644 index 0000000..f436b1b --- /dev/null +++ b/tests/fixtures/lba/lba_players_stats_partial_public.json @@ -0,0 +1,25 @@ +{ + "data": [ + { + "player": { + "id": "p-002", + "name": "Andrea Bianchi" + }, + "team": { + "id": "team-olimpia-milano", + "name": "Olimpia Milano" + }, + "gp": 18, + "mpg": 24.7, + "ppg": 12.3, + "rpg": 2.9, + "apg": 4.2, + "spg": 1.1, + "bpg": 0.1, + "tov": 1.8, + "fg_pct": 45.0, + "three_pct": 35.4, + "ft_pct": 82.7 + } + ] +} diff --git a/tests/test_bcl_extractor.py b/tests/test_bcl_extractor.py index 5130d35..daa93dc 100644 --- a/tests/test_bcl_extractor.py +++ b/tests/test_bcl_extractor.py @@ -8,6 +8,7 @@ import pytest from django.core.management import call_command from apps.ingestion.extractors.bcl import BCLSnapshotExtractor +from apps.ingestion.extractors.base import ExtractorNormalizationError from apps.ingestion.extractors.registry import create_extractor @@ -51,6 +52,56 @@ def test_bcl_extractor_normalizes_fixture_payload(tmp_path, settings): assert row["three_pt_pct"] == 37.2 +@pytest.mark.django_db +def test_bcl_extractor_accepts_partial_public_player_bio_fields(tmp_path, settings): + settings.EXTRACTOR_BCL_STATS_URL = "https://www.championsleague.basketball/public/stats.json" + settings.EXTRACTOR_BCL_SEASON_LABEL = "2025-2026" + settings.EXTRACTOR_BCL_COMPETITION_EXTERNAL_ID = "bcl" + settings.EXTRACTOR_BCL_COMPETITION_NAME = "Basketball Champions League" + + fixture_payload = _load_fixture("bcl/bcl_players_stats_partial_public.json") + + class FakeClient: + def get_json(self, *_args, **_kwargs): + return fixture_payload + + extractor = BCLSnapshotExtractor(http_client=FakeClient()) + output_path = tmp_path / "bcl-partial.json" + result = extractor.run(output_path=output_path, snapshot_date=date(2026, 3, 13)) + + assert result.records_count == 1 + payload = json.loads(output_path.read_text(encoding="utf-8")) + row = payload["records"][0] + assert row["full_name"] == "Alex Novak" + assert row["first_name"] is None + assert row["last_name"] is None + assert row["birth_date"] is None + assert row["nationality"] is None + assert row["height_cm"] is None + assert row["weight_kg"] is None + assert row["position"] is None + assert row["games_played"] == 10 + + +@pytest.mark.django_db +def test_bcl_extractor_still_fails_when_required_stats_are_missing(settings): + settings.EXTRACTOR_BCL_STATS_URL = "https://www.championsleague.basketball/public/stats.json" + settings.EXTRACTOR_BCL_SEASON_LABEL = "2025-2026" + settings.EXTRACTOR_BCL_COMPETITION_EXTERNAL_ID = "bcl" + settings.EXTRACTOR_BCL_COMPETITION_NAME = "Basketball Champions League" + + fixture_payload = _load_fixture("bcl/bcl_players_stats_partial_public.json") + fixture_payload["data"][0].pop("ppg") + + class FakeClient: + def get_json(self, *_args, **_kwargs): + return fixture_payload + + extractor = BCLSnapshotExtractor(http_client=FakeClient()) + with pytest.raises(ExtractorNormalizationError): + extractor.run(write_output=False, snapshot_date=date(2026, 3, 13)) + + @pytest.mark.django_db def test_bcl_extractor_registry_selection(settings): settings.EXTRACTOR_BCL_STATS_URL = "https://www.championsleague.basketball/public/stats.json" diff --git a/tests/test_extractors_framework.py b/tests/test_extractors_framework.py index cb0dd74..a63fcd9 100644 --- a/tests/test_extractors_framework.py +++ b/tests/test_extractors_framework.py @@ -7,8 +7,10 @@ import pytest from django.core.management import call_command from apps.ingestion.extractors.base import BaseSnapshotExtractor +from apps.ingestion.extractors.base import ExtractorNormalizationError from apps.ingestion.extractors.http import ResponsibleHttpClient from apps.ingestion.extractors.public_json import PublicJsonSnapshotExtractor +from apps.ingestion.snapshots.schema import REQUIRED_RECORD_FIELDS class DummyExtractor(BaseSnapshotExtractor): @@ -64,6 +66,29 @@ class _FakeResponse: return self._payload +def _minimal_public_json_record() -> dict: + return { + "competition_external_id": "comp-1", + "competition_name": "League One", + "season": "2025-2026", + "team_external_id": "team-1", + "team_name": "Team One", + "player_external_id": "player-1", + "full_name": "Jane Doe", + "games_played": 12, + "minutes_per_game": 27.2, + "points_per_game": 13.0, + "rebounds_per_game": 4.4, + "assists_per_game": 3.1, + "steals_per_game": 1.0, + "blocks_per_game": 0.3, + "turnovers_per_game": 1.8, + "fg_pct": 46.2, + "three_pt_pct": 35.5, + "ft_pct": 82.1, + } + + @pytest.mark.django_db def test_base_extractor_run_writes_snapshot_file(tmp_path, settings): settings.STATIC_DATASET_INCOMING_DIR = str(tmp_path / "incoming") @@ -135,6 +160,71 @@ def test_public_json_extractor_normalizes_common_field_aliases(tmp_path): assert row["three_pt_pct"] == 36.1 +@pytest.mark.django_db +def test_public_json_extractor_accepts_missing_optional_bio_and_physical_fields(tmp_path): + class FakeClient: + def get_json(self, *_args, **_kwargs): + return {"records": [_minimal_public_json_record()]} + + extractor = PublicJsonSnapshotExtractor( + url="https://example.com/public-feed.json", + source_name="test_public_feed", + http_client=FakeClient(), + ) + output_file = tmp_path / "public-optional.json" + result = extractor.run(output_path=output_file, snapshot_date=date(2026, 3, 13)) + + assert result.records_count == 1 + payload = json.loads(output_file.read_text(encoding="utf-8")) + row = payload["records"][0] + assert row["full_name"] == "Jane Doe" + assert row["first_name"] is None + assert row["last_name"] is None + assert row["birth_date"] is None + assert row["nationality"] is None + assert row["height_cm"] is None + assert row["weight_kg"] is None + assert row["position"] is None + assert row.get("role") is None + + +@pytest.mark.django_db +def test_public_json_extractor_fails_when_required_stat_missing(): + broken = _minimal_public_json_record() + broken.pop("points_per_game") + + class FakeClient: + def get_json(self, *_args, **_kwargs): + return {"records": [broken]} + + extractor = PublicJsonSnapshotExtractor( + url="https://example.com/public-feed.json", + source_name="test_public_feed", + http_client=FakeClient(), + ) + with pytest.raises(ExtractorNormalizationError): + extractor.run(write_output=False, snapshot_date=date(2026, 3, 13)) + + +@pytest.mark.django_db +@pytest.mark.parametrize("required_field", sorted(REQUIRED_RECORD_FIELDS)) +def test_public_json_required_fields_follow_snapshot_schema(required_field): + broken = _minimal_public_json_record() + broken.pop(required_field) + + class FakeClient: + def get_json(self, *_args, **_kwargs): + return {"records": [broken]} + + extractor = PublicJsonSnapshotExtractor( + url="https://example.com/public-feed.json", + source_name="test_public_feed", + http_client=FakeClient(), + ) + with pytest.raises(ExtractorNormalizationError, match="missing required fields"): + extractor.run(write_output=False, snapshot_date=date(2026, 3, 13)) + + @pytest.mark.django_db def test_run_extractor_management_command_writes_snapshot(tmp_path, settings): settings.EXTRACTOR_PUBLIC_JSON_URL = "https://example.com/feed.json" diff --git a/tests/test_import_snapshots_command.py b/tests/test_import_snapshots_command.py index ce47446..b997a56 100644 --- a/tests/test_import_snapshots_command.py +++ b/tests/test_import_snapshots_command.py @@ -103,6 +103,116 @@ def test_valid_snapshot_import(tmp_path, settings): assert PlayerSeasonStats.objects.count() == 1 +@pytest.mark.django_db +def test_snapshot_import_succeeds_with_optional_bio_and_physical_fields_missing(tmp_path, settings): + incoming = tmp_path / "incoming" + archive = tmp_path / "archive" + failed = tmp_path / "failed" + incoming.mkdir() + archive.mkdir() + failed.mkdir() + + payload = _valid_payload() + for optional_field in ("first_name", "last_name", "birth_date", "nationality", "height_cm", "weight_kg", "position", "role"): + payload["records"][0].pop(optional_field, None) + + file_path = incoming / "optional-missing.json" + _write_json(file_path, payload) + + settings.STATIC_DATASET_INCOMING_DIR = str(incoming) + settings.STATIC_DATASET_ARCHIVE_DIR = str(archive) + settings.STATIC_DATASET_FAILED_DIR = str(failed) + + call_command("import_snapshots") + + run = ImportRun.objects.get() + assert run.status == ImportRun.RunStatus.SUCCESS + player = Player.objects.get(source_uid="player-23") + assert player.first_name == "LeBron" + assert player.last_name == "James" + assert player.birth_date is None + assert player.nationality is None + assert player.nominal_position is None + assert player.height_cm is None + assert player.weight_kg is None + assert PlayerSeasonStats.objects.count() == 1 + + +@pytest.mark.django_db +def test_snapshot_import_preserves_single_name_part_without_forced_split(tmp_path, settings): + incoming = tmp_path / "incoming" + archive = tmp_path / "archive" + failed = tmp_path / "failed" + incoming.mkdir() + archive.mkdir() + failed.mkdir() + + payload = _valid_payload() + row = payload["records"][0] + row["first_name"] = "LeBron" + row.pop("last_name") + + file_path = incoming / "single-name-part.json" + _write_json(file_path, payload) + + settings.STATIC_DATASET_INCOMING_DIR = str(incoming) + settings.STATIC_DATASET_ARCHIVE_DIR = str(archive) + settings.STATIC_DATASET_FAILED_DIR = str(failed) + + call_command("import_snapshots") + + run = ImportRun.objects.get() + assert run.status == ImportRun.RunStatus.SUCCESS + player = Player.objects.get(source_uid="player-23") + assert player.first_name == "LeBron" + assert player.last_name == "" + + +@pytest.mark.django_db +@pytest.mark.parametrize( + ("source_name", "competition_id", "competition_name"), + [ + ("lba", "lba-serie-a", "Lega Basket Serie A"), + ("bcl", "bcl", "Basketball Champions League"), + ], +) +def test_partial_public_source_snapshot_imports_for_lba_and_bcl( + tmp_path, + settings, + source_name, + competition_id, + competition_name, +): + incoming = tmp_path / "incoming" + archive = tmp_path / "archive" + failed = tmp_path / "failed" + incoming.mkdir() + archive.mkdir() + failed.mkdir() + + payload = _valid_payload() + payload["source_name"] = source_name + row = payload["records"][0] + row["competition_external_id"] = competition_id + row["competition_name"] = competition_name + for optional_field in ("first_name", "last_name", "birth_date", "nationality", "height_cm", "weight_kg", "position", "role"): + row.pop(optional_field, None) + + _write_json(incoming / f"{source_name}.json", payload) + + settings.STATIC_DATASET_INCOMING_DIR = str(incoming) + settings.STATIC_DATASET_ARCHIVE_DIR = str(archive) + settings.STATIC_DATASET_FAILED_DIR = str(failed) + + call_command("import_snapshots") + + run = ImportRun.objects.get() + assert run.status == ImportRun.RunStatus.SUCCESS + assert Competition.objects.filter(source_uid=competition_id, name=competition_name).exists() + assert Player.objects.filter(source_uid="player-23").exists() + assert PlayerSeasonStats.objects.count() == 1 + + @pytest.mark.django_db def test_invalid_snapshot_rejected_and_moved_to_failed(tmp_path, settings): incoming = tmp_path / "incoming" diff --git a/tests/test_lba_extractor.py b/tests/test_lba_extractor.py index aadc278..dece794 100644 --- a/tests/test_lba_extractor.py +++ b/tests/test_lba_extractor.py @@ -8,6 +8,7 @@ import pytest from django.core.management import call_command from apps.ingestion.extractors.lba import LBASnapshotExtractor +from apps.ingestion.extractors.base import ExtractorNormalizationError from apps.ingestion.extractors.registry import create_extractor @@ -51,6 +52,56 @@ def test_lba_extractor_normalizes_fixture_payload(tmp_path, settings): assert row["three_pt_pct"] == 36.5 +@pytest.mark.django_db +def test_lba_extractor_accepts_partial_public_player_bio_fields(tmp_path, settings): + settings.EXTRACTOR_LBA_STATS_URL = "https://www.legabasket.it/public/stats.json" + settings.EXTRACTOR_LBA_SEASON_LABEL = "2025-2026" + settings.EXTRACTOR_LBA_COMPETITION_EXTERNAL_ID = "lba-serie-a" + settings.EXTRACTOR_LBA_COMPETITION_NAME = "Lega Basket Serie A" + + fixture_payload = _load_fixture("lba/lba_players_stats_partial_public.json") + + class FakeClient: + def get_json(self, *_args, **_kwargs): + return fixture_payload + + extractor = LBASnapshotExtractor(http_client=FakeClient()) + output_path = tmp_path / "lba-partial.json" + result = extractor.run(output_path=output_path, snapshot_date=date(2026, 3, 13)) + + assert result.records_count == 1 + payload = json.loads(output_path.read_text(encoding="utf-8")) + row = payload["records"][0] + assert row["full_name"] == "Andrea Bianchi" + assert row["first_name"] is None + assert row["last_name"] is None + assert row["birth_date"] is None + assert row["nationality"] is None + assert row["height_cm"] is None + assert row["weight_kg"] is None + assert row["position"] is None + assert row["games_played"] == 18 + + +@pytest.mark.django_db +def test_lba_extractor_still_fails_when_required_stats_are_missing(settings): + settings.EXTRACTOR_LBA_STATS_URL = "https://www.legabasket.it/public/stats.json" + settings.EXTRACTOR_LBA_SEASON_LABEL = "2025-2026" + settings.EXTRACTOR_LBA_COMPETITION_EXTERNAL_ID = "lba-serie-a" + settings.EXTRACTOR_LBA_COMPETITION_NAME = "Lega Basket Serie A" + + fixture_payload = _load_fixture("lba/lba_players_stats_partial_public.json") + fixture_payload["data"][0].pop("ppg") + + class FakeClient: + def get_json(self, *_args, **_kwargs): + return fixture_payload + + extractor = LBASnapshotExtractor(http_client=FakeClient()) + with pytest.raises(ExtractorNormalizationError): + extractor.run(write_output=False, snapshot_date=date(2026, 3, 13)) + + @pytest.mark.django_db def test_lba_extractor_registry_selection(settings): settings.EXTRACTOR_LBA_STATS_URL = "https://www.legabasket.it/public/stats.json"