Merge branch 'feature/v2-public-source-schema-relaxation' into feature/hoopscout-v2-static-architecture

This commit is contained in:
Alfredo Di Stasio
2026-03-20 15:51:09 +01:00
13 changed files with 534 additions and 34 deletions

View File

@ -63,6 +63,15 @@ git checkout -b feature/your-feature-name
3. Implement with focused commits and tests.
4. Open PR: `feature/*` -> `develop`.
## Running Tests (v2)
Runtime images are intentionally lean and may not ship `pytest`.
Use the development compose stack and install dev dependencies before running tests:
```bash
docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm web sh -lc "export PYTHONUSERBASE=/tmp/pyuser && python -m pip install --user -r requirements/dev.txt && python -m pytest -q"
```
## PR Checklist
- [ ] Target branch is correct

View File

@ -156,11 +156,23 @@ Each file must be a JSON object:
Validation is strict:
- unknown fields are rejected
- required fields must exist:
  - `competition_external_id`, `competition_name`, `season`
  - `team_external_id`, `team_name`
  - `player_external_id`, `full_name`
  - core stats (`games_played`, `minutes_per_game`, `points_per_game`, `rebounds_per_game`, `assists_per_game`, `steals_per_game`, `blocks_per_game`, `turnovers_per_game`, `fg_pct`, `three_pt_pct`, `ft_pct`)
- optional player bio/physical fields:
  - `first_name`, `last_name`, `birth_date`, `nationality`, `height_cm`, `weight_kg`, `position`, `role`
  - when `birth_date` is provided it must be `YYYY-MM-DD`
- numeric fields must be numeric
- invalid files are moved to the failed directory
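As an illustration of the contract above, a record needs only the identity and core-stat fields to pass validation; the bio/physical fields may be absent entirely. This is a hedged sketch — the field values below are hypothetical sample data, and the `REQUIRED`/`OPTIONAL` sets simply restate the lists above:

```python
# Minimal valid record: identity + core stats only; optional bio/physical
# fields (first_name, birth_date, height_cm, ...) are simply omitted.
# All values are illustrative sample data.
minimal_record = {
    "competition_external_id": "lba-serie-a",
    "competition_name": "Lega Basket Serie A",
    "season": "2025-2026",
    "team_external_id": "team-1",
    "team_name": "Team One",
    "player_external_id": "player-1",
    "full_name": "Jane Doe",
    "games_played": 12,
    "minutes_per_game": 27.2,
    "points_per_game": 13.0,
    "rebounds_per_game": 4.4,
    "assists_per_game": 3.1,
    "steals_per_game": 1.0,
    "blocks_per_game": 0.3,
    "turnovers_per_game": 1.8,
    "fg_pct": 46.2,
    "three_pt_pct": 35.5,
    "ft_pct": 82.1,
}

REQUIRED = {
    "competition_external_id", "competition_name", "season",
    "team_external_id", "team_name", "player_external_id", "full_name",
    "games_played", "minutes_per_game", "points_per_game",
    "rebounds_per_game", "assists_per_game", "steals_per_game",
    "blocks_per_game", "turnovers_per_game", "fg_pct",
    "three_pt_pct", "ft_pct",
}
OPTIONAL = {"first_name", "last_name", "birth_date", "nationality",
            "height_cm", "weight_kg", "position"}

# The strict-validation check: every required field must be present and non-empty.
missing = [k for k in REQUIRED if minimal_record.get(k) in (None, "")]
print(missing)  # -> []
```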
Importer enrichment note:
- `full_name` is source truth for identity display
- `first_name` / `last_name` are optional and may be absent in public snapshots
- when both are missing, importer may derive them from `full_name` as a best-effort enrichment step
- this enrichment is convenience-only and does not override source truth semantics
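The best-effort derivation amounts to a whitespace split on `full_name`, applied only when both name parts are absent. A minimal sketch of that behaviour (standalone helper, not the importer's full code path):

```python
def split_name_parts(full_name: str) -> tuple[str, str]:
    """Best-effort split of a display name into (first, last).

    Single-token names keep the token as first_name and leave
    last_name empty; multi-token names split on the first space.
    """
    parts = full_name.strip().split(maxsplit=1)
    if not parts:
        return "", ""
    if len(parts) == 1:
        return parts[0], ""
    return parts[0], parts[1]


# Enrichment only kicks in when both name parts are missing in the source.
record = {"full_name": "Alex Novak"}  # hypothetical public-source record
first = record.get("first_name") or ""
last = record.get("last_name") or ""
if not first and not last:
    first, last = split_name_parts(record["full_name"])
print(first, last)  # -> Alex Novak
```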
## Import Command
Run import:
@ -278,6 +290,7 @@ Notes:
- extraction is intentionally low-frequency and uses retries conservatively
- only public pages/endpoints should be targeted
- emitted snapshots must match the same schema consumed by `import_snapshots`
- `public_json_snapshot` uses the same required-vs-optional field contract as `SnapshotSchemaValidator` (no stricter extractor-only required bio/physical fields)
- optional scheduler container runs `scripts/scheduler.sh` loop using:
  - image: `registry.younerd.org/hoopscout/scheduler:${APP_IMAGE_TAG:-latest}`
  - command: `/app/scripts/scheduler.sh`
@ -304,6 +317,7 @@ Notes:
- season is configured by `EXTRACTOR_LBA_SEASON_LABEL`
- parser supports payload keys: `records`, `data`, `players`, `items`
- normalization supports nested `player` and `team` objects with common stat aliases (`gp/mpg/ppg/rpg/apg/spg/bpg/tov`)
- public-source player bio/physical fields are often incomplete; extractor allows them to be missing and emits `null` for optional fields
- no live HTTP calls in tests; tests use fixtures/mocked responses only
### BCL extractor assumptions and limitations (MVP)
@ -316,8 +330,19 @@ Notes:
- season is configured by `EXTRACTOR_BCL_SEASON_LABEL`
- parser supports payload keys: `records`, `data`, `players`, `items`
- normalization supports nested `player` and `team` objects with common stat aliases (`gp/mpg/ppg/rpg/apg/spg/bpg/tov`)
- public-source player bio/physical fields are often incomplete; extractor allows them to be missing and emits `null` for optional fields
- no live HTTP calls in tests; tests use fixtures/mocked responses only
## Testing
- runtime `web` image stays lean and may not include `pytest` tooling
- run tests with the development compose stack (or a dedicated test image/profile) and install dev dependencies first
- local example (one-off):
```bash
docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm web sh -lc "export PYTHONUSERBASE=/tmp/pyuser && python -m pip install --user -r requirements/dev.txt && python -m pytest -q"
```
## Migration and Superuser Commands
```bash

View File

@ -16,6 +16,38 @@ def _first_non_empty(record: dict[str, Any], *keys: str) -> Any:
    return None


def _first_non_empty_text(record: dict[str, Any], *keys: str) -> str | None:
    for key in keys:
        value = record.get(key)
        if isinstance(value, str):
            stripped = value.strip()
            if stripped:
                return stripped
    return None


ESSENTIAL_FIELDS = {
    "competition_external_id",
    "competition_name",
    "season",
    "team_external_id",
    "team_name",
    "player_external_id",
    "full_name",
    "games_played",
    "minutes_per_game",
    "points_per_game",
    "rebounds_per_game",
    "assists_per_game",
    "steals_per_game",
    "blocks_per_game",
    "turnovers_per_game",
    "fg_pct",
    "three_pt_pct",
    "ft_pct",
}


class BCLSnapshotExtractor(BaseSnapshotExtractor):
    """
    Basketball Champions League MVP extractor.
@ -86,7 +118,9 @@ class BCLSnapshotExtractor(BaseSnapshotExtractor):
        team_external_id = _first_non_empty(source_record, "team_external_id", "team_id") or _first_non_empty(
            team_obj, "id", "team_id"
        )
        team_name = _first_non_empty_text(source_record, "team_name", "team") or _first_non_empty_text(
            team_obj, "name"
        )

        normalized = {
            "competition_external_id": self.competition_external_id,
@ -122,7 +156,7 @@ class BCLSnapshotExtractor(BaseSnapshotExtractor):
            "ft_pct": _first_non_empty(source_record, "ft_pct", "ft_percentage"),
        }

        missing = [key for key in ESSENTIAL_FIELDS if normalized.get(key) in (None, "")]
        if missing:
            raise ExtractorNormalizationError(f"bcl row missing required fields: {', '.join(sorted(missing))}")

View File

@ -16,6 +16,38 @@ def _first_non_empty(record: dict[str, Any], *keys: str) -> Any:
    return None


def _first_non_empty_text(record: dict[str, Any], *keys: str) -> str | None:
    for key in keys:
        value = record.get(key)
        if isinstance(value, str):
            stripped = value.strip()
            if stripped:
                return stripped
    return None


ESSENTIAL_FIELDS = {
    "competition_external_id",
    "competition_name",
    "season",
    "team_external_id",
    "team_name",
    "player_external_id",
    "full_name",
    "games_played",
    "minutes_per_game",
    "points_per_game",
    "rebounds_per_game",
    "assists_per_game",
    "steals_per_game",
    "blocks_per_game",
    "turnovers_per_game",
    "fg_pct",
    "three_pt_pct",
    "ft_pct",
}


class LBASnapshotExtractor(BaseSnapshotExtractor):
    """
    LBA (Lega Basket Serie A) MVP extractor.
@ -86,7 +118,9 @@ class LBASnapshotExtractor(BaseSnapshotExtractor):
        team_external_id = _first_non_empty(source_record, "team_external_id", "team_id") or _first_non_empty(
            team_obj, "id", "team_id"
        )
        team_name = _first_non_empty_text(source_record, "team_name", "team") or _first_non_empty_text(
            team_obj, "name"
        )

        normalized = {
            "competition_external_id": self.competition_external_id,
@ -122,7 +156,7 @@ class LBASnapshotExtractor(BaseSnapshotExtractor):
            "ft_pct": _first_non_empty(source_record, "ft_pct", "ft_percentage"),
        }

        missing = [key for key in ESSENTIAL_FIELDS if normalized.get(key) in (None, "")]
        if missing:
            raise ExtractorNormalizationError(f"lba row missing required fields: {', '.join(sorted(missing))}")

View File

@ -4,6 +4,8 @@ from typing import Any
from django.conf import settings

from apps.ingestion.snapshots.schema import REQUIRED_RECORD_FIELDS

from .base import (
    BaseSnapshotExtractor,
    ExtractorConfigError,
@ -113,7 +115,7 @@ class PublicJsonSnapshotExtractor(BaseSnapshotExtractor):
            "ft_pct": _first_non_empty(source_record, "ft_pct"),
        }

        missing = [key for key in REQUIRED_RECORD_FIELDS if normalized.get(key) in (None, "")]
        if missing:
            raise ExtractorNormalizationError(
                f"public_json_snapshot row missing required fields: {', '.join(sorted(missing))}"

View File

@ -62,6 +62,21 @@ def _parse_season_dates(label: str) -> tuple[date, date]:
    return date(year, 9, 1), date(year + 1, 7, 31)


def _parse_optional_birth_date(value: str | None) -> date | None:
    if value in (None, ""):
        return None
    return parse_date(value)


def _split_name_parts(full_name: str) -> tuple[str, str]:
    parts = full_name.strip().split(maxsplit=1)
    if not parts:
        return "", ""
    if len(parts) == 1:
        return parts[0], ""
    return parts[0], parts[1]


def _resolve_nationality(value: str | None) -> Nationality | None:
    if not value:
        return None
@ -152,9 +167,12 @@ def _upsert_record(record: dict[str, Any], *, source_name: str, snapshot_date: d
        },
    )

    position_value = record.get("position")
    position = None
    if position_value:
        position, _ = Position.objects.get_or_create(
            code=_position_code(position_value),
            defaults={"name": position_value},
        )
    role = None
    if record.get("role"):
@ -163,19 +181,24 @@ def _upsert_record(record: dict[str, Any], *, source_name: str, snapshot_date: d
            defaults={"name": record["role"]},
        )

    first_name = record.get("first_name") or ""
    last_name = record.get("last_name") or ""
    if not first_name and not last_name:
        first_name, last_name = _split_name_parts(record["full_name"])

    player, _ = Player.objects.update_or_create(
        source_name=source_key,
        source_uid=record["player_external_id"],
        defaults={
            "first_name": first_name,
            "last_name": last_name,
            "full_name": record["full_name"],
            "birth_date": _parse_optional_birth_date(record.get("birth_date")),
            "nationality": _resolve_nationality(record.get("nationality")),
            "nominal_position": position,
            "inferred_role": role,
            "height_cm": record.get("height_cm"),
            "weight_kg": record.get("weight_kg"),
            "is_active": True,
        },
    )

View File

@ -14,13 +14,6 @@ REQUIRED_RECORD_FIELDS = {
    "team_name",
    "player_external_id",
    "full_name",
    "games_played",
    "minutes_per_game",
    "points_per_game",
@ -34,6 +27,16 @@ REQUIRED_RECORD_FIELDS = {
    "ft_pct",
}

OPTIONAL_RECORD_FIELDS = {
    "first_name",
    "last_name",
    "birth_date",
    "nationality",
    "height_cm",
    "weight_kg",
    "position",
}

ALLOWED_TOP_LEVEL_FIELDS = {
    "source_name",
    "snapshot_date",
@ -42,7 +45,7 @@ ALLOWED_TOP_LEVEL_FIELDS = {
    "raw_payload",
}

ALLOWED_RECORD_FIELDS = REQUIRED_RECORD_FIELDS | OPTIONAL_RECORD_FIELDS | {
    "role",
    "source_metadata",
    "raw_payload",
@ -69,6 +72,15 @@ class SnapshotSchemaValidator:
            raise SnapshotValidationError(f"{field} must be a non-empty string")
        return value.strip()

    @staticmethod
    def _optional_string(value: Any, field: str) -> str | None:
        if value in (None, ""):
            return None
        if not isinstance(value, str):
            raise SnapshotValidationError(f"{field} must be a string when provided")
        stripped = value.strip()
        return stripped or None

    @staticmethod
    def _require_non_negative_int(value: Any, field: str) -> int:
        if isinstance(value, bool):
@ -81,6 +93,12 @@ class SnapshotSchemaValidator:
            raise SnapshotValidationError(f"{field} must be a non-negative integer")
        return parsed

    @classmethod
    def _optional_non_negative_int(cls, value: Any, field: str) -> int | None:
        if value in (None, ""):
            return None
        return cls._require_non_negative_int(value, field)

    @staticmethod
    def _require_float(value: Any, field: str) -> float:
        try:
@ -112,23 +130,26 @@ class SnapshotSchemaValidator:
            "team_name",
            "player_external_id",
            "full_name",
        ):
            normalized[field] = cls._require_string(record.get(field), f"record[{index}].{field}")
        for field in ("first_name", "last_name", "nationality", "position"):
            normalized[field] = cls._optional_string(record.get(field), f"record[{index}].{field}")
        if record.get("role") is not None:
            normalized["role"] = cls._require_string(record.get("role"), f"record[{index}].role")

        birth_date_raw = record.get("birth_date")
        if birth_date_raw in (None, ""):
            normalized["birth_date"] = None
        else:
            birth_date = parse_date(str(birth_date_raw))
            if not birth_date:
                raise SnapshotValidationError(f"record[{index}].birth_date must be YYYY-MM-DD")
            normalized["birth_date"] = birth_date.isoformat()

        normalized["height_cm"] = cls._optional_non_negative_int(record.get("height_cm"), f"record[{index}].height_cm")
        normalized["weight_kg"] = cls._optional_non_negative_int(record.get("weight_kg"), f"record[{index}].weight_kg")
        normalized["games_played"] = cls._require_non_negative_int(record.get("games_played"), f"record[{index}].games_played")
        for field in (

View File

@ -0,0 +1,25 @@
{
"data": [
{
"player": {
"id": "bcl-player-99",
"name": "Alex Novak"
},
"team": {
"id": "bcl-team-tenerife",
"name": "Lenovo Tenerife"
},
"gp": 10,
"mpg": 27.2,
"ppg": 14.8,
"rpg": 4.1,
"apg": 3.3,
"spg": 1.2,
"bpg": 0.4,
"tov": 2.0,
"fg_pct": 47.3,
"three_pct": 38.0,
"ft_pct": 79.1
}
]
}

View File

@ -0,0 +1,25 @@
{
"data": [
{
"player": {
"id": "p-002",
"name": "Andrea Bianchi"
},
"team": {
"id": "team-olimpia-milano",
"name": "Olimpia Milano"
},
"gp": 18,
"mpg": 24.7,
"ppg": 12.3,
"rpg": 2.9,
"apg": 4.2,
"spg": 1.1,
"bpg": 0.1,
"tov": 1.8,
"fg_pct": 45.0,
"three_pct": 35.4,
"ft_pct": 82.7
}
]
}

View File

@ -8,6 +8,7 @@ import pytest
from django.core.management import call_command

from apps.ingestion.extractors.bcl import BCLSnapshotExtractor
from apps.ingestion.extractors.base import ExtractorNormalizationError
from apps.ingestion.extractors.registry import create_extractor
@ -51,6 +52,56 @@ def test_bcl_extractor_normalizes_fixture_payload(tmp_path, settings):
    assert row["three_pt_pct"] == 37.2


@pytest.mark.django_db
def test_bcl_extractor_accepts_partial_public_player_bio_fields(tmp_path, settings):
    settings.EXTRACTOR_BCL_STATS_URL = "https://www.championsleague.basketball/public/stats.json"
    settings.EXTRACTOR_BCL_SEASON_LABEL = "2025-2026"
    settings.EXTRACTOR_BCL_COMPETITION_EXTERNAL_ID = "bcl"
    settings.EXTRACTOR_BCL_COMPETITION_NAME = "Basketball Champions League"
    fixture_payload = _load_fixture("bcl/bcl_players_stats_partial_public.json")

    class FakeClient:
        def get_json(self, *_args, **_kwargs):
            return fixture_payload

    extractor = BCLSnapshotExtractor(http_client=FakeClient())
    output_path = tmp_path / "bcl-partial.json"
    result = extractor.run(output_path=output_path, snapshot_date=date(2026, 3, 13))

    assert result.records_count == 1
    payload = json.loads(output_path.read_text(encoding="utf-8"))
    row = payload["records"][0]
    assert row["full_name"] == "Alex Novak"
    assert row["first_name"] is None
    assert row["last_name"] is None
    assert row["birth_date"] is None
    assert row["nationality"] is None
    assert row["height_cm"] is None
    assert row["weight_kg"] is None
    assert row["position"] is None
    assert row["games_played"] == 10


@pytest.mark.django_db
def test_bcl_extractor_still_fails_when_required_stats_are_missing(settings):
    settings.EXTRACTOR_BCL_STATS_URL = "https://www.championsleague.basketball/public/stats.json"
    settings.EXTRACTOR_BCL_SEASON_LABEL = "2025-2026"
    settings.EXTRACTOR_BCL_COMPETITION_EXTERNAL_ID = "bcl"
    settings.EXTRACTOR_BCL_COMPETITION_NAME = "Basketball Champions League"
    fixture_payload = _load_fixture("bcl/bcl_players_stats_partial_public.json")
    fixture_payload["data"][0].pop("ppg")

    class FakeClient:
        def get_json(self, *_args, **_kwargs):
            return fixture_payload

    extractor = BCLSnapshotExtractor(http_client=FakeClient())
    with pytest.raises(ExtractorNormalizationError):
        extractor.run(write_output=False, snapshot_date=date(2026, 3, 13))


@pytest.mark.django_db
def test_bcl_extractor_registry_selection(settings):
    settings.EXTRACTOR_BCL_STATS_URL = "https://www.championsleague.basketball/public/stats.json"

View File

@ -7,8 +7,10 @@ import pytest
from django.core.management import call_command

from apps.ingestion.extractors.base import BaseSnapshotExtractor
from apps.ingestion.extractors.base import ExtractorNormalizationError
from apps.ingestion.extractors.http import ResponsibleHttpClient
from apps.ingestion.extractors.public_json import PublicJsonSnapshotExtractor
from apps.ingestion.snapshots.schema import REQUIRED_RECORD_FIELDS
class DummyExtractor(BaseSnapshotExtractor):
@ -64,6 +66,29 @@ class _FakeResponse:
        return self._payload


def _minimal_public_json_record() -> dict:
    return {
        "competition_external_id": "comp-1",
        "competition_name": "League One",
        "season": "2025-2026",
        "team_external_id": "team-1",
        "team_name": "Team One",
        "player_external_id": "player-1",
        "full_name": "Jane Doe",
        "games_played": 12,
        "minutes_per_game": 27.2,
        "points_per_game": 13.0,
        "rebounds_per_game": 4.4,
        "assists_per_game": 3.1,
        "steals_per_game": 1.0,
        "blocks_per_game": 0.3,
        "turnovers_per_game": 1.8,
        "fg_pct": 46.2,
        "three_pt_pct": 35.5,
        "ft_pct": 82.1,
    }


@pytest.mark.django_db
def test_base_extractor_run_writes_snapshot_file(tmp_path, settings):
    settings.STATIC_DATASET_INCOMING_DIR = str(tmp_path / "incoming")
@ -135,6 +160,71 @@ def test_public_json_extractor_normalizes_common_field_aliases(tmp_path):
    assert row["three_pt_pct"] == 36.1


@pytest.mark.django_db
def test_public_json_extractor_accepts_missing_optional_bio_and_physical_fields(tmp_path):
    class FakeClient:
        def get_json(self, *_args, **_kwargs):
            return {"records": [_minimal_public_json_record()]}

    extractor = PublicJsonSnapshotExtractor(
        url="https://example.com/public-feed.json",
        source_name="test_public_feed",
        http_client=FakeClient(),
    )
    output_file = tmp_path / "public-optional.json"
    result = extractor.run(output_path=output_file, snapshot_date=date(2026, 3, 13))

    assert result.records_count == 1
    payload = json.loads(output_file.read_text(encoding="utf-8"))
    row = payload["records"][0]
    assert row["full_name"] == "Jane Doe"
    assert row["first_name"] is None
    assert row["last_name"] is None
    assert row["birth_date"] is None
    assert row["nationality"] is None
    assert row["height_cm"] is None
    assert row["weight_kg"] is None
    assert row["position"] is None
    assert row.get("role") is None


@pytest.mark.django_db
def test_public_json_extractor_fails_when_required_stat_missing():
    broken = _minimal_public_json_record()
    broken.pop("points_per_game")

    class FakeClient:
        def get_json(self, *_args, **_kwargs):
            return {"records": [broken]}

    extractor = PublicJsonSnapshotExtractor(
        url="https://example.com/public-feed.json",
        source_name="test_public_feed",
        http_client=FakeClient(),
    )
    with pytest.raises(ExtractorNormalizationError):
        extractor.run(write_output=False, snapshot_date=date(2026, 3, 13))


@pytest.mark.django_db
@pytest.mark.parametrize("required_field", sorted(REQUIRED_RECORD_FIELDS))
def test_public_json_required_fields_follow_snapshot_schema(required_field):
    broken = _minimal_public_json_record()
    broken.pop(required_field)

    class FakeClient:
        def get_json(self, *_args, **_kwargs):
            return {"records": [broken]}

    extractor = PublicJsonSnapshotExtractor(
        url="https://example.com/public-feed.json",
        source_name="test_public_feed",
        http_client=FakeClient(),
    )
    with pytest.raises(ExtractorNormalizationError, match="missing required fields"):
        extractor.run(write_output=False, snapshot_date=date(2026, 3, 13))


@pytest.mark.django_db
def test_run_extractor_management_command_writes_snapshot(tmp_path, settings):
    settings.EXTRACTOR_PUBLIC_JSON_URL = "https://example.com/feed.json"

View File

@ -103,6 +103,116 @@ def test_valid_snapshot_import(tmp_path, settings):
    assert PlayerSeasonStats.objects.count() == 1


@pytest.mark.django_db
def test_snapshot_import_succeeds_with_optional_bio_and_physical_fields_missing(tmp_path, settings):
    incoming = tmp_path / "incoming"
    archive = tmp_path / "archive"
    failed = tmp_path / "failed"
    incoming.mkdir()
    archive.mkdir()
    failed.mkdir()

    payload = _valid_payload()
    for optional_field in ("first_name", "last_name", "birth_date", "nationality", "height_cm", "weight_kg", "position", "role"):
        payload["records"][0].pop(optional_field, None)
    file_path = incoming / "optional-missing.json"
    _write_json(file_path, payload)

    settings.STATIC_DATASET_INCOMING_DIR = str(incoming)
    settings.STATIC_DATASET_ARCHIVE_DIR = str(archive)
    settings.STATIC_DATASET_FAILED_DIR = str(failed)

    call_command("import_snapshots")

    run = ImportRun.objects.get()
    assert run.status == ImportRun.RunStatus.SUCCESS
    player = Player.objects.get(source_uid="player-23")
    assert player.first_name == "LeBron"
    assert player.last_name == "James"
    assert player.birth_date is None
    assert player.nationality is None
    assert player.nominal_position is None
    assert player.height_cm is None
    assert player.weight_kg is None
    assert PlayerSeasonStats.objects.count() == 1


@pytest.mark.django_db
def test_snapshot_import_preserves_single_name_part_without_forced_split(tmp_path, settings):
    incoming = tmp_path / "incoming"
    archive = tmp_path / "archive"
    failed = tmp_path / "failed"
    incoming.mkdir()
    archive.mkdir()
    failed.mkdir()

    payload = _valid_payload()
    row = payload["records"][0]
    row["first_name"] = "LeBron"
    row.pop("last_name")
    file_path = incoming / "single-name-part.json"
    _write_json(file_path, payload)

    settings.STATIC_DATASET_INCOMING_DIR = str(incoming)
    settings.STATIC_DATASET_ARCHIVE_DIR = str(archive)
    settings.STATIC_DATASET_FAILED_DIR = str(failed)

    call_command("import_snapshots")

    run = ImportRun.objects.get()
    assert run.status == ImportRun.RunStatus.SUCCESS
    player = Player.objects.get(source_uid="player-23")
    assert player.first_name == "LeBron"
    assert player.last_name == ""


@pytest.mark.django_db
@pytest.mark.parametrize(
    ("source_name", "competition_id", "competition_name"),
    [
        ("lba", "lba-serie-a", "Lega Basket Serie A"),
        ("bcl", "bcl", "Basketball Champions League"),
    ],
)
def test_partial_public_source_snapshot_imports_for_lba_and_bcl(
    tmp_path,
    settings,
    source_name,
    competition_id,
    competition_name,
):
    incoming = tmp_path / "incoming"
    archive = tmp_path / "archive"
    failed = tmp_path / "failed"
    incoming.mkdir()
    archive.mkdir()
    failed.mkdir()

    payload = _valid_payload()
    payload["source_name"] = source_name
    row = payload["records"][0]
    row["competition_external_id"] = competition_id
    row["competition_name"] = competition_name
    for optional_field in ("first_name", "last_name", "birth_date", "nationality", "height_cm", "weight_kg", "position", "role"):
        row.pop(optional_field, None)
    _write_json(incoming / f"{source_name}.json", payload)

    settings.STATIC_DATASET_INCOMING_DIR = str(incoming)
    settings.STATIC_DATASET_ARCHIVE_DIR = str(archive)
    settings.STATIC_DATASET_FAILED_DIR = str(failed)

    call_command("import_snapshots")

    run = ImportRun.objects.get()
    assert run.status == ImportRun.RunStatus.SUCCESS
    assert Competition.objects.filter(source_uid=competition_id, name=competition_name).exists()
    assert Player.objects.filter(source_uid="player-23").exists()
    assert PlayerSeasonStats.objects.count() == 1


@pytest.mark.django_db
def test_invalid_snapshot_rejected_and_moved_to_failed(tmp_path, settings):
    incoming = tmp_path / "incoming"

View File

@ -8,6 +8,7 @@ import pytest
from django.core.management import call_command

from apps.ingestion.extractors.lba import LBASnapshotExtractor
from apps.ingestion.extractors.base import ExtractorNormalizationError
from apps.ingestion.extractors.registry import create_extractor
@ -51,6 +52,56 @@ def test_lba_extractor_normalizes_fixture_payload(tmp_path, settings):
    assert row["three_pt_pct"] == 36.5


@pytest.mark.django_db
def test_lba_extractor_accepts_partial_public_player_bio_fields(tmp_path, settings):
    settings.EXTRACTOR_LBA_STATS_URL = "https://www.legabasket.it/public/stats.json"
    settings.EXTRACTOR_LBA_SEASON_LABEL = "2025-2026"
    settings.EXTRACTOR_LBA_COMPETITION_EXTERNAL_ID = "lba-serie-a"
    settings.EXTRACTOR_LBA_COMPETITION_NAME = "Lega Basket Serie A"
    fixture_payload = _load_fixture("lba/lba_players_stats_partial_public.json")

    class FakeClient:
        def get_json(self, *_args, **_kwargs):
            return fixture_payload

    extractor = LBASnapshotExtractor(http_client=FakeClient())
    output_path = tmp_path / "lba-partial.json"
    result = extractor.run(output_path=output_path, snapshot_date=date(2026, 3, 13))

    assert result.records_count == 1
    payload = json.loads(output_path.read_text(encoding="utf-8"))
    row = payload["records"][0]
    assert row["full_name"] == "Andrea Bianchi"
    assert row["first_name"] is None
    assert row["last_name"] is None
    assert row["birth_date"] is None
    assert row["nationality"] is None
    assert row["height_cm"] is None
    assert row["weight_kg"] is None
    assert row["position"] is None
    assert row["games_played"] == 18


@pytest.mark.django_db
def test_lba_extractor_still_fails_when_required_stats_are_missing(settings):
    settings.EXTRACTOR_LBA_STATS_URL = "https://www.legabasket.it/public/stats.json"
    settings.EXTRACTOR_LBA_SEASON_LABEL = "2025-2026"
    settings.EXTRACTOR_LBA_COMPETITION_EXTERNAL_ID = "lba-serie-a"
    settings.EXTRACTOR_LBA_COMPETITION_NAME = "Lega Basket Serie A"
    fixture_payload = _load_fixture("lba/lba_players_stats_partial_public.json")
    fixture_payload["data"][0].pop("ppg")

    class FakeClient:
        def get_json(self, *_args, **_kwargs):
            return fixture_payload

    extractor = LBASnapshotExtractor(http_client=FakeClient())
    with pytest.raises(ExtractorNormalizationError):
        extractor.run(write_output=False, snapshot_date=date(2026, 3, 13))


@pytest.mark.django_db
def test_lba_extractor_registry_selection(settings):
    settings.EXTRACTOR_LBA_STATS_URL = "https://www.legabasket.it/public/stats.json"