diff --git a/README.md b/README.md index 4682c67..d28b5ae 100644 --- a/README.md +++ b/README.md @@ -167,6 +167,12 @@ Validation is strict: - numeric fields must be numeric - invalid files are moved to failed directory +Importer enrichment note: +- `full_name` is the source of truth for identity display +- `first_name` / `last_name` are optional and may be absent in public snapshots +- when both are missing, importer may derive them from `full_name` as a best-effort enrichment step +- this enrichment is convenience-only and does not override source-of-truth semantics + ## Import Command Run import: @@ -284,6 +290,7 @@ Notes: - extraction is intentionally low-frequency and uses retries conservatively - only public pages/endpoints should be targeted - emitted snapshots must match the same schema consumed by `import_snapshots` +- `public_json_snapshot` uses the same required-vs-optional field contract as `SnapshotSchemaValidator` (no stricter extractor-only required bio/physical fields) - optional scheduler container runs `scripts/scheduler.sh` loop using: - image: `registry.younerd.org/hoopscout/scheduler:${APP_IMAGE_TAG:-latest}` - command: `/app/scripts/scheduler.sh` @@ -326,6 +333,16 @@ Notes: - public-source player bio/physical fields are often incomplete; extractor allows them to be missing and emits `null` for optional fields - no live HTTP calls in tests; tests use fixtures/mocked responses only +## Testing + +- runtime `web` image stays lean and may not include `pytest` tooling +- run tests with the development compose stack (or a dedicated test image/profile) where test dependencies are installed +- local example: + +```bash +docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm web pytest -q +``` + ## Migration and Superuser Commands ```bash diff --git a/apps/ingestion/extractors/public_json.py b/apps/ingestion/extractors/public_json.py index acb973d..4241d0b 100644 --- a/apps/ingestion/extractors/public_json.py +++ 
b/apps/ingestion/extractors/public_json.py @@ -4,6 +4,8 @@ from typing import Any from django.conf import settings +from apps.ingestion.snapshots.schema import REQUIRED_RECORD_FIELDS + from .base import ( BaseSnapshotExtractor, ExtractorConfigError, @@ -113,7 +115,7 @@ class PublicJsonSnapshotExtractor(BaseSnapshotExtractor): "ft_pct": _first_non_empty(source_record, "ft_pct"), } - missing = [key for key, value in normalized.items() if key != "role" and value in (None, "")] + missing = [key for key in REQUIRED_RECORD_FIELDS if normalized.get(key) in (None, "")] if missing: raise ExtractorNormalizationError( f"public_json_snapshot row missing required fields: {', '.join(sorted(missing))}" diff --git a/tests/test_bcl_extractor.py b/tests/test_bcl_extractor.py index b5795fe..daa93dc 100644 --- a/tests/test_bcl_extractor.py +++ b/tests/test_bcl_extractor.py @@ -8,6 +8,7 @@ import pytest from django.core.management import call_command from apps.ingestion.extractors.bcl import BCLSnapshotExtractor +from apps.ingestion.extractors.base import ExtractorNormalizationError from apps.ingestion.extractors.registry import create_extractor @@ -82,6 +83,25 @@ def test_bcl_extractor_accepts_partial_public_player_bio_fields(tmp_path, settin assert row["games_played"] == 10 +@pytest.mark.django_db +def test_bcl_extractor_still_fails_when_required_stats_are_missing(settings): + settings.EXTRACTOR_BCL_STATS_URL = "https://www.championsleague.basketball/public/stats.json" + settings.EXTRACTOR_BCL_SEASON_LABEL = "2025-2026" + settings.EXTRACTOR_BCL_COMPETITION_EXTERNAL_ID = "bcl" + settings.EXTRACTOR_BCL_COMPETITION_NAME = "Basketball Champions League" + + fixture_payload = _load_fixture("bcl/bcl_players_stats_partial_public.json") + fixture_payload["data"][0].pop("ppg") + + class FakeClient: + def get_json(self, *_args, **_kwargs): + return fixture_payload + + extractor = BCLSnapshotExtractor(http_client=FakeClient()) + with pytest.raises(ExtractorNormalizationError): + 
extractor.run(write_output=False, snapshot_date=date(2026, 3, 13)) + + @pytest.mark.django_db def test_bcl_extractor_registry_selection(settings): settings.EXTRACTOR_BCL_STATS_URL = "https://www.championsleague.basketball/public/stats.json" diff --git a/tests/test_extractors_framework.py b/tests/test_extractors_framework.py index cb0dd74..1d873dd 100644 --- a/tests/test_extractors_framework.py +++ b/tests/test_extractors_framework.py @@ -135,6 +135,57 @@ def test_public_json_extractor_normalizes_common_field_aliases(tmp_path): assert row["three_pt_pct"] == 36.1 +@pytest.mark.django_db +def test_public_json_extractor_accepts_missing_optional_bio_and_physical_fields(tmp_path): + class FakeClient: + def get_json(self, *_args, **_kwargs): + return { + "records": [ + { + "competition_external_id": "comp-1", + "competition_name": "League One", + "season": "2025-2026", + "team_external_id": "team-1", + "team_name": "Team One", + "player_external_id": "player-1", + "full_name": "Jane Doe", + "games_played": 12, + "minutes_per_game": 27.2, + "points_per_game": 13.0, + "rebounds_per_game": 4.4, + "assists_per_game": 3.1, + "steals_per_game": 1.0, + "blocks_per_game": 0.3, + "turnovers_per_game": 1.8, + "fg_pct": 46.2, + "three_pt_pct": 35.5, + "ft_pct": 82.1, + } + ] + } + + extractor = PublicJsonSnapshotExtractor( + url="https://example.com/public-feed.json", + source_name="test_public_feed", + http_client=FakeClient(), + ) + output_file = tmp_path / "public-optional.json" + result = extractor.run(output_path=output_file, snapshot_date=date(2026, 3, 13)) + + assert result.records_count == 1 + payload = json.loads(output_file.read_text(encoding="utf-8")) + row = payload["records"][0] + assert row["full_name"] == "Jane Doe" + assert row["first_name"] is None + assert row["last_name"] is None + assert row["birth_date"] is None + assert row["nationality"] is None + assert row["height_cm"] is None + assert row["weight_kg"] is None + assert row["position"] is None + assert 
row.get("role") is None + + @pytest.mark.django_db def test_run_extractor_management_command_writes_snapshot(tmp_path, settings): settings.EXTRACTOR_PUBLIC_JSON_URL = "https://example.com/feed.json"