fix(v2-ingestion): align public schema realism follow-ups
This commit is contained in:
17
README.md
17
README.md
@ -167,6 +167,12 @@ Validation is strict:
|
|||||||
- numeric fields must be numeric
|
- numeric fields must be numeric
|
||||||
- invalid files are moved to failed directory
|
- invalid files are moved to failed directory
|
||||||
|
|
||||||
|
Importer enrichment note:
|
||||||
|
- `full_name` is the source of truth for identity display
|
||||||
|
- `first_name` / `last_name` are optional and may be absent in public snapshots
|
||||||
|
- when both are missing, the importer may derive them from `full_name` as a best-effort enrichment step
|
||||||
|
- this enrichment is convenience-only and does not override source-of-truth semantics
|
||||||
|
|
||||||
## Import Command
|
## Import Command
|
||||||
|
|
||||||
Run import:
|
Run import:
|
||||||
@ -284,6 +290,7 @@ Notes:
|
|||||||
- extraction is intentionally low-frequency and uses retries conservatively
|
- extraction is intentionally low-frequency and uses retries conservatively
|
||||||
- only public pages/endpoints should be targeted
|
- only public pages/endpoints should be targeted
|
||||||
- emitted snapshots must match the same schema consumed by `import_snapshots`
|
- emitted snapshots must match the same schema consumed by `import_snapshots`
|
||||||
|
- `public_json_snapshot` uses the same required-vs-optional field contract as `SnapshotSchemaValidator` (no stricter extractor-only required bio/physical fields)
|
||||||
- optional scheduler container runs `scripts/scheduler.sh` loop using:
|
- optional scheduler container runs `scripts/scheduler.sh` loop using:
|
||||||
- image: `registry.younerd.org/hoopscout/scheduler:${APP_IMAGE_TAG:-latest}`
|
- image: `registry.younerd.org/hoopscout/scheduler:${APP_IMAGE_TAG:-latest}`
|
||||||
- command: `/app/scripts/scheduler.sh`
|
- command: `/app/scripts/scheduler.sh`
|
||||||
@ -326,6 +333,16 @@ Notes:
|
|||||||
- public-source player bio/physical fields are often incomplete; extractor allows them to be missing and emits `null` for optional fields
|
- public-source player bio/physical fields are often incomplete; extractor allows them to be missing and emits `null` for optional fields
|
||||||
- no live HTTP calls in tests; tests use fixtures/mocked responses only
|
- no live HTTP calls in tests; tests use fixtures/mocked responses only
|
||||||
|
|
||||||
|
## Testing
|
||||||
|
|
||||||
|
- runtime `web` image stays lean and may not include `pytest` tooling
|
||||||
|
- run tests with the development compose stack (or a dedicated test image/profile) where test dependencies are installed
|
||||||
|
- local example:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm web pytest -q
|
||||||
|
```
|
||||||
|
|
||||||
## Migration and Superuser Commands
|
## Migration and Superuser Commands
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
|||||||
@ -4,6 +4,8 @@ from typing import Any
|
|||||||
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
|
|
||||||
|
from apps.ingestion.snapshots.schema import REQUIRED_RECORD_FIELDS
|
||||||
|
|
||||||
from .base import (
|
from .base import (
|
||||||
BaseSnapshotExtractor,
|
BaseSnapshotExtractor,
|
||||||
ExtractorConfigError,
|
ExtractorConfigError,
|
||||||
@ -113,7 +115,7 @@ class PublicJsonSnapshotExtractor(BaseSnapshotExtractor):
|
|||||||
"ft_pct": _first_non_empty(source_record, "ft_pct"),
|
"ft_pct": _first_non_empty(source_record, "ft_pct"),
|
||||||
}
|
}
|
||||||
|
|
||||||
missing = [key for key, value in normalized.items() if key != "role" and value in (None, "")]
|
missing = [key for key in REQUIRED_RECORD_FIELDS if normalized.get(key) in (None, "")]
|
||||||
if missing:
|
if missing:
|
||||||
raise ExtractorNormalizationError(
|
raise ExtractorNormalizationError(
|
||||||
f"public_json_snapshot row missing required fields: {', '.join(sorted(missing))}"
|
f"public_json_snapshot row missing required fields: {', '.join(sorted(missing))}"
|
||||||
|
|||||||
@ -8,6 +8,7 @@ import pytest
|
|||||||
from django.core.management import call_command
|
from django.core.management import call_command
|
||||||
|
|
||||||
from apps.ingestion.extractors.bcl import BCLSnapshotExtractor
|
from apps.ingestion.extractors.bcl import BCLSnapshotExtractor
|
||||||
|
from apps.ingestion.extractors.base import ExtractorNormalizationError
|
||||||
from apps.ingestion.extractors.registry import create_extractor
|
from apps.ingestion.extractors.registry import create_extractor
|
||||||
|
|
||||||
|
|
||||||
@ -82,6 +83,25 @@ def test_bcl_extractor_accepts_partial_public_player_bio_fields(tmp_path, settin
|
|||||||
assert row["games_played"] == 10
|
assert row["games_played"] == 10
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.django_db
|
||||||
|
def test_bcl_extractor_still_fails_when_required_stats_are_missing(settings):
|
||||||
|
settings.EXTRACTOR_BCL_STATS_URL = "https://www.championsleague.basketball/public/stats.json"
|
||||||
|
settings.EXTRACTOR_BCL_SEASON_LABEL = "2025-2026"
|
||||||
|
settings.EXTRACTOR_BCL_COMPETITION_EXTERNAL_ID = "bcl"
|
||||||
|
settings.EXTRACTOR_BCL_COMPETITION_NAME = "Basketball Champions League"
|
||||||
|
|
||||||
|
fixture_payload = _load_fixture("bcl/bcl_players_stats_partial_public.json")
|
||||||
|
fixture_payload["data"][0].pop("ppg")
|
||||||
|
|
||||||
|
class FakeClient:
|
||||||
|
def get_json(self, *_args, **_kwargs):
|
||||||
|
return fixture_payload
|
||||||
|
|
||||||
|
extractor = BCLSnapshotExtractor(http_client=FakeClient())
|
||||||
|
with pytest.raises(ExtractorNormalizationError):
|
||||||
|
extractor.run(write_output=False, snapshot_date=date(2026, 3, 13))
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.django_db
|
@pytest.mark.django_db
|
||||||
def test_bcl_extractor_registry_selection(settings):
|
def test_bcl_extractor_registry_selection(settings):
|
||||||
settings.EXTRACTOR_BCL_STATS_URL = "https://www.championsleague.basketball/public/stats.json"
|
settings.EXTRACTOR_BCL_STATS_URL = "https://www.championsleague.basketball/public/stats.json"
|
||||||
|
|||||||
@ -135,6 +135,57 @@ def test_public_json_extractor_normalizes_common_field_aliases(tmp_path):
|
|||||||
assert row["three_pt_pct"] == 36.1
|
assert row["three_pt_pct"] == 36.1
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.django_db
|
||||||
|
def test_public_json_extractor_accepts_missing_optional_bio_and_physical_fields(tmp_path):
|
||||||
|
class FakeClient:
|
||||||
|
def get_json(self, *_args, **_kwargs):
|
||||||
|
return {
|
||||||
|
"records": [
|
||||||
|
{
|
||||||
|
"competition_external_id": "comp-1",
|
||||||
|
"competition_name": "League One",
|
||||||
|
"season": "2025-2026",
|
||||||
|
"team_external_id": "team-1",
|
||||||
|
"team_name": "Team One",
|
||||||
|
"player_external_id": "player-1",
|
||||||
|
"full_name": "Jane Doe",
|
||||||
|
"games_played": 12,
|
||||||
|
"minutes_per_game": 27.2,
|
||||||
|
"points_per_game": 13.0,
|
||||||
|
"rebounds_per_game": 4.4,
|
||||||
|
"assists_per_game": 3.1,
|
||||||
|
"steals_per_game": 1.0,
|
||||||
|
"blocks_per_game": 0.3,
|
||||||
|
"turnovers_per_game": 1.8,
|
||||||
|
"fg_pct": 46.2,
|
||||||
|
"three_pt_pct": 35.5,
|
||||||
|
"ft_pct": 82.1,
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
extractor = PublicJsonSnapshotExtractor(
|
||||||
|
url="https://example.com/public-feed.json",
|
||||||
|
source_name="test_public_feed",
|
||||||
|
http_client=FakeClient(),
|
||||||
|
)
|
||||||
|
output_file = tmp_path / "public-optional.json"
|
||||||
|
result = extractor.run(output_path=output_file, snapshot_date=date(2026, 3, 13))
|
||||||
|
|
||||||
|
assert result.records_count == 1
|
||||||
|
payload = json.loads(output_file.read_text(encoding="utf-8"))
|
||||||
|
row = payload["records"][0]
|
||||||
|
assert row["full_name"] == "Jane Doe"
|
||||||
|
assert row["first_name"] is None
|
||||||
|
assert row["last_name"] is None
|
||||||
|
assert row["birth_date"] is None
|
||||||
|
assert row["nationality"] is None
|
||||||
|
assert row["height_cm"] is None
|
||||||
|
assert row["weight_kg"] is None
|
||||||
|
assert row["position"] is None
|
||||||
|
assert row.get("role") is None
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.django_db
|
@pytest.mark.django_db
|
||||||
def test_run_extractor_management_command_writes_snapshot(tmp_path, settings):
|
def test_run_extractor_management_command_writes_snapshot(tmp_path, settings):
|
||||||
settings.EXTRACTOR_PUBLIC_JSON_URL = "https://example.com/feed.json"
|
settings.EXTRACTOR_PUBLIC_JSON_URL = "https://example.com/feed.json"
|
||||||
|
|||||||
Reference in New Issue
Block a user