# hoopscout/tests/test_extractors_framework.py
# Repository-viewer header: 313 lines, 11 KiB, Python.

from __future__ import annotations
import json
from datetime import date
import pytest
from django.core.management import call_command
from apps.ingestion.extractors.base import BaseSnapshotExtractor
from apps.ingestion.extractors.base import ExtractorNormalizationError
from apps.ingestion.extractors.http import ResponsibleHttpClient
from apps.ingestion.extractors.public_json import PublicJsonSnapshotExtractor
from apps.ingestion.snapshots.schema import REQUIRED_RECORD_FIELDS
class DummyExtractor(BaseSnapshotExtractor):
    """Minimal in-memory extractor used to exercise ``BaseSnapshotExtractor.run``.

    ``fetch`` returns a canned payload holding a single row, ``parse`` unwraps
    the ``"rows"`` list, and ``normalize_record`` emits one fixed record whose
    ``full_name`` is the only value taken from the source row.
    """

    extractor_name = "dummy"
    source_name = "dummy_source"

    def fetch(self):
        """Return a canned payload instead of contacting any remote source."""
        return {"rows": [{"name": "Jane Doe"}]}

    def parse(self, payload):
        """Return the list stored under ``payload["rows"]``."""
        return payload["rows"]

    def normalize_record(self, source_record):
        """Build a fixed record; only ``full_name`` is read from ``source_record``."""
        return {
            # Competition / team / player identity.
            "competition_external_id": "comp-1",
            "competition_name": "League One",
            "season": "2025-2026",
            "team_external_id": "team-1",
            "team_name": "Team One",
            "player_external_id": "player-1",
            "full_name": source_record["name"],
            # Bio and physical attributes.
            "first_name": "Jane",
            "last_name": "Doe",
            "birth_date": "2000-01-01",
            "nationality": "US",
            "height_cm": 180,
            "weight_kg": 75,
            "position": "SG",
            # Per-game statistics and shooting percentages.
            "games_played": 10,
            "minutes_per_game": 30.0,
            "points_per_game": 15.0,
            "rebounds_per_game": 4.0,
            "assists_per_game": 3.0,
            "steals_per_game": 1.2,
            "blocks_per_game": 0.4,
            "turnovers_per_game": 2.0,
            "fg_pct": 45.0,
            "three_pt_pct": 35.0,
            "ft_pct": 82.0,
        }
class _FakeResponse:
    """Stand-in for an HTTP response object carrying a canned JSON payload.

    Exposes only ``status_code``, ``raise_for_status()`` and ``json()``.
    """

    def __init__(self, payload, status_code=200):
        self._payload = payload
        self.status_code = status_code

    def raise_for_status(self):
        """Raise ``RuntimeError`` when ``status_code`` is 400 or above."""
        if self.status_code >= 400:
            raise RuntimeError(f"status={self.status_code}")

    def json(self):
        """Return the payload given at construction, unchanged."""
        return self._payload
def _minimal_public_json_record() -> dict:
    """Return a fresh public-feed record without bio or physical fields.

    A new dict is built on every call, so tests may ``pop`` keys from the
    result without affecting one another.
    """
    return {
        # Competition / team / player identity.
        "competition_external_id": "comp-1",
        "competition_name": "League One",
        "season": "2025-2026",
        "team_external_id": "team-1",
        "team_name": "Team One",
        "player_external_id": "player-1",
        "full_name": "Jane Doe",
        # Per-game statistics and shooting percentages.
        "games_played": 12,
        "minutes_per_game": 27.2,
        "points_per_game": 13.0,
        "rebounds_per_game": 4.4,
        "assists_per_game": 3.1,
        "steals_per_game": 1.0,
        "blocks_per_game": 0.3,
        "turnovers_per_game": 1.8,
        "fg_pct": 46.2,
        "three_pt_pct": 35.5,
        "ft_pct": 82.1,
    }
@pytest.mark.django_db
def test_base_extractor_run_writes_snapshot_file(tmp_path, settings):
    """``run()`` without an explicit output path writes a snapshot JSON file."""
    # Point the incoming-dataset setting at a temp directory so nothing is
    # written outside the test sandbox (presumably the default output
    # location used by ``run()`` -- confirm against the base extractor).
    settings.STATIC_DATASET_INCOMING_DIR = str(tmp_path / "incoming")
    extractor = DummyExtractor()

    result = extractor.run(snapshot_date=date(2026, 3, 13))

    assert result.records_count == 1
    assert result.source_name == "dummy_source"
    assert result.output_path is not None
    assert result.output_path.exists()

    payload = json.loads(result.output_path.read_text(encoding="utf-8"))
    assert payload["source_name"] == "dummy_source"
    assert payload["snapshot_date"] == "2026-03-13"
    assert payload["records"][0]["full_name"] == "Jane Doe"
@pytest.mark.django_db
def test_public_json_extractor_normalizes_common_field_aliases(tmp_path):
    """Aliased feed keys are mapped onto the canonical snapshot field names."""

    class FakeClient:
        """HTTP client double returning one record keyed by short aliases."""

        def get_json(self, *_args, **_kwargs):
            return {
                "records": [
                    {
                        # Identity fields use ``*_id`` / ``player_name`` aliases
                        # and numeric ids rather than the canonical names.
                        "competition_id": 99,
                        "competition_name": "National League",
                        "season": 2025,
                        "team_id": 10,
                        "team_name": "Blue Team",
                        "player_id": 123,
                        "player_name": "John Smith",
                        "first_name": "John",
                        "last_name": "Smith",
                        "birth_date": "2001-05-12",
                        "nationality": "US",
                        "height_cm": 198,
                        "weight_kg": 96,
                        "position": "SF",
                        # Stats use abbreviated aliases (gp, mpg, ppg, ...).
                        "gp": 20,
                        "mpg": 28.5,
                        "ppg": 14.2,
                        "rpg": 5.1,
                        "apg": 3.2,
                        "spg": 1.1,
                        "bpg": 0.5,
                        "tov": 1.9,
                        "fg_pct": 47.3,
                        "three_pct": 36.1,
                        "ft_pct": 80.0,
                    }
                ]
            }

    extractor = PublicJsonSnapshotExtractor(
        url="https://example.com/public-feed.json",
        source_name="test_public_feed",
        http_client=FakeClient(),
    )
    output_file = tmp_path / "public.json"

    result = extractor.run(output_path=output_file, snapshot_date=date(2026, 3, 13))

    assert result.records_count == 1
    payload = json.loads(output_file.read_text(encoding="utf-8"))
    row = payload["records"][0]
    # Numeric ids are expected back as strings under the canonical keys.
    assert row["competition_external_id"] == "99"
    assert row["team_external_id"] == "10"
    assert row["player_external_id"] == "123"
    assert row["full_name"] == "John Smith"
    assert row["three_pt_pct"] == 36.1
@pytest.mark.django_db
def test_public_json_extractor_accepts_missing_optional_bio_and_physical_fields(tmp_path):
    """A record without bio/physical fields is accepted and those come back as ``None``."""

    class FakeClient:
        """HTTP client double returning only the minimal record."""

        def get_json(self, *_args, **_kwargs):
            return {"records": [_minimal_public_json_record()]}

    extractor = PublicJsonSnapshotExtractor(
        url="https://example.com/public-feed.json",
        source_name="test_public_feed",
        http_client=FakeClient(),
    )
    output_file = tmp_path / "public-optional.json"

    result = extractor.run(output_path=output_file, snapshot_date=date(2026, 3, 13))

    assert result.records_count == 1
    payload = json.loads(output_file.read_text(encoding="utf-8"))
    row = payload["records"][0]
    assert row["full_name"] == "Jane Doe"
    # Optional fields must be present in the output with a null value.
    assert row["first_name"] is None
    assert row["last_name"] is None
    assert row["birth_date"] is None
    assert row["nationality"] is None
    assert row["height_cm"] is None
    assert row["weight_kg"] is None
    assert row["position"] is None
    # ``role`` may be absent altogether, hence ``.get``.
    assert row.get("role") is None
@pytest.mark.django_db
def test_public_json_extractor_fails_when_required_stat_missing():
    """Dropping ``points_per_game`` makes ``run()`` raise a normalization error."""
    broken = _minimal_public_json_record()
    broken.pop("points_per_game")

    class FakeClient:
        """HTTP client double returning the record with the stat removed."""

        def get_json(self, *_args, **_kwargs):
            return {"records": [broken]}

    extractor = PublicJsonSnapshotExtractor(
        url="https://example.com/public-feed.json",
        source_name="test_public_feed",
        http_client=FakeClient(),
    )

    with pytest.raises(ExtractorNormalizationError):
        extractor.run(write_output=False, snapshot_date=date(2026, 3, 13))
@pytest.mark.django_db
@pytest.mark.parametrize("required_field", sorted(REQUIRED_RECORD_FIELDS))
def test_public_json_required_fields_follow_snapshot_schema(required_field):
    """Removing any field in ``REQUIRED_RECORD_FIELDS`` is rejected by ``run()``.

    Parametrized over the sorted schema constant, so each required field gets
    its own test case with a stable id.
    """
    broken = _minimal_public_json_record()
    # A KeyError here means the minimal fixture lacks a schema-required field.
    broken.pop(required_field)

    class FakeClient:
        """HTTP client double returning the record with one field removed."""

        def get_json(self, *_args, **_kwargs):
            return {"records": [broken]}

    extractor = PublicJsonSnapshotExtractor(
        url="https://example.com/public-feed.json",
        source_name="test_public_feed",
        http_client=FakeClient(),
    )

    with pytest.raises(ExtractorNormalizationError, match="missing required fields"):
        extractor.run(write_output=False, snapshot_date=date(2026, 3, 13))
@pytest.mark.django_db
def test_run_extractor_management_command_writes_snapshot(tmp_path, settings):
    """``run_extractor public_json_snapshot`` writes a dated snapshot file."""
    settings.EXTRACTOR_PUBLIC_JSON_URL = "https://example.com/feed.json"
    settings.EXTRACTOR_PUBLIC_SOURCE_NAME = "cmd_test_source"
    output_dir = tmp_path / "snapshots"

    class FakeClient:
        """HTTP client double returning one fully populated record."""

        def get_json(self, *_args, **_kwargs):
            return {
                "records": [
                    {
                        "competition_external_id": "comp-a",
                        "competition_name": "Alpha League",
                        "season": "2025-2026",
                        "team_external_id": "team-a",
                        "team_name": "Alpha Team",
                        "player_external_id": "player-a",
                        "full_name": "Alpha Player",
                        "first_name": "Alpha",
                        "last_name": "Player",
                        "birth_date": "2000-04-01",
                        "nationality": "US",
                        "height_cm": 190,
                        "weight_kg": 88,
                        "position": "PG",
                        "games_played": 12,
                        "minutes_per_game": 31.0,
                        "points_per_game": 17.0,
                        "rebounds_per_game": 4.0,
                        "assists_per_game": 6.0,
                        "steals_per_game": 1.3,
                        "blocks_per_game": 0.1,
                        "turnovers_per_game": 2.4,
                        "fg_pct": 44.0,
                        "three_pt_pct": 37.0,
                        "ft_pct": 79.0,
                    }
                ]
            }

    # Swap the HTTP client class the extractor module instantiates for a
    # factory returning the fake; the context manager undoes the patch on
    # exit, including when the command raises.
    with pytest.MonkeyPatch.context() as monkeypatch:
        monkeypatch.setattr(
            "apps.ingestion.extractors.public_json.ResponsibleHttpClient",
            lambda **_kwargs: FakeClient(),
        )
        call_command(
            "run_extractor",
            "public_json_snapshot",
            "--output-path",
            str(output_dir),
            "--snapshot-date",
            "2026-03-13",
        )

    files = list(output_dir.glob("public_json_snapshot-2026-03-13.json"))
    assert len(files) == 1
    payload = json.loads(files[0].read_text(encoding="utf-8"))
    assert payload["source_name"] == "cmd_test_source"
    assert payload["records"][0]["full_name"] == "Alpha Player"
def test_http_client_retries_on_retryable_status(monkeypatch):
    """A 429 on the first attempt is retried and the second response is returned.

    The ``monkeypatch`` fixture is kept in the signature but is not needed:
    the fake session is injected through the client's constructor.
    """

    class FakeSession:
        """Session double: first ``get`` answers 429, later calls answer 200."""

        def __init__(self):
            self.calls = 0

        def get(self, *_args, **_kwargs):
            self.calls += 1
            if self.calls == 1:
                return _FakeResponse({"error": "busy"}, status_code=429)
            return _FakeResponse({"records": []}, status_code=200)

    session = FakeSession()
    client = ResponsibleHttpClient(
        user_agent="test-agent",
        timeout_seconds=5,
        retries=1,
        # Zero sleeps/delays keep the test instantaneous.
        retry_sleep_seconds=0,
        request_delay_seconds=0,
        session=session,
    )

    payload = client.get_json("https://example.com/feed.json")

    assert payload == {"records": []}
    # Exactly one failed attempt plus one retry.
    assert session.calls == 2