# File listing metadata: 313 lines, 11 KiB, Python.
from __future__ import annotations
|
|
|
|
import json
|
|
from datetime import date
|
|
|
|
import pytest
|
|
from django.core.management import call_command
|
|
|
|
from apps.ingestion.extractors.base import BaseSnapshotExtractor
|
|
from apps.ingestion.extractors.base import ExtractorNormalizationError
|
|
from apps.ingestion.extractors.http import ResponsibleHttpClient
|
|
from apps.ingestion.extractors.public_json import PublicJsonSnapshotExtractor
|
|
from apps.ingestion.snapshots.schema import REQUIRED_RECORD_FIELDS
|
|
|
|
|
|
class DummyExtractor(BaseSnapshotExtractor):
    """Minimal in-memory extractor used to exercise ``BaseSnapshotExtractor.run``.

    ``fetch`` returns a fixed one-row payload, ``parse`` unwraps its rows and
    ``normalize_record`` produces a fully populated record in which only
    ``full_name`` is taken from the source row.
    """

    extractor_name = "dummy"
    source_name = "dummy_source"

    def fetch(self):
        """Return a canned payload holding a single source row."""
        return {"rows": [{"name": "Jane Doe"}]}

    def parse(self, payload):
        """Return the list stored under the payload's ``"rows"`` key."""
        return payload["rows"]

    def normalize_record(self, source_record):
        """Build a complete record; only ``full_name`` is read from ``source_record``."""
        return {
            "competition_external_id": "comp-1",
            "competition_name": "League One",
            "season": "2025-2026",
            "team_external_id": "team-1",
            "team_name": "Team One",
            "player_external_id": "player-1",
            "full_name": source_record["name"],
            "first_name": "Jane",
            "last_name": "Doe",
            "birth_date": "2000-01-01",
            "nationality": "US",
            "height_cm": 180,
            "weight_kg": 75,
            "position": "SG",
            "games_played": 10,
            "minutes_per_game": 30.0,
            "points_per_game": 15.0,
            "rebounds_per_game": 4.0,
            "assists_per_game": 3.0,
            "steals_per_game": 1.2,
            "blocks_per_game": 0.4,
            "turnovers_per_game": 2.0,
            "fg_pct": 45.0,
            "three_pt_pct": 35.0,
            "ft_pct": 82.0,
        }
|
|
|
|
|
|
class _FakeResponse:
    """HTTP response double exposing ``status_code``, ``raise_for_status`` and ``json``."""

    def __init__(self, payload, status_code=200):
        """Store the body to hand back from ``json`` and the status to report."""
        self._payload = payload
        self.status_code = status_code

    def raise_for_status(self):
        """Raise ``RuntimeError`` when the status code is 400 or above."""
        if self.status_code >= 400:
            raise RuntimeError(f"status={self.status_code}")

    def json(self):
        """Return the payload given at construction, unchanged."""
        return self._payload
|
|
|
|
|
|
def _minimal_public_json_record() -> dict:
    """Return a fresh feed record with identity and stat fields only.

    Bio and physical keys (``first_name``, ``last_name``, ``birth_date``,
    ``nationality``, ``height_cm``, ``weight_kg``, ``position``) are left out.
    A new dict is built on every call, so callers may mutate the result.
    """
    return {
        "competition_external_id": "comp-1",
        "competition_name": "League One",
        "season": "2025-2026",
        "team_external_id": "team-1",
        "team_name": "Team One",
        "player_external_id": "player-1",
        "full_name": "Jane Doe",
        "games_played": 12,
        "minutes_per_game": 27.2,
        "points_per_game": 13.0,
        "rebounds_per_game": 4.4,
        "assists_per_game": 3.1,
        "steals_per_game": 1.0,
        "blocks_per_game": 0.3,
        "turnovers_per_game": 1.8,
        "fg_pct": 46.2,
        "three_pt_pct": 35.5,
        "ft_pct": 82.1,
    }
|
|
|
|
|
|
@pytest.mark.django_db
def test_base_extractor_run_writes_snapshot_file(tmp_path, settings):
    """``run`` without an explicit output path still writes a snapshot file.

    ``STATIC_DATASET_INCOMING_DIR`` is pointed at a temporary directory before
    running; the test then checks the returned result and the JSON document
    found at ``result.output_path``.
    """
    settings.STATIC_DATASET_INCOMING_DIR = str(tmp_path / "incoming")
    extractor = DummyExtractor()
    result = extractor.run(snapshot_date=date(2026, 3, 13))

    assert result.records_count == 1
    assert result.source_name == "dummy_source"
    assert result.output_path is not None
    assert result.output_path.exists()

    payload = json.loads(result.output_path.read_text(encoding="utf-8"))
    assert payload["source_name"] == "dummy_source"
    assert payload["snapshot_date"] == "2026-03-13"
    assert payload["records"][0]["full_name"] == "Jane Doe"
|
|
|
|
|
|
@pytest.mark.django_db
def test_public_json_extractor_normalizes_common_field_aliases(tmp_path):
    """Aliased feed keys end up under the canonical snapshot field names.

    The fake feed uses short keys (``competition_id``, ``team_id``,
    ``player_id``, ``player_name``, ``three_pct`` and per-game abbreviations)
    with numeric ids; the written snapshot is expected to expose the canonical
    names with the ids rendered as strings.
    """

    class FakeClient:
        """HTTP client double returning one record keyed by alias names."""

        def get_json(self, *_args, **_kwargs):
            return {
                "records": [
                    {
                        "competition_id": 99,
                        "competition_name": "National League",
                        "season": 2025,
                        "team_id": 10,
                        "team_name": "Blue Team",
                        "player_id": 123,
                        "player_name": "John Smith",
                        "first_name": "John",
                        "last_name": "Smith",
                        "birth_date": "2001-05-12",
                        "nationality": "US",
                        "height_cm": 198,
                        "weight_kg": 96,
                        "position": "SF",
                        "gp": 20,
                        "mpg": 28.5,
                        "ppg": 14.2,
                        "rpg": 5.1,
                        "apg": 3.2,
                        "spg": 1.1,
                        "bpg": 0.5,
                        "tov": 1.9,
                        "fg_pct": 47.3,
                        "three_pct": 36.1,
                        "ft_pct": 80.0,
                    }
                ]
            }

    extractor = PublicJsonSnapshotExtractor(
        url="https://example.com/public-feed.json",
        source_name="test_public_feed",
        http_client=FakeClient(),
    )
    output_file = tmp_path / "public.json"
    result = extractor.run(output_path=output_file, snapshot_date=date(2026, 3, 13))

    assert result.records_count == 1
    payload = json.loads(output_file.read_text(encoding="utf-8"))
    row = payload["records"][0]
    assert row["competition_external_id"] == "99"
    assert row["team_external_id"] == "10"
    assert row["player_external_id"] == "123"
    assert row["full_name"] == "John Smith"
    assert row["three_pt_pct"] == 36.1
|
|
|
|
|
|
@pytest.mark.django_db
def test_public_json_extractor_accepts_missing_optional_bio_and_physical_fields(tmp_path):
    """A record without bio/physical keys is accepted and those fields are ``None``."""

    class FakeClient:
        """HTTP client double returning the minimal record unchanged."""

        def get_json(self, *_args, **_kwargs):
            return {"records": [_minimal_public_json_record()]}

    extractor = PublicJsonSnapshotExtractor(
        url="https://example.com/public-feed.json",
        source_name="test_public_feed",
        http_client=FakeClient(),
    )
    output_file = tmp_path / "public-optional.json"
    result = extractor.run(output_path=output_file, snapshot_date=date(2026, 3, 13))

    assert result.records_count == 1
    payload = json.loads(output_file.read_text(encoding="utf-8"))
    row = payload["records"][0]
    assert row["full_name"] == "Jane Doe"
    assert row["first_name"] is None
    assert row["last_name"] is None
    assert row["birth_date"] is None
    assert row["nationality"] is None
    assert row["height_cm"] is None
    assert row["weight_kg"] is None
    assert row["position"] is None
    # ``.get`` rather than indexing: passes whether "role" is absent or None.
    assert row.get("role") is None
|
|
|
|
|
|
@pytest.mark.django_db
def test_public_json_extractor_fails_when_required_stat_missing():
    """Dropping ``points_per_game`` from the record makes ``run`` raise."""
    broken = _minimal_public_json_record()
    broken.pop("points_per_game")

    class FakeClient:
        """HTTP client double serving the record with the stat removed."""

        def get_json(self, *_args, **_kwargs):
            return {"records": [broken]}

    extractor = PublicJsonSnapshotExtractor(
        url="https://example.com/public-feed.json",
        source_name="test_public_feed",
        http_client=FakeClient(),
    )
    with pytest.raises(ExtractorNormalizationError):
        extractor.run(write_output=False, snapshot_date=date(2026, 3, 13))
|
|
|
|
|
|
@pytest.mark.django_db
@pytest.mark.parametrize("required_field", sorted(REQUIRED_RECORD_FIELDS))
def test_public_json_required_fields_follow_snapshot_schema(required_field):
    """Removing any one field from ``REQUIRED_RECORD_FIELDS`` is rejected.

    Runs once per schema field (sorted for a stable parametrization order) and
    expects ``ExtractorNormalizationError`` whose message matches
    ``"missing required fields"``.
    """
    broken = _minimal_public_json_record()
    # ``pop`` without a default: a schema field absent from the minimal record
    # surfaces as a KeyError here instead of passing silently.
    broken.pop(required_field)

    class FakeClient:
        """HTTP client double serving the record with one field removed."""

        def get_json(self, *_args, **_kwargs):
            return {"records": [broken]}

    extractor = PublicJsonSnapshotExtractor(
        url="https://example.com/public-feed.json",
        source_name="test_public_feed",
        http_client=FakeClient(),
    )
    with pytest.raises(ExtractorNormalizationError, match="missing required fields"):
        extractor.run(write_output=False, snapshot_date=date(2026, 3, 13))
|
|
|
|
|
|
@pytest.mark.django_db
def test_run_extractor_management_command_writes_snapshot(tmp_path, settings):
    """``run_extractor public_json_snapshot`` writes a dated snapshot file.

    The extractor settings are overridden, the ``ResponsibleHttpClient`` name
    in the ``public_json`` module is replaced by a factory returning a fake
    client, and the command is invoked with ``--output-path`` and
    ``--snapshot-date``. The test then checks the single file matching
    ``public_json_snapshot-2026-03-13.json`` in the output directory.
    """
    settings.EXTRACTOR_PUBLIC_JSON_URL = "https://example.com/feed.json"
    settings.EXTRACTOR_PUBLIC_SOURCE_NAME = "cmd_test_source"
    output_dir = tmp_path / "snapshots"

    class FakeClient:
        """HTTP client double returning one fully populated record."""

        def get_json(self, *_args, **_kwargs):
            return {
                "records": [
                    {
                        "competition_external_id": "comp-a",
                        "competition_name": "Alpha League",
                        "season": "2025-2026",
                        "team_external_id": "team-a",
                        "team_name": "Alpha Team",
                        "player_external_id": "player-a",
                        "full_name": "Alpha Player",
                        "first_name": "Alpha",
                        "last_name": "Player",
                        "birth_date": "2000-04-01",
                        "nationality": "US",
                        "height_cm": 190,
                        "weight_kg": 88,
                        "position": "PG",
                        "games_played": 12,
                        "minutes_per_game": 31.0,
                        "points_per_game": 17.0,
                        "rebounds_per_game": 4.0,
                        "assists_per_game": 6.0,
                        "steals_per_game": 1.3,
                        "blocks_per_game": 0.1,
                        "turnovers_per_game": 2.4,
                        "fg_pct": 44.0,
                        "three_pt_pct": 37.0,
                        "ft_pct": 79.0,
                    }
                ]
            }

    # The context manager undoes the patch on exit, including when the
    # command raises, so no explicit try/finally is needed.
    with pytest.MonkeyPatch.context() as monkeypatch:
        monkeypatch.setattr(
            "apps.ingestion.extractors.public_json.ResponsibleHttpClient",
            lambda **_kwargs: FakeClient(),
        )
        call_command(
            "run_extractor",
            "public_json_snapshot",
            "--output-path",
            str(output_dir),
            "--snapshot-date",
            "2026-03-13",
        )

    files = list(output_dir.glob("public_json_snapshot-2026-03-13.json"))
    assert len(files) == 1
    payload = json.loads(files[0].read_text(encoding="utf-8"))
    assert payload["source_name"] == "cmd_test_source"
    assert payload["records"][0]["full_name"] == "Alpha Player"
|
|
|
|
|
|
def test_http_client_retries_on_retryable_status(monkeypatch):
    """A 429 on the first request is followed by a retry whose body is returned.

    The fake session answers 429 once and 200 afterwards; receiving the 200
    body from ``get_json`` shows that a second request was made.
    """
    # NOTE(review): the ``monkeypatch`` fixture is requested but not used in
    # this test; it is kept so the test's signature stays unchanged.

    class FakeSession:
        """Session double that counts ``get`` calls and fails only the first."""

        def __init__(self):
            self.calls = 0

        def get(self, *_args, **_kwargs):
            self.calls += 1
            if self.calls == 1:
                return _FakeResponse({"error": "busy"}, status_code=429)
            return _FakeResponse({"records": []}, status_code=200)

    # Zero sleep/delay values are passed so the retry path adds no waiting.
    client = ResponsibleHttpClient(
        user_agent="test-agent",
        timeout_seconds=5,
        retries=1,
        retry_sleep_seconds=0,
        request_delay_seconds=0,
        session=FakeSession(),
    )
    payload = client.get_json("https://example.com/feed.json")
    assert payload == {"records": []}
|