feat(v2): add snapshot extractor framework and run command
This commit is contained in:
222
tests/test_extractors_framework.py
Normal file
222
tests/test_extractors_framework.py
Normal file
@@ -0,0 +1,222 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from datetime import date
|
||||
|
||||
import pytest
|
||||
from django.core.management import call_command
|
||||
|
||||
from apps.ingestion.extractors.base import BaseSnapshotExtractor
|
||||
from apps.ingestion.extractors.http import ResponsibleHttpClient
|
||||
from apps.ingestion.extractors.public_json import PublicJsonSnapshotExtractor
|
||||
|
||||
|
||||
class DummyExtractor(BaseSnapshotExtractor):
    """Minimal in-memory extractor used by the tests in this module.

    ``fetch`` returns a canned payload, ``parse`` unwraps its rows, and
    ``normalize_record`` emits one fixed record whose ``full_name`` comes
    from the source row. Nothing here performs any I/O itself.
    """

    extractor_name = "dummy"
    source_name = "dummy_source"

    def fetch(self):
        """Return the canned raw payload holding a single source row."""
        rows = [{"name": "Jane Doe"}]
        return {"rows": rows}

    def parse(self, payload):
        """Return the value stored under the payload's ``"rows"`` key."""
        return payload["rows"]

    def normalize_record(self, source_record):
        """Build the fixed record; only ``full_name`` depends on the input.

        Raises:
            KeyError: if ``source_record`` has no ``"name"`` key.
        """
        # The three groups are merged in this order so the resulting key
        # order matches a single flat literal.
        competition_and_team = {
            "competition_external_id": "comp-1",
            "competition_name": "League One",
            "season": "2025-2026",
            "team_external_id": "team-1",
            "team_name": "Team One",
        }
        player = {
            "player_external_id": "player-1",
            "full_name": source_record["name"],
            "first_name": "Jane",
            "last_name": "Doe",
            "birth_date": "2000-01-01",
            "nationality": "US",
            "height_cm": 180,
            "weight_kg": 75,
            "position": "SG",
        }
        stats = {
            "games_played": 10,
            "minutes_per_game": 30.0,
            "points_per_game": 15.0,
            "rebounds_per_game": 4.0,
            "assists_per_game": 3.0,
            "steals_per_game": 1.2,
            "blocks_per_game": 0.4,
            "turnovers_per_game": 2.0,
            "fg_pct": 45.0,
            "three_pt_pct": 35.0,
            "ft_pct": 82.0,
        }
        return {**competition_and_team, **player, **stats}
|
||||
|
||||
|
||||
class _FakeResponse:
    """Tiny stand-in for an HTTP response object.

    Exposes only what the tests in this module touch: ``status_code``,
    ``raise_for_status()`` and ``json()``.
    """

    def __init__(self, payload, status_code=200):
        """Remember the status code and the object that ``json()`` returns."""
        self.status_code = status_code
        self._body = payload

    def raise_for_status(self):
        """Raise ``RuntimeError`` for status codes of 400 and above."""
        code = self.status_code
        if code < 400:
            return
        raise RuntimeError(f"status={code}")

    def json(self):
        """Return the payload object given at construction, unchanged."""
        return self._body
|
||||
|
||||
|
||||
@pytest.mark.django_db
def test_base_extractor_run_writes_snapshot_file(tmp_path, settings):
    """``run()`` on the dummy extractor reports one record and writes a file.

    The incoming-dataset directory setting is pointed at a temporary folder,
    and no explicit output path is passed to ``run()``.
    """
    incoming_dir = tmp_path / "incoming"
    settings.STATIC_DATASET_INCOMING_DIR = str(incoming_dir)

    snapshot_day = date(2026, 3, 13)
    result = DummyExtractor().run(snapshot_date=snapshot_day)

    assert result.records_count == 1
    assert result.source_name == "dummy_source"
    assert result.output_path is not None
    assert result.output_path.exists()

    # The written file must be valid JSON carrying the snapshot metadata
    # and the normalized record.
    snapshot_text = result.output_path.read_text(encoding="utf-8")
    payload = json.loads(snapshot_text)
    assert payload["source_name"] == "dummy_source"
    assert payload["snapshot_date"] == "2026-03-13"
    first_record = payload["records"][0]
    assert first_record["full_name"] == "Jane Doe"
|
||||
|
||||
|
||||
@pytest.mark.django_db
def test_public_json_extractor_normalizes_common_field_aliases(tmp_path):
    """Alias-style feed keys end up under the canonical snapshot field names.

    The fake feed uses keys such as ``competition_id``, ``player_name`` and
    ``three_pct``; the written snapshot is expected to expose them as
    ``competition_external_id``, ``full_name`` and ``three_pt_pct``, with the
    numeric ids rendered as strings.
    """

    class FakeClient:
        """HTTP client stand-in that serves one record keyed by alias names."""

        def get_json(self, *_args, **_kwargs):
            # Built on every call so each caller gets a fresh payload object.
            aliased_record = {
                "competition_id": 99,
                "competition_name": "National League",
                "season": 2025,
                "team_id": 10,
                "team_name": "Blue Team",
                "player_id": 123,
                "player_name": "John Smith",
                "first_name": "John",
                "last_name": "Smith",
                "birth_date": "2001-05-12",
                "nationality": "US",
                "height_cm": 198,
                "weight_kg": 96,
                "position": "SF",
                "gp": 20,
                "mpg": 28.5,
                "ppg": 14.2,
                "rpg": 5.1,
                "apg": 3.2,
                "spg": 1.1,
                "bpg": 0.5,
                "tov": 1.9,
                "fg_pct": 47.3,
                "three_pct": 36.1,
                "ft_pct": 80.0,
            }
            return {"records": [aliased_record]}

    destination = tmp_path / "public.json"
    extractor = PublicJsonSnapshotExtractor(
        url="https://example.com/public-feed.json",
        source_name="test_public_feed",
        http_client=FakeClient(),
    )

    result = extractor.run(output_path=destination, snapshot_date=date(2026, 3, 13))
    assert result.records_count == 1

    written = json.loads(destination.read_text(encoding="utf-8"))
    row = written["records"][0]

    # Canonical field name -> value expected after normalization.
    expected_fields = {
        "competition_external_id": "99",
        "team_external_id": "10",
        "player_external_id": "123",
        "full_name": "John Smith",
        "three_pt_pct": 36.1,
    }
    for field_name, expected_value in expected_fields.items():
        assert row[field_name] == expected_value
|
||||
|
||||
|
||||
@pytest.mark.django_db
def test_run_extractor_management_command_writes_snapshot(tmp_path, settings):
    """``run_extractor public_json_snapshot`` writes a dated snapshot file.

    The HTTP client class referenced by the ``public_json`` extractor module
    is swapped for a fake while the command runs. Afterwards exactly one
    ``public_json_snapshot-2026-03-13.json`` file must exist in the output
    directory, carrying the configured source name and the fake record.
    """
    settings.EXTRACTOR_PUBLIC_JSON_URL = "https://example.com/feed.json"
    settings.EXTRACTOR_PUBLIC_SOURCE_NAME = "cmd_test_source"
    output_dir = tmp_path / "snapshots"

    class FakeClient:
        """HTTP client stand-in that serves one already-canonical record."""

        def get_json(self, *_args, **_kwargs):
            return {
                "records": [
                    {
                        "competition_external_id": "comp-a",
                        "competition_name": "Alpha League",
                        "season": "2025-2026",
                        "team_external_id": "team-a",
                        "team_name": "Alpha Team",
                        "player_external_id": "player-a",
                        "full_name": "Alpha Player",
                        "first_name": "Alpha",
                        "last_name": "Player",
                        "birth_date": "2000-04-01",
                        "nationality": "US",
                        "height_cm": 190,
                        "weight_kg": 88,
                        "position": "PG",
                        "games_played": 12,
                        "minutes_per_game": 31.0,
                        "points_per_game": 17.0,
                        "rebounds_per_game": 4.0,
                        "assists_per_game": 6.0,
                        "steals_per_game": 1.3,
                        "blocks_per_game": 0.1,
                        "turnovers_per_game": 2.4,
                        "fg_pct": 44.0,
                        "three_pt_pct": 37.0,
                        "ft_pct": 79.0,
                    }
                ]
            }

    # MonkeyPatch.context() undoes the patch on exit, even if the command
    # raises, replacing the manual MonkeyPatch() + try/finally + undo().
    with pytest.MonkeyPatch.context() as patcher:
        patcher.setattr(
            "apps.ingestion.extractors.public_json.ResponsibleHttpClient",
            # The factory accepts keyword arguments only and ignores them.
            lambda **_kwargs: FakeClient(),
        )
        call_command(
            "run_extractor",
            "public_json_snapshot",
            "--output-path",
            str(output_dir),
            "--snapshot-date",
            "2026-03-13",
        )

    files = list(output_dir.glob("public_json_snapshot-2026-03-13.json"))
    assert len(files) == 1

    payload = json.loads(files[0].read_text(encoding="utf-8"))
    assert payload["source_name"] == "cmd_test_source"
    assert payload["records"][0]["full_name"] == "Alpha Player"
|
||||
|
||||
|
||||
def test_http_client_retries_on_retryable_status(monkeypatch):
    """A 429 response followed by a 200 yields the second response's JSON.

    The fake session counts its ``get`` calls so the test can confirm that a
    second request was actually issued, rather than inferring it from the
    returned payload alone. The ``monkeypatch`` fixture is requested by the
    signature but not used in the body.
    """

    class FakeSession:
        """Session stand-in: first ``get`` answers 429, later ones 200."""

        def __init__(self):
            self.calls = 0

        def get(self, *_args, **_kwargs):
            self.calls += 1
            if self.calls == 1:
                return _FakeResponse({"error": "busy"}, status_code=429)
            return _FakeResponse({"records": []}, status_code=200)

    # Keep a handle on the session so its call counter can be inspected.
    session = FakeSession()
    client = ResponsibleHttpClient(
        user_agent="test-agent",
        timeout_seconds=5,
        retries=1,
        # Zero delays keep the test from sleeping.
        retry_sleep_seconds=0,
        request_delay_seconds=0,
        session=session,
    )

    payload = client.get_json("https://example.com/feed.json")

    assert payload == {"records": []}
    # One failed attempt plus one retry; assumes retries=1 means a single
    # extra attempt — NOTE(review): confirm against ResponsibleHttpClient.
    assert session.calls == 2
|
||||
Reference in New Issue
Block a user