feat(v2): add snapshot extractor framework and run command

This commit is contained in:
Alfredo Di Stasio
2026-03-13 14:24:54 +01:00
parent 6fc583c79f
commit 850e4de71b
10 changed files with 796 additions and 0 deletions

View File

@ -36,6 +36,16 @@ STATIC_DATASET_INCOMING_DIR=/app/snapshots/incoming
STATIC_DATASET_ARCHIVE_DIR=/app/snapshots/archive
STATIC_DATASET_FAILED_DIR=/app/snapshots/failed
# Extractor framework (fetch -> parse -> normalize -> emit snapshot)
EXTRACTOR_USER_AGENT=HoopScoutBot/2.0 (+https://younerd.org)
EXTRACTOR_HTTP_TIMEOUT_SECONDS=15
EXTRACTOR_HTTP_RETRIES=2
EXTRACTOR_RETRY_SLEEP_SECONDS=1.0
EXTRACTOR_REQUEST_DELAY_SECONDS=0.5
EXTRACTOR_PUBLIC_JSON_URL=
EXTRACTOR_PUBLIC_SOURCE_NAME=public_json_source
EXTRACTOR_INCLUDE_RAW_PAYLOAD=0
# Future optional scheduler loop settings (not enabled in base v2 runtime)
SCHEDULER_ENABLED=0
SCHEDULER_INTERVAL_SECONDS=900

View File

@ -169,6 +169,55 @@ Import history is visible in Django admin:
- `ImportRun`
- `ImportFile`
## Extractor Framework (v2)
v2 keeps extraction and import as two separate steps:
1. **Extractors** fetch public source content and emit normalized JSON snapshots.
2. **Importer** (`import_snapshots`) validates and upserts those snapshots into PostgreSQL.
Extractor pipeline:
- `fetch` (public endpoint/page requests with conservative HTTP behavior)
- `parse` (source-specific structure)
- `normalize` (map to HoopScout snapshot schema)
- `emit` (write JSON file to incoming directory or custom path)
Built-in extractor in this phase:
- `public_json_snapshot` (generic JSON feed extractor for MVP usage)
Run extractor:
```bash
docker compose exec web python manage.py run_extractor public_json_snapshot
```
Run extractor with explicit output path (debugging):
```bash
docker compose exec web python manage.py run_extractor public_json_snapshot --output-path /app/snapshots/incoming
```
Dry-run validation (no file write):
```bash
docker compose exec web python manage.py run_extractor public_json_snapshot --dry-run
```
Extractor environment variables:
- `EXTRACTOR_USER_AGENT`
- `EXTRACTOR_HTTP_TIMEOUT_SECONDS`
- `EXTRACTOR_HTTP_RETRIES`
- `EXTRACTOR_RETRY_SLEEP_SECONDS`
- `EXTRACTOR_REQUEST_DELAY_SECONDS`
- `EXTRACTOR_PUBLIC_JSON_URL`
- `EXTRACTOR_PUBLIC_SOURCE_NAME`
- `EXTRACTOR_INCLUDE_RAW_PAYLOAD`
Notes:
- extraction is intentionally low-frequency and uses retries conservatively
- only public pages/endpoints should be targeted
- emitted snapshots must match the same schema consumed by `import_snapshots`
## Migration and Superuser Commands
```bash

View File

@ -0,0 +1,22 @@
from .base import (
    BaseSnapshotExtractor,
    ExtractionResult,
    ExtractorConfigError,
    ExtractorError,
    ExtractorFetchError,
    ExtractorNormalizationError,
    ExtractorParseError,
)
from .registry import available_extractors, create_extractor

# Public API of the extractors package; keep this list in sync with the
# imports above so `from apps.ingestion.extractors import *` stays accurate.
__all__ = [
    "BaseSnapshotExtractor",
    "ExtractionResult",
    "ExtractorError",
    "ExtractorConfigError",
    "ExtractorFetchError",
    "ExtractorParseError",
    "ExtractorNormalizationError",
    "available_extractors",
    "create_extractor",
]

View File

@ -0,0 +1,150 @@
from __future__ import annotations
import json
import logging
from abc import ABC, abstractmethod
from dataclasses import dataclass
from datetime import date
from pathlib import Path
from typing import Any
from django.conf import settings
from apps.ingestion.snapshots import SnapshotSchemaValidator
logger = logging.getLogger(__name__)
class ExtractorError(RuntimeError):
    """Base extractor exception; callers (e.g. the run_extractor command) catch this."""


class ExtractorConfigError(ExtractorError):
    """Raised when extractor configuration is invalid (missing URL, unknown extractor name)."""


class ExtractorFetchError(ExtractorError):
    """Raised when remote/source fetch fails (after retries, or on bad response content)."""


class ExtractorParseError(ExtractorError):
    """Raised when fetched content cannot be parsed into source record rows."""


class ExtractorNormalizationError(ExtractorError):
    """Raised when source rows cannot be normalized into snapshot record shape."""
@dataclass(frozen=True)
class ExtractionResult:
    """Immutable summary of one extractor run, returned by BaseSnapshotExtractor.run().

    Frozen because it is a pure result value: consumers (e.g. the
    run_extractor management command) only read these attributes, so
    accidental mutation should be an error.
    """

    extractor_name: str  # registry name of the extractor that ran
    source_name: str  # source name as validated into the snapshot
    snapshot_date: date  # date stamped on the emitted snapshot
    records_count: int  # number of validated records in the snapshot
    output_path: Path | None  # written file path, or None for dry runs
class BaseSnapshotExtractor(ABC):
    """Template for snapshot extractors: fetch -> parse -> normalize -> emit.

    Subclasses implement ``fetch``, ``parse`` and ``normalize_record``;
    ``run()`` drives the whole pipeline, validates the assembled snapshot
    with ``SnapshotSchemaValidator`` and (optionally) writes the JSON file
    that ``import_snapshots`` later consumes.
    """

    # Overridden by subclasses; used in emitted filenames and log lines.
    extractor_name = "base"
    # Overridden by subclasses; stored as "source_name" in emitted snapshots.
    source_name = "unknown_source"

    @abstractmethod
    def fetch(self) -> Any:
        """Fetch source payload from a source endpoint/resource."""

    @abstractmethod
    def parse(self, payload: Any) -> list[dict[str, Any]]:
        """Parse fetched payload into source-specific record dictionaries."""

    @abstractmethod
    def normalize_record(self, source_record: dict[str, Any]) -> dict[str, Any]:
        """Normalize a source record into HoopScout snapshot record shape."""

    def resolve_snapshot_date(self) -> date:
        """Default snapshot date: today's local date. Override per source if needed."""
        return date.today()

    def normalize_records(self, source_records: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """Normalize every parsed row in order, failing fast on the first bad row.

        Raises:
            ExtractorNormalizationError: if any parsed record is not a dict.
        """
        normalized: list[dict[str, Any]] = []
        for idx, row in enumerate(source_records):
            if not isinstance(row, dict):
                raise ExtractorNormalizationError(f"Parsed record at index {idx} must be an object.")
            normalized.append(self.normalize_record(row))
        return normalized

    def build_snapshot(self, records: list[dict[str, Any]], snapshot_date: date) -> dict[str, Any]:
        """Wrap normalized records in the snapshot envelope expected by the importer."""
        return {
            "source_name": self.source_name,
            "snapshot_date": snapshot_date.isoformat(),
            "records": records,
        }

    def default_output_dir(self) -> Path:
        """Directory the importer watches for new snapshots (STATIC_DATASET_INCOMING_DIR)."""
        return Path(settings.STATIC_DATASET_INCOMING_DIR)

    def snapshot_filename(self, snapshot_date: date) -> str:
        """Deterministic filename: ``<extractor_name>-<YYYY-MM-DD>.json``."""
        return f"{self.extractor_name}-{snapshot_date.isoformat()}.json"

    def emit_snapshot(
        self,
        snapshot: dict[str, Any],
        *,
        output_path: str | Path | None = None,
        indent: int = 2,
    ) -> Path:
        """Write the snapshot JSON to disk and return the written file path.

        ``output_path`` semantics:
        - None: write into the default incoming directory under the default name.
        - a path ending in ``.json``: treated as the exact target file.
        - any other path: treated as a directory; the default filename is used.
        Parent directories are created as needed in every case.
        """
        if output_path is None:
            destination = self.default_output_dir()
            destination.mkdir(parents=True, exist_ok=True)
            file_path = destination / self.snapshot_filename(date.fromisoformat(snapshot["snapshot_date"]))
        else:
            target = Path(output_path)
            if target.suffix.lower() == ".json":
                target.parent.mkdir(parents=True, exist_ok=True)
                file_path = target
            else:
                target.mkdir(parents=True, exist_ok=True)
                file_path = target / self.snapshot_filename(date.fromisoformat(snapshot["snapshot_date"]))
        file_path.write_text(json.dumps(snapshot, indent=indent, ensure_ascii=True), encoding="utf-8")
        return file_path

    def run(
        self,
        *,
        output_path: str | Path | None = None,
        snapshot_date: date | None = None,
        write_output: bool = True,
        indent: int = 2,
    ) -> ExtractionResult:
        """Execute the full extraction pipeline and return an ExtractionResult.

        With ``write_output=False`` (dry run) the snapshot is still fully
        built and validated, but no file is written and the result's
        ``output_path`` is None.
        """
        payload = self.fetch()
        source_rows = self.parse(payload)
        normalized_rows = self.normalize_records(source_rows)
        resolved_snapshot_date = snapshot_date or self.resolve_snapshot_date()
        snapshot = self.build_snapshot(normalized_rows, resolved_snapshot_date)
        validated = SnapshotSchemaValidator.validate(snapshot)
        # Emit the validator's canonical records rather than our raw ones.
        snapshot["records"] = validated.records
        output_file: Path | None = None
        if write_output:
            output_file = self.emit_snapshot(snapshot, output_path=output_path, indent=indent)
            logger.info(
                "extractor_snapshot_written extractor=%s source=%s records=%s path=%s",
                self.extractor_name,
                validated.source_name,
                len(validated.records),
                output_file,
            )
        else:
            logger.info(
                "extractor_snapshot_validated extractor=%s source=%s records=%s write_output=0",
                self.extractor_name,
                validated.source_name,
                len(validated.records),
            )
        return ExtractionResult(
            extractor_name=self.extractor_name,
            source_name=validated.source_name,
            snapshot_date=validated.snapshot_date,
            records_count=len(validated.records),
            output_path=output_file,
        )

View File

@ -0,0 +1,109 @@
from __future__ import annotations
import logging
import time
from typing import Any
import requests
from .base import ExtractorFetchError
logger = logging.getLogger(__name__)
class ResponsibleHttpClient:
    """
    Small HTTP helper for public-source extraction:
    - explicit User-Agent
    - request timeout
    - conservative retries
    - low-frequency pacing (fixed delay between requests)
    """

    # Transient server/rate-limit statuses worth retrying.
    RETRYABLE_STATUS_CODES = {429, 500, 502, 503, 504}

    def __init__(
        self,
        *,
        user_agent: str,
        timeout_seconds: float,
        retries: int,
        retry_sleep_seconds: float,
        request_delay_seconds: float,
        session: requests.Session | None = None,
    ):
        self.user_agent = user_agent
        self.timeout_seconds = timeout_seconds
        self.retries = retries
        self.retry_sleep_seconds = retry_sleep_seconds
        self.request_delay_seconds = request_delay_seconds
        self.session = session or requests.Session()
        # Monotonic timestamp of the last paced request, used by _pace().
        self._last_request_at: float | None = None

    def _pace(self) -> None:
        """Sleep just long enough to honor the configured inter-request delay."""
        if self.request_delay_seconds <= 0:
            return
        now = time.monotonic()
        if self._last_request_at is None:
            # First request: nothing to pace against, just record the time.
            self._last_request_at = now
            return
        elapsed = now - self._last_request_at
        remaining = self.request_delay_seconds - elapsed
        if remaining > 0:
            time.sleep(remaining)
        self._last_request_at = time.monotonic()

    def get_json(
        self,
        url: str,
        *,
        params: dict[str, Any] | None = None,
        headers: dict[str, str] | None = None,
    ) -> Any:
        """GET *url* and return its decoded JSON body.

        Retries transient failures (retryable statuses and transport errors)
        up to ``self.retries`` times. A response body that is not valid JSON
        is NOT retried: it indicates a source defect, not a transient fault.

        Raises:
            ExtractorFetchError: when retries are exhausted, a non-retryable
                HTTP error occurs, or the response body is not valid JSON.
        """
        merged_headers = {"User-Agent": self.user_agent}
        if headers:
            merged_headers.update(headers)
        attempts = self.retries + 1
        for attempt in range(1, attempts + 1):
            try:
                self._pace()
                response = self.session.get(
                    url,
                    params=params,
                    headers=merged_headers,
                    timeout=self.timeout_seconds,
                )
                if response.status_code in self.RETRYABLE_STATUS_CODES:
                    if attempt < attempts:
                        logger.warning(
                            "extractor_http_retryable_status status=%s url=%s attempt=%s/%s",
                            response.status_code,
                            url,
                            attempt,
                            attempts,
                        )
                        time.sleep(self.retry_sleep_seconds)
                        continue
                    raise ExtractorFetchError(
                        f"Retryable status exhausted: status={response.status_code} url={url}"
                    )
                response.raise_for_status()
            except requests.RequestException as exc:
                if attempt < attempts:
                    logger.warning(
                        "extractor_http_request_retry error=%s url=%s attempt=%s/%s",
                        exc,
                        url,
                        attempt,
                        attempts,
                    )
                    time.sleep(self.retry_sleep_seconds)
                    continue
                raise ExtractorFetchError(f"Request failed after retries: {exc}") from exc
            # Parse OUTSIDE the transport retry handler. In requests >= 2.27,
            # response.json() raises requests.exceptions.JSONDecodeError, which
            # subclasses RequestException AND ValueError; parsing inside the
            # try block above would retry malformed bodies and never reach the
            # dedicated "Invalid JSON" error below.
            try:
                return response.json()
            except ValueError as exc:
                raise ExtractorFetchError(f"Invalid JSON response from {url}: {exc}") from exc
        raise ExtractorFetchError(f"Unexpected retry loop exit for {url}")

View File

@ -0,0 +1,130 @@
from __future__ import annotations
from typing import Any
from django.conf import settings
from .base import (
BaseSnapshotExtractor,
ExtractorConfigError,
ExtractorNormalizationError,
ExtractorParseError,
)
from .http import ResponsibleHttpClient
def _first_non_empty(record: dict[str, Any], *keys: str) -> Any:
for key in keys:
if key in record and record[key] not in (None, ""):
return record[key]
return None
class PublicJsonSnapshotExtractor(BaseSnapshotExtractor):
    """
    Generic public JSON extractor for MVP v2.

    Fetches a single public JSON endpoint, accepts either a bare array or an
    object wrapping rows under "records"/"data", and maps commonly seen field
    aliases onto the HoopScout snapshot record shape.
    """

    extractor_name = "public_json_snapshot"

    # Target snapshot field -> accepted source aliases, in output key order.
    _FIELD_ALIASES = (
        ("competition_external_id", ("competition_external_id", "competition_id", "league_id")),
        ("competition_name", ("competition_name", "competition", "league_name")),
        ("season", ("season", "season_label", "season_name")),
        ("team_external_id", ("team_external_id", "team_id")),
        ("team_name", ("team_name", "team")),
        ("player_external_id", ("player_external_id", "player_id")),
        ("full_name", ("full_name", "player_name", "name")),
        ("first_name", ("first_name",)),
        ("last_name", ("last_name",)),
        ("birth_date", ("birth_date",)),
        ("nationality", ("nationality", "nationality_code")),
        ("height_cm", ("height_cm",)),
        ("weight_kg", ("weight_kg",)),
        ("position", ("position",)),
        ("role", ("role",)),
        ("games_played", ("games_played", "gp")),
        ("minutes_per_game", ("minutes_per_game", "mpg")),
        ("points_per_game", ("points_per_game", "ppg")),
        ("rebounds_per_game", ("rebounds_per_game", "rpg")),
        ("assists_per_game", ("assists_per_game", "apg")),
        ("steals_per_game", ("steals_per_game", "spg")),
        ("blocks_per_game", ("blocks_per_game", "bpg")),
        ("turnovers_per_game", ("turnovers_per_game", "tov")),
        ("fg_pct", ("fg_pct",)),
        ("three_pt_pct", ("three_pt_pct", "three_point_pct", "three_pct", "3p_pct")),
        ("ft_pct", ("ft_pct",)),
    )

    # Fields allowed to be absent or empty in a source row.
    _OPTIONAL_FIELDS = frozenset({"role"})

    # Identifier-ish fields coerced to stripped strings after mapping.
    _STRING_ID_FIELDS = ("season", "competition_external_id", "team_external_id", "player_external_id")

    def __init__(
        self,
        *,
        url: str | None = None,
        source_name: str | None = None,
        include_raw_payload: bool | None = None,
        http_client: ResponsibleHttpClient | None = None,
    ):
        # Settings are only consulted for values not supplied explicitly.
        self.url = (url or settings.EXTRACTOR_PUBLIC_JSON_URL).strip()
        self.source_name = (source_name or settings.EXTRACTOR_PUBLIC_SOURCE_NAME).strip()
        if include_raw_payload is None:
            include_raw_payload = settings.EXTRACTOR_INCLUDE_RAW_PAYLOAD
        self.include_raw_payload = include_raw_payload
        if not self.url:
            raise ExtractorConfigError("EXTRACTOR_PUBLIC_JSON_URL is required for public_json_snapshot extractor.")
        if not self.source_name:
            raise ExtractorConfigError("EXTRACTOR_PUBLIC_SOURCE_NAME must not be empty.")
        self.http_client = http_client or ResponsibleHttpClient(
            user_agent=settings.EXTRACTOR_USER_AGENT,
            timeout_seconds=settings.EXTRACTOR_HTTP_TIMEOUT_SECONDS,
            retries=settings.EXTRACTOR_HTTP_RETRIES,
            retry_sleep_seconds=settings.EXTRACTOR_RETRY_SLEEP_SECONDS,
            request_delay_seconds=settings.EXTRACTOR_REQUEST_DELAY_SECONDS,
        )

    def fetch(self) -> Any:
        """Fetch the configured public JSON feed."""
        return self.http_client.get_json(self.url)

    def parse(self, payload: Any) -> list[dict[str, Any]]:
        """Accept a bare JSON array, or an object with a 'records'/'data' list."""
        if isinstance(payload, list):
            return payload
        if isinstance(payload, dict):
            for container_key in ("records", "data"):
                candidate = payload.get(container_key)
                if isinstance(candidate, list):
                    return candidate
            raise ExtractorParseError("Payload must contain 'records' or 'data' list.")
        raise ExtractorParseError("Fetched payload must be a JSON object or array.")

    def normalize_record(self, source_record: dict[str, Any]) -> dict[str, Any]:
        """Map one aliased source row onto the canonical snapshot record shape.

        Raises:
            ExtractorNormalizationError: if any non-optional field is missing/empty.
        """
        normalized = {
            field: _first_non_empty(source_record, *aliases)
            for field, aliases in self._FIELD_ALIASES
        }
        missing = sorted(
            field
            for field, value in normalized.items()
            if field not in self._OPTIONAL_FIELDS and value in (None, "")
        )
        if missing:
            raise ExtractorNormalizationError(
                f"public_json_snapshot row missing required fields: {', '.join(missing)}"
            )
        for id_field in self._STRING_ID_FIELDS:
            normalized[id_field] = str(normalized[id_field]).strip()
        if self.include_raw_payload:
            normalized["raw_payload"] = source_record
        return normalized

View File

@ -0,0 +1,22 @@
from __future__ import annotations
from .base import BaseSnapshotExtractor, ExtractorConfigError
from .public_json import PublicJsonSnapshotExtractor
# Maps extractor name -> extractor class. Register new extractors here so
# available_extractors() / create_extractor() (and the run_extractor
# management command) can discover them.
EXTRACTOR_REGISTRY: dict[str, type[BaseSnapshotExtractor]] = {
    PublicJsonSnapshotExtractor.extractor_name: PublicJsonSnapshotExtractor,
}
def available_extractors() -> list[str]:
    """Return the registered extractor names in alphabetical order."""
    return sorted(EXTRACTOR_REGISTRY)
def create_extractor(extractor_name: str) -> BaseSnapshotExtractor:
    """Instantiate the registered extractor class for *extractor_name*.

    Raises:
        ExtractorConfigError: if no extractor is registered under that name.
    """
    try:
        extractor_cls = EXTRACTOR_REGISTRY[extractor_name]
    except KeyError as exc:
        known = ", ".join(available_extractors())
        raise ExtractorConfigError(
            f"Unknown extractor '{extractor_name}'. Available: {known}"
        ) from exc
    return extractor_cls()

View File

@ -0,0 +1,63 @@
from __future__ import annotations
from django.core.management.base import BaseCommand, CommandError
from django.utils.dateparse import parse_date
from apps.ingestion.extractors import ExtractorError, available_extractors, create_extractor
class Command(BaseCommand):
    """Management command front-end for the extractor framework."""

    help = "Run a snapshot extractor and emit importable JSON snapshots."

    def add_arguments(self, parser):
        """Register the extractor name plus output/date/dry-run/indent options."""
        parser.add_argument("extractor_name", choices=available_extractors())
        parser.add_argument(
            "--output-path",
            dest="output_path",
            default=None,
            help="Directory or .json file path where snapshot should be written. Defaults to incoming dir.",
        )
        parser.add_argument(
            "--snapshot-date",
            dest="snapshot_date",
            default=None,
            help="Override snapshot date in YYYY-MM-DD format.",
        )
        parser.add_argument(
            "--dry-run",
            action="store_true",
            help="Validate extraction/normalization without writing snapshot file.",
        )
        parser.add_argument(
            "--indent",
            type=int,
            default=2,
            help="JSON indent level for emitted snapshots.",
        )

    def handle(self, *args, **options):
        """Run the selected extractor, translating extractor failures into CommandError."""
        raw_date = options["snapshot_date"]
        snapshot_date = None
        if raw_date:
            snapshot_date = parse_date(raw_date)
            if snapshot_date is None:
                raise CommandError("--snapshot-date must be YYYY-MM-DD.")
        try:
            extractor = create_extractor(options["extractor_name"])
            result = extractor.run(
                output_path=options["output_path"],
                snapshot_date=snapshot_date,
                write_output=not options["dry_run"],
                indent=options["indent"],
            )
        except ExtractorError as exc:
            raise CommandError(str(exc)) from exc
        shown_path = str(result.output_path) if result.output_path else "<dry-run>"
        summary = (
            f"Extractor {result.extractor_name} completed: "
            f"source={result.source_name} date={result.snapshot_date} "
            f"records={result.records_count} output={shown_path}"
        )
        self.stdout.write(self.style.SUCCESS(summary))

View File

@ -156,6 +156,25 @@ STATIC_DATASET_FAILED_DIR = os.getenv(
os.getenv("SNAPSHOT_FAILED_DIR", str(BASE_DIR / "snapshots" / "failed")),
)
# v2 extractor framework runtime settings.
# HTTP client behavior for apps.ingestion.extractors.http.ResponsibleHttpClient.
EXTRACTOR_USER_AGENT = os.getenv("EXTRACTOR_USER_AGENT", "HoopScoutBot/2.0 (+https://younerd.org)")
EXTRACTOR_HTTP_TIMEOUT_SECONDS = float(os.getenv("EXTRACTOR_HTTP_TIMEOUT_SECONDS", "15"))
EXTRACTOR_HTTP_RETRIES = int(os.getenv("EXTRACTOR_HTTP_RETRIES", "2"))
EXTRACTOR_RETRY_SLEEP_SECONDS = float(os.getenv("EXTRACTOR_RETRY_SLEEP_SECONDS", "1.0"))
EXTRACTOR_REQUEST_DELAY_SECONDS = float(os.getenv("EXTRACTOR_REQUEST_DELAY_SECONDS", "0.5"))
# Defaults consumed by the generic public_json_snapshot extractor.
EXTRACTOR_PUBLIC_JSON_URL = os.getenv("EXTRACTOR_PUBLIC_JSON_URL", "").strip()
EXTRACTOR_PUBLIC_SOURCE_NAME = os.getenv("EXTRACTOR_PUBLIC_SOURCE_NAME", "public_json_source").strip()
EXTRACTOR_INCLUDE_RAW_PAYLOAD = env_bool("EXTRACTOR_INCLUDE_RAW_PAYLOAD", False)
# Fail fast at startup on nonsensical numeric values instead of at request time.
if EXTRACTOR_HTTP_TIMEOUT_SECONDS <= 0:
    raise ImproperlyConfigured("EXTRACTOR_HTTP_TIMEOUT_SECONDS must be > 0.")
if EXTRACTOR_HTTP_RETRIES < 0:
    raise ImproperlyConfigured("EXTRACTOR_HTTP_RETRIES must be >= 0.")
if EXTRACTOR_RETRY_SLEEP_SECONDS < 0:
    raise ImproperlyConfigured("EXTRACTOR_RETRY_SLEEP_SECONDS must be >= 0.")
if EXTRACTOR_REQUEST_DELAY_SECONDS < 0:
    raise ImproperlyConfigured("EXTRACTOR_REQUEST_DELAY_SECONDS must be >= 0.")
# Optional scheduler command settings for future v2 snapshot jobs.
SCHEDULER_ENABLED = env_bool("SCHEDULER_ENABLED", False)
SCHEDULER_INTERVAL_SECONDS = int(os.getenv("SCHEDULER_INTERVAL_SECONDS", "900"))

View File

@ -0,0 +1,222 @@
from __future__ import annotations
import json
from datetime import date
import pytest
from django.core.management import call_command
from apps.ingestion.extractors.base import BaseSnapshotExtractor
from apps.ingestion.extractors.http import ResponsibleHttpClient
from apps.ingestion.extractors.public_json import PublicJsonSnapshotExtractor
class DummyExtractor(BaseSnapshotExtractor):
    """Minimal concrete extractor used to exercise BaseSnapshotExtractor.run()."""

    extractor_name = "dummy"
    source_name = "dummy_source"

    def fetch(self):
        # Static payload; tests never touch the network.
        return {"rows": [{"name": "Jane Doe"}]}

    def parse(self, payload):
        return payload["rows"]

    def normalize_record(self, source_record):
        # Fully populated record so snapshot schema validation passes;
        # only `full_name` actually comes from the parsed row.
        return {
            "competition_external_id": "comp-1",
            "competition_name": "League One",
            "season": "2025-2026",
            "team_external_id": "team-1",
            "team_name": "Team One",
            "player_external_id": "player-1",
            "full_name": source_record["name"],
            "first_name": "Jane",
            "last_name": "Doe",
            "birth_date": "2000-01-01",
            "nationality": "US",
            "height_cm": 180,
            "weight_kg": 75,
            "position": "SG",
            "games_played": 10,
            "minutes_per_game": 30.0,
            "points_per_game": 15.0,
            "rebounds_per_game": 4.0,
            "assists_per_game": 3.0,
            "steals_per_game": 1.2,
            "blocks_per_game": 0.4,
            "turnovers_per_game": 2.0,
            "fg_pct": 45.0,
            "three_pt_pct": 35.0,
            "ft_pct": 82.0,
        }
class _FakeResponse:
def __init__(self, payload, status_code=200):
self._payload = payload
self.status_code = status_code
def raise_for_status(self):
if self.status_code >= 400:
raise RuntimeError(f"status={self.status_code}")
def json(self):
return self._payload
@pytest.mark.django_db
def test_base_extractor_run_writes_snapshot_file(tmp_path, settings):
    """run() writes a validated snapshot JSON into the configured incoming dir."""
    settings.STATIC_DATASET_INCOMING_DIR = str(tmp_path / "incoming")
    outcome = DummyExtractor().run(snapshot_date=date(2026, 3, 13))
    assert outcome.source_name == "dummy_source"
    assert outcome.records_count == 1
    assert outcome.output_path is not None
    assert outcome.output_path.exists()
    written = json.loads(outcome.output_path.read_text(encoding="utf-8"))
    assert written["snapshot_date"] == "2026-03-13"
    assert written["source_name"] == "dummy_source"
    assert written["records"][0]["full_name"] == "Jane Doe"
@pytest.mark.django_db
def test_public_json_extractor_normalizes_common_field_aliases(tmp_path):
    """Alias keys (competition_id, gp, ppg, ...) map onto canonical snapshot
    fields, and identifier fields are coerced to stripped strings."""

    class FakeClient:
        # Stands in for ResponsibleHttpClient: returns one row using alias keys.
        def get_json(self, *_args, **_kwargs):
            return {
                "records": [
                    {
                        "competition_id": 99,
                        "competition_name": "National League",
                        "season": 2025,
                        "team_id": 10,
                        "team_name": "Blue Team",
                        "player_id": 123,
                        "player_name": "John Smith",
                        "first_name": "John",
                        "last_name": "Smith",
                        "birth_date": "2001-05-12",
                        "nationality": "US",
                        "height_cm": 198,
                        "weight_kg": 96,
                        "position": "SF",
                        "gp": 20,
                        "mpg": 28.5,
                        "ppg": 14.2,
                        "rpg": 5.1,
                        "apg": 3.2,
                        "spg": 1.1,
                        "bpg": 0.5,
                        "tov": 1.9,
                        "fg_pct": 47.3,
                        "three_pct": 36.1,
                        "ft_pct": 80.0,
                    }
                ]
            }

    extractor = PublicJsonSnapshotExtractor(
        url="https://example.com/public-feed.json",
        source_name="test_public_feed",
        http_client=FakeClient(),
    )
    output_file = tmp_path / "public.json"
    result = extractor.run(output_path=output_file, snapshot_date=date(2026, 3, 13))
    assert result.records_count == 1
    payload = json.loads(output_file.read_text(encoding="utf-8"))
    row = payload["records"][0]
    # Numeric source ids must come out as strings in the snapshot.
    assert row["competition_external_id"] == "99"
    assert row["team_external_id"] == "10"
    assert row["player_external_id"] == "123"
    assert row["full_name"] == "John Smith"
    assert row["three_pt_pct"] == 36.1
@pytest.mark.django_db
def test_run_extractor_management_command_writes_snapshot(tmp_path, settings, monkeypatch):
    """End-to-end: the run_extractor command writes a snapshot named after the
    extractor and date, using configured source settings.

    Uses the standard pytest `monkeypatch` fixture instead of a manually
    constructed pytest.MonkeyPatch + try/finally — the fixture guarantees
    undo on any failure path without boilerplate.
    """
    settings.EXTRACTOR_PUBLIC_JSON_URL = "https://example.com/feed.json"
    settings.EXTRACTOR_PUBLIC_SOURCE_NAME = "cmd_test_source"
    output_dir = tmp_path / "snapshots"

    class FakeClient:
        # Stands in for ResponsibleHttpClient: one fully canonical record.
        def get_json(self, *_args, **_kwargs):
            return {
                "records": [
                    {
                        "competition_external_id": "comp-a",
                        "competition_name": "Alpha League",
                        "season": "2025-2026",
                        "team_external_id": "team-a",
                        "team_name": "Alpha Team",
                        "player_external_id": "player-a",
                        "full_name": "Alpha Player",
                        "first_name": "Alpha",
                        "last_name": "Player",
                        "birth_date": "2000-04-01",
                        "nationality": "US",
                        "height_cm": 190,
                        "weight_kg": 88,
                        "position": "PG",
                        "games_played": 12,
                        "minutes_per_game": 31.0,
                        "points_per_game": 17.0,
                        "rebounds_per_game": 4.0,
                        "assists_per_game": 6.0,
                        "steals_per_game": 1.3,
                        "blocks_per_game": 0.1,
                        "turnovers_per_game": 2.4,
                        "fg_pct": 44.0,
                        "three_pt_pct": 37.0,
                        "ft_pct": 79.0,
                    }
                ]
            }

    monkeypatch.setattr(
        "apps.ingestion.extractors.public_json.ResponsibleHttpClient",
        lambda **_kwargs: FakeClient(),
    )
    call_command(
        "run_extractor",
        "public_json_snapshot",
        "--output-path",
        str(output_dir),
        "--snapshot-date",
        "2026-03-13",
    )
    files = list(output_dir.glob("public_json_snapshot-2026-03-13.json"))
    assert len(files) == 1
    payload = json.loads(files[0].read_text(encoding="utf-8"))
    assert payload["source_name"] == "cmd_test_source"
    assert payload["records"][0]["full_name"] == "Alpha Player"
def test_http_client_retries_on_retryable_status():
    """A retryable status (429) is retried and the second, successful response wins.

    Fixes: the original took an unused `monkeypatch` fixture parameter; it is
    removed. Also asserts the call count so the retry is actually verified
    rather than implied.
    """

    class FakeSession:
        def __init__(self):
            self.calls = 0

        def get(self, *_args, **_kwargs):
            self.calls += 1
            if self.calls == 1:
                return _FakeResponse({"error": "busy"}, status_code=429)
            return _FakeResponse({"records": []}, status_code=200)

    session = FakeSession()
    client = ResponsibleHttpClient(
        user_agent="test-agent",
        timeout_seconds=5,
        retries=1,
        retry_sleep_seconds=0,
        request_delay_seconds=0,
        session=session,
    )
    payload = client.get_json("https://example.com/feed.json")
    assert payload == {"records": []}
    # First attempt hit the 429, second attempt succeeded.
    assert session.calls == 2