From 850e4de71b429ed210d3c54f6790ce75494f0717 Mon Sep 17 00:00:00 2001 From: Alfredo Di Stasio Date: Fri, 13 Mar 2026 14:24:54 +0100 Subject: [PATCH] feat(v2): add snapshot extractor framework and run command --- .env.example | 10 + README.md | 49 ++++ apps/ingestion/extractors/__init__.py | 22 ++ apps/ingestion/extractors/base.py | 150 ++++++++++++ apps/ingestion/extractors/http.py | 109 +++++++++ apps/ingestion/extractors/public_json.py | 130 ++++++++++ apps/ingestion/extractors/registry.py | 22 ++ .../management/commands/run_extractor.py | 63 +++++ config/settings/base.py | 19 ++ tests/test_extractors_framework.py | 222 ++++++++++++++++++ 10 files changed, 796 insertions(+) create mode 100644 apps/ingestion/extractors/__init__.py create mode 100644 apps/ingestion/extractors/base.py create mode 100644 apps/ingestion/extractors/http.py create mode 100644 apps/ingestion/extractors/public_json.py create mode 100644 apps/ingestion/extractors/registry.py create mode 100644 apps/ingestion/management/commands/run_extractor.py create mode 100644 tests/test_extractors_framework.py diff --git a/.env.example b/.env.example index d9a4cec..be47342 100644 --- a/.env.example +++ b/.env.example @@ -36,6 +36,16 @@ STATIC_DATASET_INCOMING_DIR=/app/snapshots/incoming STATIC_DATASET_ARCHIVE_DIR=/app/snapshots/archive STATIC_DATASET_FAILED_DIR=/app/snapshots/failed +# Extractor framework (fetch -> parse -> normalize -> emit snapshot) +EXTRACTOR_USER_AGENT=HoopScoutBot/2.0 (+https://younerd.org) +EXTRACTOR_HTTP_TIMEOUT_SECONDS=15 +EXTRACTOR_HTTP_RETRIES=2 +EXTRACTOR_RETRY_SLEEP_SECONDS=1.0 +EXTRACTOR_REQUEST_DELAY_SECONDS=0.5 +EXTRACTOR_PUBLIC_JSON_URL= +EXTRACTOR_PUBLIC_SOURCE_NAME=public_json_source +EXTRACTOR_INCLUDE_RAW_PAYLOAD=0 + # Future optional scheduler loop settings (not enabled in base v2 runtime) SCHEDULER_ENABLED=0 SCHEDULER_INTERVAL_SECONDS=900 diff --git a/README.md b/README.md index d8d0fec..7e6e1bb 100644 --- a/README.md +++ b/README.md @@ -169,6 +169,55 @@ Import 
history is visible in Django admin: - `ImportRun` - `ImportFile` +## Extractor Framework (v2) + +v2 keeps extraction and import as two separate steps: + +1. **Extractors** fetch public source content and emit normalized JSON snapshots. +2. **Importer** (`import_snapshots`) validates and upserts those snapshots into PostgreSQL. + +Extractor pipeline: +- `fetch` (public endpoint/page requests with conservative HTTP behavior) +- `parse` (source-specific structure) +- `normalize` (map to HoopScout snapshot schema) +- `emit` (write JSON file to incoming directory or custom path) + +Built-in extractor in this phase: +- `public_json_snapshot` (generic JSON feed extractor for MVP usage) + +Run extractor: + +```bash +docker compose exec web python manage.py run_extractor public_json_snapshot +``` + +Run extractor with explicit output path (debugging): + +```bash +docker compose exec web python manage.py run_extractor public_json_snapshot --output-path /app/snapshots/incoming +``` + +Dry-run validation (no file write): + +```bash +docker compose exec web python manage.py run_extractor public_json_snapshot --dry-run +``` + +Extractor environment variables: +- `EXTRACTOR_USER_AGENT` +- `EXTRACTOR_HTTP_TIMEOUT_SECONDS` +- `EXTRACTOR_HTTP_RETRIES` +- `EXTRACTOR_RETRY_SLEEP_SECONDS` +- `EXTRACTOR_REQUEST_DELAY_SECONDS` +- `EXTRACTOR_PUBLIC_JSON_URL` +- `EXTRACTOR_PUBLIC_SOURCE_NAME` +- `EXTRACTOR_INCLUDE_RAW_PAYLOAD` + +Notes: +- extraction is intentionally low-frequency and uses retries conservatively +- only public pages/endpoints should be targeted +- emitted snapshots must match the same schema consumed by `import_snapshots` + ## Migration and Superuser Commands ```bash diff --git a/apps/ingestion/extractors/__init__.py b/apps/ingestion/extractors/__init__.py new file mode 100644 index 0000000..d459cd8 --- /dev/null +++ b/apps/ingestion/extractors/__init__.py @@ -0,0 +1,22 @@ +from .base import ( + BaseSnapshotExtractor, + ExtractionResult, + ExtractorConfigError, + 
ExtractorError, + ExtractorFetchError, + ExtractorNormalizationError, + ExtractorParseError, +) +from .registry import available_extractors, create_extractor + +__all__ = [ + "BaseSnapshotExtractor", + "ExtractionResult", + "ExtractorError", + "ExtractorConfigError", + "ExtractorFetchError", + "ExtractorParseError", + "ExtractorNormalizationError", + "available_extractors", + "create_extractor", +] diff --git a/apps/ingestion/extractors/base.py b/apps/ingestion/extractors/base.py new file mode 100644 index 0000000..96c6d84 --- /dev/null +++ b/apps/ingestion/extractors/base.py @@ -0,0 +1,150 @@ +from __future__ import annotations + +import json +import logging +from abc import ABC, abstractmethod +from dataclasses import dataclass +from datetime import date +from pathlib import Path +from typing import Any + +from django.conf import settings + +from apps.ingestion.snapshots import SnapshotSchemaValidator + +logger = logging.getLogger(__name__) + + +class ExtractorError(RuntimeError): + """Base extractor exception.""" + + +class ExtractorConfigError(ExtractorError): + """Raised when extractor configuration is invalid.""" + + +class ExtractorFetchError(ExtractorError): + """Raised when remote/source fetch fails.""" + + +class ExtractorParseError(ExtractorError): + """Raised when fetched content cannot be parsed.""" + + +class ExtractorNormalizationError(ExtractorError): + """Raised when source rows cannot be normalized.""" + + +@dataclass +class ExtractionResult: + extractor_name: str + source_name: str + snapshot_date: date + records_count: int + output_path: Path | None + + +class BaseSnapshotExtractor(ABC): + extractor_name = "base" + source_name = "unknown_source" + + @abstractmethod + def fetch(self) -> Any: + """Fetch source payload from a source endpoint/resource.""" + + @abstractmethod + def parse(self, payload: Any) -> list[dict[str, Any]]: + """Parse fetched payload into source-specific record dictionaries.""" + + @abstractmethod + def normalize_record(self, 
source_record: dict[str, Any]) -> dict[str, Any]: + """Normalize a source record into HoopScout snapshot record shape.""" + + def resolve_snapshot_date(self) -> date: + return date.today() + + def normalize_records(self, source_records: list[dict[str, Any]]) -> list[dict[str, Any]]: + normalized: list[dict[str, Any]] = [] + for idx, row in enumerate(source_records): + if not isinstance(row, dict): + raise ExtractorNormalizationError(f"Parsed record at index {idx} must be an object.") + normalized.append(self.normalize_record(row)) + return normalized + + def build_snapshot(self, records: list[dict[str, Any]], snapshot_date: date) -> dict[str, Any]: + return { + "source_name": self.source_name, + "snapshot_date": snapshot_date.isoformat(), + "records": records, + } + + def default_output_dir(self) -> Path: + return Path(settings.STATIC_DATASET_INCOMING_DIR) + + def snapshot_filename(self, snapshot_date: date) -> str: + return f"{self.extractor_name}-{snapshot_date.isoformat()}.json" + + def emit_snapshot( + self, + snapshot: dict[str, Any], + *, + output_path: str | Path | None = None, + indent: int = 2, + ) -> Path: + if output_path is None: + destination = self.default_output_dir() + destination.mkdir(parents=True, exist_ok=True) + file_path = destination / self.snapshot_filename(date.fromisoformat(snapshot["snapshot_date"])) + else: + target = Path(output_path) + if target.suffix.lower() == ".json": + target.parent.mkdir(parents=True, exist_ok=True) + file_path = target + else: + target.mkdir(parents=True, exist_ok=True) + file_path = target / self.snapshot_filename(date.fromisoformat(snapshot["snapshot_date"])) + + file_path.write_text(json.dumps(snapshot, indent=indent, ensure_ascii=True), encoding="utf-8") + return file_path + + def run( + self, + *, + output_path: str | Path | None = None, + snapshot_date: date | None = None, + write_output: bool = True, + indent: int = 2, + ) -> ExtractionResult: + payload = self.fetch() + source_rows = self.parse(payload) + 
normalized_rows = self.normalize_records(source_rows) + resolved_snapshot_date = snapshot_date or self.resolve_snapshot_date() + snapshot = self.build_snapshot(normalized_rows, resolved_snapshot_date) + validated = SnapshotSchemaValidator.validate(snapshot) + snapshot["records"] = validated.records + + output_file: Path | None = None + if write_output: + output_file = self.emit_snapshot(snapshot, output_path=output_path, indent=indent) + logger.info( + "extractor_snapshot_written extractor=%s source=%s records=%s path=%s", + self.extractor_name, + validated.source_name, + len(validated.records), + output_file, + ) + else: + logger.info( + "extractor_snapshot_validated extractor=%s source=%s records=%s write_output=0", + self.extractor_name, + validated.source_name, + len(validated.records), + ) + + return ExtractionResult( + extractor_name=self.extractor_name, + source_name=validated.source_name, + snapshot_date=validated.snapshot_date, + records_count=len(validated.records), + output_path=output_file, + ) diff --git a/apps/ingestion/extractors/http.py b/apps/ingestion/extractors/http.py new file mode 100644 index 0000000..a25a677 --- /dev/null +++ b/apps/ingestion/extractors/http.py @@ -0,0 +1,109 @@ +from __future__ import annotations + +import logging +import time +from typing import Any + +import requests + +from .base import ExtractorFetchError + +logger = logging.getLogger(__name__) + + +class ResponsibleHttpClient: + """ + Small HTTP helper for public-source extraction: + - explicit User-Agent + - request timeout + - conservative retries + - low-frequency pacing (fixed delay between requests) + """ + + RETRYABLE_STATUS_CODES = {429, 500, 502, 503, 504} + + def __init__( + self, + *, + user_agent: str, + timeout_seconds: float, + retries: int, + retry_sleep_seconds: float, + request_delay_seconds: float, + session: requests.Session | None = None, + ): + self.user_agent = user_agent + self.timeout_seconds = timeout_seconds + self.retries = retries + 
self.retry_sleep_seconds = retry_sleep_seconds + self.request_delay_seconds = request_delay_seconds + self.session = session or requests.Session() + self._last_request_at: float | None = None + + def _pace(self) -> None: + if self.request_delay_seconds <= 0: + return + now = time.monotonic() + if self._last_request_at is None: + self._last_request_at = now + return + elapsed = now - self._last_request_at + remaining = self.request_delay_seconds - elapsed + if remaining > 0: + time.sleep(remaining) + self._last_request_at = time.monotonic() + + def get_json( + self, + url: str, + *, + params: dict[str, Any] | None = None, + headers: dict[str, str] | None = None, + ) -> Any: + merged_headers = {"User-Agent": self.user_agent} + if headers: + merged_headers.update(headers) + + attempts = self.retries + 1 + for attempt in range(1, attempts + 1): + try: + self._pace() + response = self.session.get( + url, + params=params, + headers=merged_headers, + timeout=self.timeout_seconds, + ) + if response.status_code in self.RETRYABLE_STATUS_CODES: + if attempt < attempts: + logger.warning( + "extractor_http_retryable_status status=%s url=%s attempt=%s/%s", + response.status_code, + url, + attempt, + attempts, + ) + time.sleep(self.retry_sleep_seconds) + continue + raise ExtractorFetchError( + f"Retryable status exhausted: status={response.status_code} url={url}" + ) + + response.raise_for_status() + return response.json() + except requests.RequestException as exc: + if attempt < attempts: + logger.warning( + "extractor_http_request_retry error=%s url=%s attempt=%s/%s", + exc, + url, + attempt, + attempts, + ) + time.sleep(self.retry_sleep_seconds) + continue + raise ExtractorFetchError(f"Request failed after retries: {exc}") from exc + except ValueError as exc: + raise ExtractorFetchError(f"Invalid JSON response from {url}: {exc}") from exc + + raise ExtractorFetchError(f"Unexpected retry loop exit for {url}") diff --git a/apps/ingestion/extractors/public_json.py 
b/apps/ingestion/extractors/public_json.py new file mode 100644 index 0000000..acb973d --- /dev/null +++ b/apps/ingestion/extractors/public_json.py @@ -0,0 +1,130 @@ +from __future__ import annotations + +from typing import Any + +from django.conf import settings + +from .base import ( + BaseSnapshotExtractor, + ExtractorConfigError, + ExtractorNormalizationError, + ExtractorParseError, +) +from .http import ResponsibleHttpClient + + +def _first_non_empty(record: dict[str, Any], *keys: str) -> Any: + for key in keys: + if key in record and record[key] not in (None, ""): + return record[key] + return None + + +class PublicJsonSnapshotExtractor(BaseSnapshotExtractor): + """ + Generic public JSON extractor for MVP v2. + + This extractor is intentionally generic and lightweight: + - fetch from one public JSON endpoint + - parse list-like payloads + - normalize into HoopScout snapshot schema + """ + + extractor_name = "public_json_snapshot" + + def __init__( + self, + *, + url: str | None = None, + source_name: str | None = None, + include_raw_payload: bool | None = None, + http_client: ResponsibleHttpClient | None = None, + ): + self.url = (url or settings.EXTRACTOR_PUBLIC_JSON_URL).strip() + self.source_name = (source_name or settings.EXTRACTOR_PUBLIC_SOURCE_NAME).strip() + self.include_raw_payload = ( + settings.EXTRACTOR_INCLUDE_RAW_PAYLOAD if include_raw_payload is None else include_raw_payload + ) + if not self.url: + raise ExtractorConfigError("EXTRACTOR_PUBLIC_JSON_URL is required for public_json_snapshot extractor.") + if not self.source_name: + raise ExtractorConfigError("EXTRACTOR_PUBLIC_SOURCE_NAME must not be empty.") + + self.http_client = http_client or ResponsibleHttpClient( + user_agent=settings.EXTRACTOR_USER_AGENT, + timeout_seconds=settings.EXTRACTOR_HTTP_TIMEOUT_SECONDS, + retries=settings.EXTRACTOR_HTTP_RETRIES, + retry_sleep_seconds=settings.EXTRACTOR_RETRY_SLEEP_SECONDS, + request_delay_seconds=settings.EXTRACTOR_REQUEST_DELAY_SECONDS, + ) + + 
def fetch(self) -> Any: + return self.http_client.get_json(self.url) + + def parse(self, payload: Any) -> list[dict[str, Any]]: + if isinstance(payload, list): + return payload + if not isinstance(payload, dict): + raise ExtractorParseError("Fetched payload must be a JSON object or array.") + + rows = payload.get("records") + if isinstance(rows, list): + return rows + + data_rows = payload.get("data") + if isinstance(data_rows, list): + return data_rows + + raise ExtractorParseError("Payload must contain 'records' or 'data' list.") + + def normalize_record(self, source_record: dict[str, Any]) -> dict[str, Any]: + normalized = { + "competition_external_id": _first_non_empty( + source_record, "competition_external_id", "competition_id", "league_id" + ), + "competition_name": _first_non_empty( + source_record, "competition_name", "competition", "league_name" + ), + "season": _first_non_empty(source_record, "season", "season_label", "season_name"), + "team_external_id": _first_non_empty(source_record, "team_external_id", "team_id"), + "team_name": _first_non_empty(source_record, "team_name", "team"), + "player_external_id": _first_non_empty(source_record, "player_external_id", "player_id"), + "full_name": _first_non_empty(source_record, "full_name", "player_name", "name"), + "first_name": _first_non_empty(source_record, "first_name"), + "last_name": _first_non_empty(source_record, "last_name"), + "birth_date": _first_non_empty(source_record, "birth_date"), + "nationality": _first_non_empty(source_record, "nationality", "nationality_code"), + "height_cm": _first_non_empty(source_record, "height_cm"), + "weight_kg": _first_non_empty(source_record, "weight_kg"), + "position": _first_non_empty(source_record, "position"), + "role": _first_non_empty(source_record, "role"), + "games_played": _first_non_empty(source_record, "games_played", "gp"), + "minutes_per_game": _first_non_empty(source_record, "minutes_per_game", "mpg"), + "points_per_game": 
_first_non_empty(source_record, "points_per_game", "ppg"), + "rebounds_per_game": _first_non_empty(source_record, "rebounds_per_game", "rpg"), + "assists_per_game": _first_non_empty(source_record, "assists_per_game", "apg"), + "steals_per_game": _first_non_empty(source_record, "steals_per_game", "spg"), + "blocks_per_game": _first_non_empty(source_record, "blocks_per_game", "bpg"), + "turnovers_per_game": _first_non_empty(source_record, "turnovers_per_game", "tov"), + "fg_pct": _first_non_empty(source_record, "fg_pct"), + "three_pt_pct": _first_non_empty( + source_record, "three_pt_pct", "three_point_pct", "three_pct", "3p_pct" + ), + "ft_pct": _first_non_empty(source_record, "ft_pct"), + } + + missing = [key for key, value in normalized.items() if key != "role" and value in (None, "")] + if missing: + raise ExtractorNormalizationError( + f"public_json_snapshot row missing required fields: {', '.join(sorted(missing))}" + ) + + normalized["season"] = str(normalized["season"]).strip() + normalized["competition_external_id"] = str(normalized["competition_external_id"]).strip() + normalized["team_external_id"] = str(normalized["team_external_id"]).strip() + normalized["player_external_id"] = str(normalized["player_external_id"]).strip() + + if self.include_raw_payload: + normalized["raw_payload"] = source_record + + return normalized diff --git a/apps/ingestion/extractors/registry.py b/apps/ingestion/extractors/registry.py new file mode 100644 index 0000000..bd960dc --- /dev/null +++ b/apps/ingestion/extractors/registry.py @@ -0,0 +1,22 @@ +from __future__ import annotations + +from .base import BaseSnapshotExtractor, ExtractorConfigError +from .public_json import PublicJsonSnapshotExtractor + +EXTRACTOR_REGISTRY: dict[str, type[BaseSnapshotExtractor]] = { + PublicJsonSnapshotExtractor.extractor_name: PublicJsonSnapshotExtractor, +} + + +def available_extractors() -> list[str]: + return sorted(EXTRACTOR_REGISTRY.keys()) + + +def create_extractor(extractor_name: str) -> 
from __future__ import annotations

from django.core.management.base import BaseCommand, CommandError
from django.utils.dateparse import parse_date

from apps.ingestion.extractors import ExtractorError, available_extractors, create_extractor


class Command(BaseCommand):
    """Run one registered snapshot extractor and report the emitted file."""

    help = "Run a snapshot extractor and emit importable JSON snapshots."

    def add_arguments(self, parser):
        # Restricting choices gives argparse-level validation and a helpful
        # error listing the registered extractors.
        parser.add_argument("extractor_name", choices=available_extractors())
        parser.add_argument(
            "--output-path",
            dest="output_path",
            default=None,
            help="Directory or .json file path where snapshot should be written. Defaults to incoming dir.",
        )
        parser.add_argument(
            "--snapshot-date",
            dest="snapshot_date",
            default=None,
            help="Override snapshot date in YYYY-MM-DD format.",
        )
        parser.add_argument(
            "--dry-run",
            action="store_true",
            help="Validate extraction/normalization without writing snapshot file.",
        )
        parser.add_argument(
            "--indent",
            type=int,
            default=2,
            help="JSON indent level for emitted snapshots.",
        )

    def handle(self, *args, **options):
        snapshot_date = None
        if options["snapshot_date"]:
            # parse_date returns None for malformed input but RAISES
            # ValueError for well-formatted yet impossible dates (e.g.
            # 2026-02-30); surface both as a friendly CommandError instead
            # of an unhandled traceback.
            try:
                snapshot_date = parse_date(options["snapshot_date"])
            except ValueError as exc:
                raise CommandError("--snapshot-date must be YYYY-MM-DD.") from exc
            if snapshot_date is None:
                raise CommandError("--snapshot-date must be YYYY-MM-DD.")

        try:
            extractor = create_extractor(options["extractor_name"])
            result = extractor.run(
                output_path=options["output_path"],
                snapshot_date=snapshot_date,
                write_output=not options["dry_run"],
                indent=options["indent"],
            )
        except ExtractorError as exc:
            # All framework failures (config/fetch/parse/normalize) surface
            # as a clean command error.
            raise CommandError(str(exc)) from exc

        output_path = str(result.output_path) if result.output_path else ""
        self.stdout.write(
            self.style.SUCCESS(
                f"Extractor {result.extractor_name} completed: "
                f"source={result.source_name} date={result.snapshot_date} "
                f"records={result.records_count} output={output_path}"
            )
        )
# Identifies the scraper politely to public endpoints.
EXTRACTOR_USER_AGENT = os.getenv("EXTRACTOR_USER_AGENT", "HoopScoutBot/2.0 (+https://younerd.org)")
# Per-request timeout handed to the HTTP client (seconds).
EXTRACTOR_HTTP_TIMEOUT_SECONDS = float(os.getenv("EXTRACTOR_HTTP_TIMEOUT_SECONDS", "15"))
# Extra attempts after the first request (0 disables retries).
EXTRACTOR_HTTP_RETRIES = int(os.getenv("EXTRACTOR_HTTP_RETRIES", "2"))
# Sleep between retry attempts (seconds).
EXTRACTOR_RETRY_SLEEP_SECONDS = float(os.getenv("EXTRACTOR_RETRY_SLEEP_SECONDS", "1.0"))
# Minimum spacing enforced between consecutive requests (seconds).
EXTRACTOR_REQUEST_DELAY_SECONDS = float(os.getenv("EXTRACTOR_REQUEST_DELAY_SECONDS", "0.5"))
# Public JSON feed consumed by the public_json_snapshot extractor
# (empty string = unconfigured; the extractor raises at construction time).
EXTRACTOR_PUBLIC_JSON_URL = os.getenv("EXTRACTOR_PUBLIC_JSON_URL", "").strip()
# source_name stamped into snapshots emitted by the public JSON extractor.
EXTRACTOR_PUBLIC_SOURCE_NAME = os.getenv("EXTRACTOR_PUBLIC_SOURCE_NAME", "public_json_source").strip()
# When true, each normalized record keeps the original source row under "raw_payload".
EXTRACTOR_INCLUDE_RAW_PAYLOAD = env_bool("EXTRACTOR_INCLUDE_RAW_PAYLOAD", False)

# Fail fast at startup on nonsensical numeric settings rather than at
# extraction time.
if EXTRACTOR_HTTP_TIMEOUT_SECONDS <= 0:
    raise ImproperlyConfigured("EXTRACTOR_HTTP_TIMEOUT_SECONDS must be > 0.")
if EXTRACTOR_HTTP_RETRIES < 0:
    raise ImproperlyConfigured("EXTRACTOR_HTTP_RETRIES must be >= 0.")
if EXTRACTOR_RETRY_SLEEP_SECONDS < 0:
    raise ImproperlyConfigured("EXTRACTOR_RETRY_SLEEP_SECONDS must be >= 0.")
if EXTRACTOR_REQUEST_DELAY_SECONDS < 0:
    raise ImproperlyConfigured("EXTRACTOR_REQUEST_DELAY_SECONDS must be >= 0.")
SCHEDULER_ENABLED = env_bool("SCHEDULER_ENABLED", False) SCHEDULER_INTERVAL_SECONDS = int(os.getenv("SCHEDULER_INTERVAL_SECONDS", "900")) diff --git a/tests/test_extractors_framework.py b/tests/test_extractors_framework.py new file mode 100644 index 0000000..cb0dd74 --- /dev/null +++ b/tests/test_extractors_framework.py @@ -0,0 +1,222 @@ +from __future__ import annotations + +import json +from datetime import date + +import pytest +from django.core.management import call_command + +from apps.ingestion.extractors.base import BaseSnapshotExtractor +from apps.ingestion.extractors.http import ResponsibleHttpClient +from apps.ingestion.extractors.public_json import PublicJsonSnapshotExtractor + + +class DummyExtractor(BaseSnapshotExtractor): + extractor_name = "dummy" + source_name = "dummy_source" + + def fetch(self): + return {"rows": [{"name": "Jane Doe"}]} + + def parse(self, payload): + return payload["rows"] + + def normalize_record(self, source_record): + return { + "competition_external_id": "comp-1", + "competition_name": "League One", + "season": "2025-2026", + "team_external_id": "team-1", + "team_name": "Team One", + "player_external_id": "player-1", + "full_name": source_record["name"], + "first_name": "Jane", + "last_name": "Doe", + "birth_date": "2000-01-01", + "nationality": "US", + "height_cm": 180, + "weight_kg": 75, + "position": "SG", + "games_played": 10, + "minutes_per_game": 30.0, + "points_per_game": 15.0, + "rebounds_per_game": 4.0, + "assists_per_game": 3.0, + "steals_per_game": 1.2, + "blocks_per_game": 0.4, + "turnovers_per_game": 2.0, + "fg_pct": 45.0, + "three_pt_pct": 35.0, + "ft_pct": 82.0, + } + + +class _FakeResponse: + def __init__(self, payload, status_code=200): + self._payload = payload + self.status_code = status_code + + def raise_for_status(self): + if self.status_code >= 400: + raise RuntimeError(f"status={self.status_code}") + + def json(self): + return self._payload + + +@pytest.mark.django_db +def 
test_base_extractor_run_writes_snapshot_file(tmp_path, settings): + settings.STATIC_DATASET_INCOMING_DIR = str(tmp_path / "incoming") + extractor = DummyExtractor() + result = extractor.run(snapshot_date=date(2026, 3, 13)) + + assert result.records_count == 1 + assert result.source_name == "dummy_source" + assert result.output_path is not None + assert result.output_path.exists() + + payload = json.loads(result.output_path.read_text(encoding="utf-8")) + assert payload["source_name"] == "dummy_source" + assert payload["snapshot_date"] == "2026-03-13" + assert payload["records"][0]["full_name"] == "Jane Doe" + + +@pytest.mark.django_db +def test_public_json_extractor_normalizes_common_field_aliases(tmp_path): + class FakeClient: + def get_json(self, *_args, **_kwargs): + return { + "records": [ + { + "competition_id": 99, + "competition_name": "National League", + "season": 2025, + "team_id": 10, + "team_name": "Blue Team", + "player_id": 123, + "player_name": "John Smith", + "first_name": "John", + "last_name": "Smith", + "birth_date": "2001-05-12", + "nationality": "US", + "height_cm": 198, + "weight_kg": 96, + "position": "SF", + "gp": 20, + "mpg": 28.5, + "ppg": 14.2, + "rpg": 5.1, + "apg": 3.2, + "spg": 1.1, + "bpg": 0.5, + "tov": 1.9, + "fg_pct": 47.3, + "three_pct": 36.1, + "ft_pct": 80.0, + } + ] + } + + extractor = PublicJsonSnapshotExtractor( + url="https://example.com/public-feed.json", + source_name="test_public_feed", + http_client=FakeClient(), + ) + output_file = tmp_path / "public.json" + result = extractor.run(output_path=output_file, snapshot_date=date(2026, 3, 13)) + + assert result.records_count == 1 + payload = json.loads(output_file.read_text(encoding="utf-8")) + row = payload["records"][0] + assert row["competition_external_id"] == "99" + assert row["team_external_id"] == "10" + assert row["player_external_id"] == "123" + assert row["full_name"] == "John Smith" + assert row["three_pt_pct"] == 36.1 + + +@pytest.mark.django_db +def 
test_run_extractor_management_command_writes_snapshot(tmp_path, settings): + settings.EXTRACTOR_PUBLIC_JSON_URL = "https://example.com/feed.json" + settings.EXTRACTOR_PUBLIC_SOURCE_NAME = "cmd_test_source" + output_dir = tmp_path / "snapshots" + + class FakeClient: + def get_json(self, *_args, **_kwargs): + return { + "records": [ + { + "competition_external_id": "comp-a", + "competition_name": "Alpha League", + "season": "2025-2026", + "team_external_id": "team-a", + "team_name": "Alpha Team", + "player_external_id": "player-a", + "full_name": "Alpha Player", + "first_name": "Alpha", + "last_name": "Player", + "birth_date": "2000-04-01", + "nationality": "US", + "height_cm": 190, + "weight_kg": 88, + "position": "PG", + "games_played": 12, + "minutes_per_game": 31.0, + "points_per_game": 17.0, + "rebounds_per_game": 4.0, + "assists_per_game": 6.0, + "steals_per_game": 1.3, + "blocks_per_game": 0.1, + "turnovers_per_game": 2.4, + "fg_pct": 44.0, + "three_pt_pct": 37.0, + "ft_pct": 79.0, + } + ] + } + + monkeypatch = pytest.MonkeyPatch() + monkeypatch.setattr( + "apps.ingestion.extractors.public_json.ResponsibleHttpClient", + lambda **_kwargs: FakeClient(), + ) + try: + call_command( + "run_extractor", + "public_json_snapshot", + "--output-path", + str(output_dir), + "--snapshot-date", + "2026-03-13", + ) + finally: + monkeypatch.undo() + + files = list(output_dir.glob("public_json_snapshot-2026-03-13.json")) + assert len(files) == 1 + payload = json.loads(files[0].read_text(encoding="utf-8")) + assert payload["source_name"] == "cmd_test_source" + assert payload["records"][0]["full_name"] == "Alpha Player" + + +def test_http_client_retries_on_retryable_status(monkeypatch): + class FakeSession: + def __init__(self): + self.calls = 0 + + def get(self, *_args, **_kwargs): + self.calls += 1 + if self.calls == 1: + return _FakeResponse({"error": "busy"}, status_code=429) + return _FakeResponse({"records": []}, status_code=200) + + client = ResponsibleHttpClient( + 
user_agent="test-agent", + timeout_seconds=5, + retries=1, + retry_sleep_seconds=0, + request_delay_seconds=0, + session=FakeSession(), + ) + payload = client.get_json("https://example.com/feed.json") + assert payload == {"records": []}