feat(v2): add snapshot extractor framework and run command
This commit is contained in:
22
apps/ingestion/extractors/__init__.py
Normal file
22
apps/ingestion/extractors/__init__.py
Normal file
@@ -0,0 +1,22 @@
|
||||
"""Public surface of the snapshot extractor framework.

Re-exports the extractor base class, the error hierarchy, and the registry
helpers so callers can import everything from ``apps.ingestion.extractors``.
"""
from .base import (
    BaseSnapshotExtractor,
    ExtractionResult,
    ExtractorConfigError,
    ExtractorError,
    ExtractorFetchError,
    ExtractorNormalizationError,
    ExtractorParseError,
)
from .registry import available_extractors, create_extractor

__all__ = [
    "BaseSnapshotExtractor",
    "ExtractionResult",
    "ExtractorError",
    "ExtractorConfigError",
    "ExtractorFetchError",
    "ExtractorParseError",
    "ExtractorNormalizationError",
    "available_extractors",
    "create_extractor",
]
|
||||
150
apps/ingestion/extractors/base.py
Normal file
150
apps/ingestion/extractors/base.py
Normal file
@@ -0,0 +1,150 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from datetime import date
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
from apps.ingestion.snapshots import SnapshotSchemaValidator
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Exception hierarchy: every extractor failure derives from ExtractorError,
# so callers (e.g. the run_extractor management command) can catch one type.
class ExtractorError(RuntimeError):
    """Base extractor exception."""


class ExtractorConfigError(ExtractorError):
    """Raised when extractor configuration is invalid."""


class ExtractorFetchError(ExtractorError):
    """Raised when remote/source fetch fails."""


class ExtractorParseError(ExtractorError):
    """Raised when fetched content cannot be parsed."""


class ExtractorNormalizationError(ExtractorError):
    """Raised when source rows cannot be normalized."""
|
||||
|
||||
|
||||
@dataclass
class ExtractionResult:
    """Summary of a single extractor run, returned by BaseSnapshotExtractor.run."""

    # Registry key of the extractor that produced this result.
    extractor_name: str
    # Logical source identifier recorded in the snapshot (from the validator).
    source_name: str
    # Date the snapshot represents.
    snapshot_date: date
    # Number of validated records in the snapshot.
    records_count: int
    # Path of the written snapshot file, or None when write_output was False.
    output_path: Path | None
|
||||
|
||||
|
||||
class BaseSnapshotExtractor(ABC):
    """Template-method base class for snapshot extractors.

    Subclasses implement ``fetch``, ``parse`` and ``normalize_record``;
    ``run`` orchestrates fetch -> parse -> normalize -> schema-validate ->
    emit, and returns an :class:`ExtractionResult`.
    """

    # Subclasses override both identifiers.
    extractor_name = "base"
    source_name = "unknown_source"

    @abstractmethod
    def fetch(self) -> Any:
        """Fetch source payload from a source endpoint/resource."""

    @abstractmethod
    def parse(self, payload: Any) -> list[dict[str, Any]]:
        """Parse fetched payload into source-specific record dictionaries."""

    @abstractmethod
    def normalize_record(self, source_record: dict[str, Any]) -> dict[str, Any]:
        """Normalize a source record into HoopScout snapshot record shape."""

    def resolve_snapshot_date(self) -> date:
        """Return the default snapshot date (today, local time).

        Subclasses may override, e.g. to use a date carried in the payload.
        """
        return date.today()

    def normalize_records(self, source_records: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """Normalize every parsed row via ``normalize_record``.

        Raises:
            ExtractorNormalizationError: if any row is not a dict; the message
                includes the row index for easier debugging.
        """
        normalized: list[dict[str, Any]] = []
        for idx, row in enumerate(source_records):
            if not isinstance(row, dict):
                raise ExtractorNormalizationError(f"Parsed record at index {idx} must be an object.")
            normalized.append(self.normalize_record(row))
        return normalized

    def build_snapshot(self, records: list[dict[str, Any]], snapshot_date: date) -> dict[str, Any]:
        """Assemble the snapshot envelope expected by the schema validator."""
        return {
            "source_name": self.source_name,
            "snapshot_date": snapshot_date.isoformat(),
            "records": records,
        }

    def default_output_dir(self) -> Path:
        """Directory used when no explicit output path is given."""
        return Path(settings.STATIC_DATASET_INCOMING_DIR)

    def snapshot_filename(self, snapshot_date: date) -> str:
        """Canonical snapshot file name: ``<extractor_name>-<YYYY-MM-DD>.json``."""
        return f"{self.extractor_name}-{snapshot_date.isoformat()}.json"

    def emit_snapshot(
        self,
        snapshot: dict[str, Any],
        *,
        output_path: str | Path | None = None,
        indent: int = 2,
    ) -> Path:
        """Write *snapshot* as JSON and return the written file path.

        ``output_path`` may be a ``.json`` file path, a directory, or ``None``
        (falls back to ``default_output_dir``). Missing directories are
        created as needed.
        """
        # Parse the embedded date once (was parsed twice before); this also
        # fails fast if the snapshot does not carry a valid ISO date string.
        snapshot_day = date.fromisoformat(snapshot["snapshot_date"])
        if output_path is None:
            destination = self.default_output_dir()
            destination.mkdir(parents=True, exist_ok=True)
            file_path = destination / self.snapshot_filename(snapshot_day)
        else:
            target = Path(output_path)
            if target.suffix.lower() == ".json":
                # Explicit file target: ensure its parent exists.
                target.parent.mkdir(parents=True, exist_ok=True)
                file_path = target
            else:
                # Directory target: derive the canonical file name inside it.
                target.mkdir(parents=True, exist_ok=True)
                file_path = target / self.snapshot_filename(snapshot_day)

        file_path.write_text(json.dumps(snapshot, indent=indent, ensure_ascii=True), encoding="utf-8")
        return file_path

    def run(
        self,
        *,
        output_path: str | Path | None = None,
        snapshot_date: date | None = None,
        write_output: bool = True,
        indent: int = 2,
    ) -> ExtractionResult:
        """Execute the full extraction pipeline.

        Fetches, parses and normalizes the source rows, validates the
        assembled snapshot against the snapshot schema, optionally writes it
        to disk, and returns an ExtractionResult describing the run.
        """
        payload = self.fetch()
        source_rows = self.parse(payload)
        normalized_rows = self.normalize_records(source_rows)
        resolved_snapshot_date = snapshot_date or self.resolve_snapshot_date()
        snapshot = self.build_snapshot(normalized_rows, resolved_snapshot_date)
        validated = SnapshotSchemaValidator.validate(snapshot)
        # Keep the validator's (possibly coerced) records in the emitted file.
        snapshot["records"] = validated.records

        output_file: Path | None = None
        if write_output:
            output_file = self.emit_snapshot(snapshot, output_path=output_path, indent=indent)
            logger.info(
                "extractor_snapshot_written extractor=%s source=%s records=%s path=%s",
                self.extractor_name,
                validated.source_name,
                len(validated.records),
                output_file,
            )
        else:
            logger.info(
                "extractor_snapshot_validated extractor=%s source=%s records=%s write_output=0",
                self.extractor_name,
                validated.source_name,
                len(validated.records),
            )

        return ExtractionResult(
            extractor_name=self.extractor_name,
            source_name=validated.source_name,
            snapshot_date=validated.snapshot_date,
            records_count=len(validated.records),
            output_path=output_file,
        )
|
||||
109
apps/ingestion/extractors/http.py
Normal file
109
apps/ingestion/extractors/http.py
Normal file
@@ -0,0 +1,109 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import time
|
||||
from typing import Any
|
||||
|
||||
import requests
|
||||
|
||||
from .base import ExtractorFetchError
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ResponsibleHttpClient:
    """
    Small HTTP helper for public-source extraction:
    - explicit User-Agent
    - request timeout
    - conservative retries
    - low-frequency pacing (fixed delay between requests)
    """

    RETRYABLE_STATUS_CODES = {429, 500, 502, 503, 504}

    def __init__(
        self,
        *,
        user_agent: str,
        timeout_seconds: float,
        retries: int,
        retry_sleep_seconds: float,
        request_delay_seconds: float,
        session: requests.Session | None = None,
    ):
        # Store the politeness knobs as-is; a caller-supplied session wins
        # over a freshly created one (useful for tests and connection reuse).
        self.user_agent = user_agent
        self.timeout_seconds = timeout_seconds
        self.retries = retries
        self.retry_sleep_seconds = retry_sleep_seconds
        self.request_delay_seconds = request_delay_seconds
        self.session = session or requests.Session()
        self._last_request_at: float | None = None

    def _pace(self) -> None:
        """Sleep just long enough to keep request_delay_seconds between calls."""
        if self.request_delay_seconds <= 0:
            return
        current = time.monotonic()
        previous = self._last_request_at
        if previous is None:
            # First request: nothing to wait for, just record the timestamp.
            self._last_request_at = current
            return
        wait_for = self.request_delay_seconds - (current - previous)
        if wait_for > 0:
            time.sleep(wait_for)
        self._last_request_at = time.monotonic()

    def get_json(
        self,
        url: str,
        *,
        params: dict[str, Any] | None = None,
        headers: dict[str, str] | None = None,
    ) -> Any:
        """Issue a paced GET and decode the JSON body.

        Retries on transport errors and on RETRYABLE_STATUS_CODES with a
        fixed sleep between attempts; raises ExtractorFetchError once the
        retry budget is exhausted or the body is not valid JSON.
        """
        # Caller headers may override the default User-Agent.
        request_headers: dict[str, str] = {"User-Agent": self.user_agent, **(headers or {})}

        max_attempts = self.retries + 1
        attempt = 0
        while attempt < max_attempts:
            attempt += 1
            try:
                self._pace()
                response = self.session.get(
                    url,
                    params=params,
                    headers=request_headers,
                    timeout=self.timeout_seconds,
                )
                if response.status_code in self.RETRYABLE_STATUS_CODES:
                    if attempt < max_attempts:
                        logger.warning(
                            "extractor_http_retryable_status status=%s url=%s attempt=%s/%s",
                            response.status_code,
                            url,
                            attempt,
                            max_attempts,
                        )
                        time.sleep(self.retry_sleep_seconds)
                        continue
                    raise ExtractorFetchError(
                        f"Retryable status exhausted: status={response.status_code} url={url}"
                    )

                response.raise_for_status()
                return response.json()
            except requests.RequestException as exc:
                if attempt < max_attempts:
                    logger.warning(
                        "extractor_http_request_retry error=%s url=%s attempt=%s/%s",
                        exc,
                        url,
                        attempt,
                        max_attempts,
                    )
                    time.sleep(self.retry_sleep_seconds)
                    continue
                raise ExtractorFetchError(f"Request failed after retries: {exc}") from exc
            except ValueError as exc:
                # response.json() raises a ValueError subclass on bad JSON.
                raise ExtractorFetchError(f"Invalid JSON response from {url}: {exc}") from exc

        raise ExtractorFetchError(f"Unexpected retry loop exit for {url}")
|
||||
130
apps/ingestion/extractors/public_json.py
Normal file
130
apps/ingestion/extractors/public_json.py
Normal file
@@ -0,0 +1,130 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
from .base import (
|
||||
BaseSnapshotExtractor,
|
||||
ExtractorConfigError,
|
||||
ExtractorNormalizationError,
|
||||
ExtractorParseError,
|
||||
)
|
||||
from .http import ResponsibleHttpClient
|
||||
|
||||
|
||||
def _first_non_empty(record: dict[str, Any], *keys: str) -> Any:
|
||||
for key in keys:
|
||||
if key in record and record[key] not in (None, ""):
|
||||
return record[key]
|
||||
return None
|
||||
|
||||
|
||||
class PublicJsonSnapshotExtractor(BaseSnapshotExtractor):
    """
    Generic public JSON extractor for MVP v2.

    This extractor is intentionally generic and lightweight:
    - fetch from one public JSON endpoint
    - parse list-like payloads
    - normalize into HoopScout snapshot schema
    """

    extractor_name = "public_json_snapshot"

    def __init__(
        self,
        *,
        url: str | None = None,
        source_name: str | None = None,
        include_raw_payload: bool | None = None,
        http_client: ResponsibleHttpClient | None = None,
    ):
        """Build the extractor; explicit arguments win over Django settings.

        Raises:
            ExtractorConfigError: if the resolved URL or source name is empty.
        """
        self.url = (url or settings.EXTRACTOR_PUBLIC_JSON_URL).strip()
        self.source_name = (source_name or settings.EXTRACTOR_PUBLIC_SOURCE_NAME).strip()
        # include_raw_payload=None means "defer to settings"; an explicit
        # bool always wins (so False is respected, not treated as unset).
        self.include_raw_payload = (
            settings.EXTRACTOR_INCLUDE_RAW_PAYLOAD if include_raw_payload is None else include_raw_payload
        )
        if not self.url:
            raise ExtractorConfigError("EXTRACTOR_PUBLIC_JSON_URL is required for public_json_snapshot extractor.")
        if not self.source_name:
            raise ExtractorConfigError("EXTRACTOR_PUBLIC_SOURCE_NAME must not be empty.")

        # An injected client is used as-is (tests); otherwise build one from
        # the politeness settings.
        self.http_client = http_client or ResponsibleHttpClient(
            user_agent=settings.EXTRACTOR_USER_AGENT,
            timeout_seconds=settings.EXTRACTOR_HTTP_TIMEOUT_SECONDS,
            retries=settings.EXTRACTOR_HTTP_RETRIES,
            retry_sleep_seconds=settings.EXTRACTOR_RETRY_SLEEP_SECONDS,
            request_delay_seconds=settings.EXTRACTOR_REQUEST_DELAY_SECONDS,
        )

    def fetch(self) -> Any:
        """Fetch and decode the configured public JSON endpoint."""
        return self.http_client.get_json(self.url)

    def parse(self, payload: Any) -> list[dict[str, Any]]:
        """Extract the row list from the payload.

        Accepts either a bare JSON array, or an object with a list under
        'records' or (as fallback) 'data'.

        Raises:
            ExtractorParseError: if no list of rows can be located.
        """
        if isinstance(payload, list):
            return payload
        if not isinstance(payload, dict):
            raise ExtractorParseError("Fetched payload must be a JSON object or array.")

        rows = payload.get("records")
        if isinstance(rows, list):
            return rows

        data_rows = payload.get("data")
        if isinstance(data_rows, list):
            return data_rows

        raise ExtractorParseError("Payload must contain 'records' or 'data' list.")

    def normalize_record(self, source_record: dict[str, Any]) -> dict[str, Any]:
        """Map one source row onto the snapshot record shape.

        Each target field is filled from the first matching alias present in
        the source row. Every field except 'role' is required; external ids
        and season are coerced to stripped strings.

        Raises:
            ExtractorNormalizationError: listing all missing required fields.
        """
        # Alias order matters: canonical field name first, then common
        # source-side variants.
        normalized = {
            "competition_external_id": _first_non_empty(
                source_record, "competition_external_id", "competition_id", "league_id"
            ),
            "competition_name": _first_non_empty(
                source_record, "competition_name", "competition", "league_name"
            ),
            "season": _first_non_empty(source_record, "season", "season_label", "season_name"),
            "team_external_id": _first_non_empty(source_record, "team_external_id", "team_id"),
            "team_name": _first_non_empty(source_record, "team_name", "team"),
            "player_external_id": _first_non_empty(source_record, "player_external_id", "player_id"),
            "full_name": _first_non_empty(source_record, "full_name", "player_name", "name"),
            "first_name": _first_non_empty(source_record, "first_name"),
            "last_name": _first_non_empty(source_record, "last_name"),
            "birth_date": _first_non_empty(source_record, "birth_date"),
            "nationality": _first_non_empty(source_record, "nationality", "nationality_code"),
            "height_cm": _first_non_empty(source_record, "height_cm"),
            "weight_kg": _first_non_empty(source_record, "weight_kg"),
            "position": _first_non_empty(source_record, "position"),
            "role": _first_non_empty(source_record, "role"),
            "games_played": _first_non_empty(source_record, "games_played", "gp"),
            "minutes_per_game": _first_non_empty(source_record, "minutes_per_game", "mpg"),
            "points_per_game": _first_non_empty(source_record, "points_per_game", "ppg"),
            "rebounds_per_game": _first_non_empty(source_record, "rebounds_per_game", "rpg"),
            "assists_per_game": _first_non_empty(source_record, "assists_per_game", "apg"),
            "steals_per_game": _first_non_empty(source_record, "steals_per_game", "spg"),
            "blocks_per_game": _first_non_empty(source_record, "blocks_per_game", "bpg"),
            "turnovers_per_game": _first_non_empty(source_record, "turnovers_per_game", "tov"),
            "fg_pct": _first_non_empty(source_record, "fg_pct"),
            "three_pt_pct": _first_non_empty(
                source_record, "three_pt_pct", "three_point_pct", "three_pct", "3p_pct"
            ),
            "ft_pct": _first_non_empty(source_record, "ft_pct"),
        }

        # Only 'role' is optional; everything else must have resolved to a
        # non-empty value above.
        missing = [key for key, value in normalized.items() if key != "role" and value in (None, "")]
        if missing:
            raise ExtractorNormalizationError(
                f"public_json_snapshot row missing required fields: {', '.join(sorted(missing))}"
            )

        # Coerce identifiers/season to stripped strings for stable matching.
        normalized["season"] = str(normalized["season"]).strip()
        normalized["competition_external_id"] = str(normalized["competition_external_id"]).strip()
        normalized["team_external_id"] = str(normalized["team_external_id"]).strip()
        normalized["player_external_id"] = str(normalized["player_external_id"]).strip()

        if self.include_raw_payload:
            # Keep the original row for debugging/auditability when enabled.
            normalized["raw_payload"] = source_record

        return normalized
|
||||
22
apps/ingestion/extractors/registry.py
Normal file
22
apps/ingestion/extractors/registry.py
Normal file
@@ -0,0 +1,22 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from .base import BaseSnapshotExtractor, ExtractorConfigError
|
||||
from .public_json import PublicJsonSnapshotExtractor
|
||||
|
||||
# Registry mapping extractor_name -> extractor class; the keys are exposed
# via available_extractors() and drive the run_extractor command choices.
EXTRACTOR_REGISTRY: dict[str, type[BaseSnapshotExtractor]] = {
    PublicJsonSnapshotExtractor.extractor_name: PublicJsonSnapshotExtractor,
}
|
||||
|
||||
|
||||
def available_extractors() -> list[str]:
    """Return all registered extractor names, sorted alphabetically."""
    return sorted(EXTRACTOR_REGISTRY)
|
||||
|
||||
|
||||
def create_extractor(extractor_name: str) -> BaseSnapshotExtractor:
    """Instantiate the registered extractor class for *extractor_name*.

    Raises:
        ExtractorConfigError: (chained from KeyError) for unknown names,
            listing the valid choices.
    """
    try:
        registered_cls = EXTRACTOR_REGISTRY[extractor_name]
    except KeyError as exc:
        known = ", ".join(available_extractors())
        raise ExtractorConfigError(
            f"Unknown extractor '{extractor_name}'. Available: {known}"
        ) from exc
    # Instantiate outside the try so errors raised by the constructor are
    # never mistaken for a registry miss.
    return registered_cls()
|
||||
63
apps/ingestion/management/commands/run_extractor.py
Normal file
63
apps/ingestion/management/commands/run_extractor.py
Normal file
@@ -0,0 +1,63 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from django.core.management.base import BaseCommand, CommandError
|
||||
from django.utils.dateparse import parse_date
|
||||
|
||||
from apps.ingestion.extractors import ExtractorError, available_extractors, create_extractor
|
||||
|
||||
|
||||
class Command(BaseCommand):
    """Management command that runs a registered snapshot extractor."""

    help = "Run a snapshot extractor and emit importable JSON snapshots."

    def add_arguments(self, parser):
        """Register CLI arguments: extractor choice, output, date, dry-run, indent."""
        parser.add_argument("extractor_name", choices=available_extractors())
        parser.add_argument(
            "--output-path",
            dest="output_path",
            default=None,
            help="Directory or .json file path where snapshot should be written. Defaults to incoming dir.",
        )
        parser.add_argument(
            "--snapshot-date",
            dest="snapshot_date",
            default=None,
            help="Override snapshot date in YYYY-MM-DD format.",
        )
        parser.add_argument(
            "--dry-run",
            action="store_true",
            help="Validate extraction/normalization without writing snapshot file.",
        )
        parser.add_argument(
            "--indent",
            type=int,
            default=2,
            help="JSON indent level for emitted snapshots.",
        )

    def handle(self, *args, **options):
        """Run the selected extractor, mapping extractor failures to CommandError."""
        snapshot_date = None
        if options["snapshot_date"]:
            try:
                # parse_date returns None for malformed input, but raises
                # ValueError for well-formed yet invalid dates (e.g.
                # 2024-02-30) -- surface both as the same usage error.
                snapshot_date = parse_date(options["snapshot_date"])
            except ValueError as exc:
                raise CommandError("--snapshot-date must be YYYY-MM-DD.") from exc
            if snapshot_date is None:
                raise CommandError("--snapshot-date must be YYYY-MM-DD.")

        try:
            extractor = create_extractor(options["extractor_name"])
            result = extractor.run(
                output_path=options["output_path"],
                snapshot_date=snapshot_date,
                write_output=not options["dry_run"],
                indent=options["indent"],
            )
        except ExtractorError as exc:
            raise CommandError(str(exc)) from exc

        output_path = str(result.output_path) if result.output_path else "<dry-run>"
        self.stdout.write(
            self.style.SUCCESS(
                f"Extractor {result.extractor_name} completed: "
                f"source={result.source_name} date={result.snapshot_date} "
                f"records={result.records_count} output={output_path}"
            )
        )
|
||||
Reference in New Issue
Block a user