feat(v2): add BCL snapshot extractor and command

2026-03-13 14:32:21 +01:00
parent 97913c4a79
commit 5df973467d
9 changed files with 357 additions and 1 deletion
--- a/apps/ingestion/extractors/__init__.py
+++ b/apps/ingestion/extractors/__init__.py
@@ -1,3 +1,4 @@
+from .bcl import BCLSnapshotExtractor
 from .base import (
    BaseSnapshotExtractor,
    ExtractionResult,
@@ -12,6 +13,7 @@ from .registry import available_extractors, create_extractor

 __all__ = [
    "BaseSnapshotExtractor",
+    "BCLSnapshotExtractor",
    "LBASnapshotExtractor",
    "ExtractionResult",
    "ExtractorError",
--- a/apps/ingestion/extractors/bcl.py
+++ b/apps/ingestion/extractors/bcl.py
@@ -0,0 +1,137 @@
+from __future__ import annotations
+
+from typing import Any
+
+from django.conf import settings
+
+from .base import BaseSnapshotExtractor, ExtractorConfigError, ExtractorNormalizationError, ExtractorParseError
+from .http import ResponsibleHttpClient
+
+
+def _first_non_empty(record: dict[str, Any], *keys: str) -> Any:
+    """Return the first value in ``record`` under ``keys`` that is neither ``None`` nor ``""``.
+
+    Keys are tried in the order given; ``None`` is returned when none of them qualifies.
+    """
+    candidates = (record.get(name) for name in keys)
+    return next((found for found in candidates if found not in (None, "")), None)
+
+
+class BCLSnapshotExtractor(BaseSnapshotExtractor):
+    """
+    Basketball Champions League MVP extractor.
+
+    Scope is intentionally conservative:
+    - one configured public stats endpoint
+    - one configured season label
+    - normalized player-season rows only
+
+    Configuration is read from Django settings (``EXTRACTOR_BCL_*`` plus the
+    shared ``EXTRACTOR_*`` HTTP settings) when the extractor is constructed.
+    """
+
+    extractor_name = "bcl"
+    source_name = "bcl"
+
+    def __init__(self, *, http_client: ResponsibleHttpClient | None = None):
+        """
+        Load the BCL settings and prepare the HTTP client.
+
+        Args:
+            http_client: Optional pre-built client. When omitted, a
+                ``ResponsibleHttpClient`` is created from the shared
+                ``EXTRACTOR_*`` HTTP settings.
+
+        Raises:
+            ExtractorConfigError: If a required BCL setting is blank after
+                stripping surrounding whitespace.
+        """
+        self.url = settings.EXTRACTOR_BCL_STATS_URL.strip()
+        self.season_label = settings.EXTRACTOR_BCL_SEASON_LABEL.strip()
+        self.competition_external_id = settings.EXTRACTOR_BCL_COMPETITION_EXTERNAL_ID.strip()
+        self.competition_name = settings.EXTRACTOR_BCL_COMPETITION_NAME.strip()
+        self.include_raw_payload = settings.EXTRACTOR_INCLUDE_RAW_PAYLOAD
+        self.http_client = http_client or ResponsibleHttpClient(
+            user_agent=settings.EXTRACTOR_USER_AGENT,
+            timeout_seconds=settings.EXTRACTOR_HTTP_TIMEOUT_SECONDS,
+            retries=settings.EXTRACTOR_HTTP_RETRIES,
+            retry_sleep_seconds=settings.EXTRACTOR_RETRY_SLEEP_SECONDS,
+            request_delay_seconds=settings.EXTRACTOR_REQUEST_DELAY_SECONDS,
+        )
+
+        if not self.url:
+            raise ExtractorConfigError("EXTRACTOR_BCL_STATS_URL is required for bcl extractor.")
+        if not self.season_label:
+            raise ExtractorConfigError("EXTRACTOR_BCL_SEASON_LABEL is required for bcl extractor.")
+        if not self.competition_external_id:
+            raise ExtractorConfigError("EXTRACTOR_BCL_COMPETITION_EXTERNAL_ID is required.")
+        if not self.competition_name:
+            raise ExtractorConfigError("EXTRACTOR_BCL_COMPETITION_NAME is required.")
+
+    def fetch(self) -> Any:
+        """Return whatever ``http_client.get_json`` yields for the configured stats URL."""
+        return self.http_client.get_json(self.url)
+
+    def parse(self, payload: Any) -> list[dict[str, Any]]:
+        """
+        Locate the list of rows inside the fetched payload.
+
+        A top-level list is returned as-is. For an object, the first of
+        ``records``, ``data``, ``players``, ``items`` that holds a list wins.
+
+        Raises:
+            ExtractorParseError: If the payload is neither a list nor an object,
+                or is an object without any of the known list keys.
+        """
+        if isinstance(payload, list):
+            return payload
+        if not isinstance(payload, dict):
+            raise ExtractorParseError("BCL payload must be a JSON object or array.")
+
+        for key in ("records", "data", "players", "items"):
+            rows = payload.get(key)
+            if isinstance(rows, list):
+                return rows
+
+        raise ExtractorParseError("BCL payload must contain one of: records, data, players, items.")
+
+    def normalize_record(self, source_record: dict[str, Any]) -> dict[str, Any]:
+        """
+        Map one source row onto the normalized player-season shape.
+
+        Flat keys on the row take precedence; nested ``player`` / ``team``
+        objects are consulted only when the flat keys are absent or empty
+        (``None`` or ``""``). Competition and season come from settings.
+
+        Args:
+            source_record: One row as returned by :meth:`parse`.
+
+        Returns:
+            The normalized row, plus ``raw_payload`` when
+            ``EXTRACTOR_INCLUDE_RAW_PAYLOAD`` is enabled.
+
+        Raises:
+            ExtractorNormalizationError: If the row is not an object or any
+                field other than ``role`` is missing or empty.
+        """
+        if not isinstance(source_record, dict):
+            raise ExtractorNormalizationError("bcl row must be a JSON object.")
+
+        raw_player = source_record.get("player")
+        raw_team = source_record.get("team")
+        player_obj = raw_player if isinstance(raw_player, dict) else {}
+        team_obj = raw_team if isinstance(raw_team, dict) else {}
+
+        def pick(keys: tuple[str, ...], nested: dict[str, Any], nested_keys: tuple[str, ...]) -> Any:
+            """Return the first non-empty flat value, else the first non-empty nested value."""
+            value = _first_non_empty(source_record, *keys)
+            if value is None:
+                # Explicit None check rather than ``or`` so falsy-but-valid values (e.g. 0) survive.
+                value = _first_non_empty(nested, *nested_keys)
+            return value
+
+        # "team" doubles as a flat name alias, but when it holds the nested team object it must
+        # not be used as the name itself; fall through to the nested object's "name" instead.
+        team_name_keys = ("team_name",) if isinstance(raw_team, dict) else ("team_name", "team")
+
+        normalized = {
+            "competition_external_id": self.competition_external_id,
+            "competition_name": self.competition_name,
+            "season": self.season_label,
+            "team_external_id": pick(("team_external_id", "team_id"), team_obj, ("id", "team_id")),
+            "team_name": pick(team_name_keys, team_obj, ("name",)),
+            "player_external_id": pick(
+                ("player_external_id", "player_id", "athlete_id"), player_obj, ("id", "player_id")
+            ),
+            "full_name": pick(("full_name", "player_name", "name"), player_obj, ("full_name", "name")),
+            "first_name": pick(("first_name",), player_obj, ("first_name",)),
+            "last_name": pick(("last_name",), player_obj, ("last_name",)),
+            "birth_date": pick(("birth_date",), player_obj, ("birth_date", "dob")),
+            "nationality": pick(("nationality",), player_obj, ("nationality", "country")),
+            "height_cm": pick(("height_cm",), player_obj, ("height_cm",)),
+            "weight_kg": pick(("weight_kg",), player_obj, ("weight_kg",)),
+            "position": pick(("position",), player_obj, ("position",)),
+            "role": _first_non_empty(source_record, "role"),
+            "games_played": _first_non_empty(source_record, "games_played", "gp"),
+            "minutes_per_game": _first_non_empty(source_record, "minutes_per_game", "mpg"),
+            "points_per_game": _first_non_empty(source_record, "points_per_game", "ppg"),
+            "rebounds_per_game": _first_non_empty(source_record, "rebounds_per_game", "rpg"),
+            "assists_per_game": _first_non_empty(source_record, "assists_per_game", "apg"),
+            "steals_per_game": _first_non_empty(source_record, "steals_per_game", "spg"),
+            "blocks_per_game": _first_non_empty(source_record, "blocks_per_game", "bpg"),
+            "turnovers_per_game": _first_non_empty(source_record, "turnovers_per_game", "tov"),
+            "fg_pct": _first_non_empty(source_record, "fg_pct", "fg_percentage"),
+            "three_pt_pct": _first_non_empty(
+                source_record, "three_pt_pct", "three_point_pct", "3p_pct", "three_pct"
+            ),
+            "ft_pct": _first_non_empty(source_record, "ft_pct", "ft_percentage"),
+        }
+
+        # Every field except "role" is mandatory for an import-ready row.
+        missing = [key for key, value in normalized.items() if key != "role" and value in (None, "")]
+        if missing:
+            raise ExtractorNormalizationError(f"bcl row missing required fields: {', '.join(sorted(missing))}")
+
+        # Identifiers may arrive as numbers; emit them as trimmed strings.
+        normalized["team_external_id"] = str(normalized["team_external_id"]).strip()
+        normalized["player_external_id"] = str(normalized["player_external_id"]).strip()
+        normalized["competition_external_id"] = str(normalized["competition_external_id"]).strip()
+        normalized["season"] = str(normalized["season"]).strip()
+
+        if self.include_raw_payload:
+            normalized["raw_payload"] = source_record
+
+        return normalized
--- a/apps/ingestion/extractors/registry.py
+++ b/apps/ingestion/extractors/registry.py
@@ -1,10 +1,12 @@
 from __future__ import annotations

+from .bcl import BCLSnapshotExtractor
 from .base import BaseSnapshotExtractor, ExtractorConfigError
 from .lba import LBASnapshotExtractor
 from .public_json import PublicJsonSnapshotExtractor

 EXTRACTOR_REGISTRY: dict[str, type[BaseSnapshotExtractor]] = {
+    BCLSnapshotExtractor.extractor_name: BCLSnapshotExtractor,
    LBASnapshotExtractor.extractor_name: LBASnapshotExtractor,
    PublicJsonSnapshotExtractor.extractor_name: PublicJsonSnapshotExtractor,
 }
--- a/apps/ingestion/management/commands/run_bcl_extractor.py
+++ b/apps/ingestion/management/commands/run_bcl_extractor.py
@@ -0,0 +1,61 @@
+from __future__ import annotations
+
+from django.core.management.base import BaseCommand, CommandError
+from django.utils.dateparse import parse_date
+
+from apps.ingestion.extractors import ExtractorError, create_extractor
+
+
+class Command(BaseCommand):
+    """Management command that runs the ``bcl`` extractor and prints a one-line summary."""
+
+    help = "Run only the BCL extractor and emit an import-ready snapshot JSON."
+
+    def add_arguments(self, parser):
+        """Register the output-path, snapshot-date, dry-run and indent options."""
+        parser.add_argument(
+            "--output-path",
+            dest="output_path",
+            default=None,
+            help="Directory or .json path to write output (default incoming dir).",
+        )
+        parser.add_argument(
+            "--snapshot-date",
+            dest="snapshot_date",
+            default=None,
+            help="Override snapshot date in YYYY-MM-DD format.",
+        )
+        parser.add_argument(
+            "--dry-run",
+            action="store_true",
+            help="Validate without writing output.",
+        )
+        parser.add_argument(
+            "--indent",
+            type=int,
+            default=2,
+            help="JSON indent level for emitted file.",
+        )
+
+    def handle(self, *args, **options):
+        """
+        Run the extractor with the parsed options and report the result on stdout.
+
+        Raises:
+            CommandError: If ``--snapshot-date`` is not a valid YYYY-MM-DD date,
+                or if the extractor raises ``ExtractorError``.
+        """
+        snapshot_date = None
+        raw_snapshot_date = options["snapshot_date"]
+        if raw_snapshot_date:
+            try:
+                snapshot_date = parse_date(raw_snapshot_date)
+            except ValueError:
+                # parse_date returns None for malformed input but raises for well-formed,
+                # impossible dates (e.g. 2026-02-30); report both the same way.
+                snapshot_date = None
+            if snapshot_date is None:
+                raise CommandError("--snapshot-date must be YYYY-MM-DD.")
+
+        try:
+            extractor = create_extractor("bcl")
+            result = extractor.run(
+                output_path=options["output_path"],
+                snapshot_date=snapshot_date,
+                write_output=not options["dry_run"],
+                indent=options["indent"],
+            )
+        except ExtractorError as exc:
+            raise CommandError(str(exc)) from exc
+
+        output = str(result.output_path) if result.output_path else "<dry-run>"
+        self.stdout.write(
+            self.style.SUCCESS(
+                f"BCL extractor completed: source={result.source_name} "
+                f"date={result.snapshot_date} records={result.records_count} output={output}"
+            )
+        )