feat(v2): add LBA snapshot extractor and command

This commit is contained in:
Alfredo Di Stasio
2026-03-13 14:28:35 +01:00
parent 850e4de71b
commit 97913c4a79
9 changed files with 358 additions and 0 deletions

View File

@ -45,6 +45,10 @@ EXTRACTOR_REQUEST_DELAY_SECONDS=0.5
EXTRACTOR_PUBLIC_JSON_URL=
EXTRACTOR_PUBLIC_SOURCE_NAME=public_json_source
EXTRACTOR_INCLUDE_RAW_PAYLOAD=0
EXTRACTOR_LBA_STATS_URL=
EXTRACTOR_LBA_SEASON_LABEL=2025-2026
EXTRACTOR_LBA_COMPETITION_EXTERNAL_ID=lba-serie-a
EXTRACTOR_LBA_COMPETITION_NAME=Lega Basket Serie A
# Future optional scheduler loop settings (not enabled in base v2 runtime) # Future optional scheduler loop settings (not enabled in base v2 runtime)
SCHEDULER_ENABLED=0 SCHEDULER_ENABLED=0

View File

@ -184,6 +184,7 @@ Extractor pipeline:
Built-in extractors in this phase:
- `public_json_snapshot` (generic JSON feed extractor for MVP usage)
- `lba` (Lega Basket Serie A MVP extractor)
Run extractor: Run extractor:
@ -203,6 +204,12 @@ Dry-run validation (no file write):
docker compose exec web python manage.py run_extractor public_json_snapshot --dry-run docker compose exec web python manage.py run_extractor public_json_snapshot --dry-run
``` ```
Run only the LBA extractor:
```bash
docker compose exec web python manage.py run_lba_extractor
```
Extractor environment variables: Extractor environment variables:
- `EXTRACTOR_USER_AGENT` - `EXTRACTOR_USER_AGENT`
- `EXTRACTOR_HTTP_TIMEOUT_SECONDS` - `EXTRACTOR_HTTP_TIMEOUT_SECONDS`
@ -212,12 +219,29 @@ Extractor environment variables:
- `EXTRACTOR_PUBLIC_JSON_URL` - `EXTRACTOR_PUBLIC_JSON_URL`
- `EXTRACTOR_PUBLIC_SOURCE_NAME` - `EXTRACTOR_PUBLIC_SOURCE_NAME`
- `EXTRACTOR_INCLUDE_RAW_PAYLOAD` - `EXTRACTOR_INCLUDE_RAW_PAYLOAD`
- `EXTRACTOR_LBA_STATS_URL`
- `EXTRACTOR_LBA_SEASON_LABEL`
- `EXTRACTOR_LBA_COMPETITION_EXTERNAL_ID`
- `EXTRACTOR_LBA_COMPETITION_NAME`
Notes:
- extraction is intentionally low-frequency and uses retries conservatively
- only public pages/endpoints should be targeted
- emitted snapshots must match the same schema consumed by `import_snapshots`
### LBA extractor assumptions and limitations (MVP)
- `source_name` is fixed to `lba`
- the extractor expects one stable public JSON payload that includes player/team/stat rows
- competition is configured by environment and emitted as:
- `competition_external_id` from `EXTRACTOR_LBA_COMPETITION_EXTERNAL_ID`
- `competition_name` from `EXTRACTOR_LBA_COMPETITION_NAME`
- season is configured by `EXTRACTOR_LBA_SEASON_LABEL`
- parser supports payload keys: `records`, `data`, `players`, `items`
- normalization supports nested `player` and `team` objects with common stat aliases (`gp/mpg/ppg/rpg/apg/spg/bpg/tov`)
- no BCL support in this task
- no live HTTP calls in tests; tests use fixtures/mocked responses only
## Migration and Superuser Commands ## Migration and Superuser Commands
```bash ```bash

View File

@ -7,10 +7,12 @@ from .base import (
ExtractorNormalizationError, ExtractorNormalizationError,
ExtractorParseError, ExtractorParseError,
) )
from .lba import LBASnapshotExtractor
from .registry import available_extractors, create_extractor from .registry import available_extractors, create_extractor
__all__ = [ __all__ = [
"BaseSnapshotExtractor", "BaseSnapshotExtractor",
"LBASnapshotExtractor",
"ExtractionResult", "ExtractionResult",
"ExtractorError", "ExtractorError",
"ExtractorConfigError", "ExtractorConfigError",

View File

@ -0,0 +1,137 @@
from __future__ import annotations
from typing import Any
from django.conf import settings
from .base import BaseSnapshotExtractor, ExtractorConfigError, ExtractorNormalizationError, ExtractorParseError
from .http import ResponsibleHttpClient
def _first_non_empty(record: dict[str, Any], *keys: str) -> Any:
for key in keys:
value = record.get(key)
if value not in (None, ""):
return value
return None
class LBASnapshotExtractor(BaseSnapshotExtractor):
    """
    LBA (Lega Basket Serie A) MVP extractor.

    Scope is intentionally conservative:
    - one configured public stats endpoint
    - one configured season label
    - normalized player-season rows only
    """

    extractor_name = "lba"
    source_name = "lba"

    def __init__(self, *, http_client: ResponsibleHttpClient | None = None):
        """Load LBA settings, validate them, then set up the HTTP client.

        Args:
            http_client: optional pre-built client (injected by tests); when
                omitted a ResponsibleHttpClient is built from extractor settings.

        Raises:
            ExtractorConfigError: if any required LBA setting is blank.
        """
        self.url = settings.EXTRACTOR_LBA_STATS_URL.strip()
        self.season_label = settings.EXTRACTOR_LBA_SEASON_LABEL.strip()
        self.competition_external_id = settings.EXTRACTOR_LBA_COMPETITION_EXTERNAL_ID.strip()
        self.competition_name = settings.EXTRACTOR_LBA_COMPETITION_NAME.strip()
        self.include_raw_payload = settings.EXTRACTOR_INCLUDE_RAW_PAYLOAD
        # Fail fast: validate configuration BEFORE constructing the HTTP client,
        # so a misconfigured extractor never builds a client it will not use.
        if not self.url:
            raise ExtractorConfigError("EXTRACTOR_LBA_STATS_URL is required for lba extractor.")
        if not self.season_label:
            raise ExtractorConfigError("EXTRACTOR_LBA_SEASON_LABEL is required for lba extractor.")
        if not self.competition_external_id:
            raise ExtractorConfigError("EXTRACTOR_LBA_COMPETITION_EXTERNAL_ID is required.")
        if not self.competition_name:
            raise ExtractorConfigError("EXTRACTOR_LBA_COMPETITION_NAME is required.")
        self.http_client = http_client or ResponsibleHttpClient(
            user_agent=settings.EXTRACTOR_USER_AGENT,
            timeout_seconds=settings.EXTRACTOR_HTTP_TIMEOUT_SECONDS,
            retries=settings.EXTRACTOR_HTTP_RETRIES,
            retry_sleep_seconds=settings.EXTRACTOR_RETRY_SLEEP_SECONDS,
            request_delay_seconds=settings.EXTRACTOR_REQUEST_DELAY_SECONDS,
        )

    def fetch(self) -> Any:
        """Fetch the configured public stats endpoint and return the decoded JSON."""
        return self.http_client.get_json(self.url)

    def parse(self, payload: Any) -> list[dict[str, Any]]:
        """Return the list of raw rows from the fetched payload.

        Accepts either a bare JSON array or a JSON object wrapping the rows
        under one of the known collection keys (`records`, `data`, `players`,
        `items`).

        Raises:
            ExtractorParseError: if the payload shape is not recognized.
        """
        if isinstance(payload, list):
            return payload
        if not isinstance(payload, dict):
            raise ExtractorParseError("LBA payload must be a JSON object or array.")
        for key in ("records", "data", "players", "items"):
            rows = payload.get(key)
            if isinstance(rows, list):
                return rows
        raise ExtractorParseError("LBA payload must contain one of: records, data, players, items.")

    def normalize_record(self, source_record: dict[str, Any]) -> dict[str, Any]:
        """Map one raw LBA row onto the snapshot schema consumed by import_snapshots.

        Supports flat rows as well as nested ``player``/``team`` objects, and the
        common stat aliases (gp/mpg/ppg/rpg/apg/spg/bpg/tov).

        Raises:
            ExtractorNormalizationError: if any required field (every field
                except ``role``) is missing or empty.
        """
        # Nested objects are optional; fall back to empty dicts so the alias
        # lookups below stay uniform.
        player_obj = source_record.get("player") if isinstance(source_record.get("player"), dict) else {}
        team_obj = source_record.get("team") if isinstance(source_record.get("team"), dict) else {}
        full_name = _first_non_empty(
            source_record,
            "full_name",
            "player_name",
            "name",
        ) or _first_non_empty(player_obj, "full_name", "name")
        first_name = _first_non_empty(source_record, "first_name") or _first_non_empty(player_obj, "first_name")
        last_name = _first_non_empty(source_record, "last_name") or _first_non_empty(player_obj, "last_name")
        player_external_id = _first_non_empty(
            source_record, "player_external_id", "player_id", "athlete_id"
        ) or _first_non_empty(player_obj, "id", "player_id")
        team_external_id = _first_non_empty(source_record, "team_external_id", "team_id") or _first_non_empty(
            team_obj, "id", "team_id"
        )
        team_name = _first_non_empty(source_record, "team_name", "team") or _first_non_empty(team_obj, "name")
        normalized = {
            # Competition/season come from configuration, not from the payload.
            "competition_external_id": self.competition_external_id,
            "competition_name": self.competition_name,
            "season": self.season_label,
            "team_external_id": team_external_id,
            "team_name": team_name,
            "player_external_id": player_external_id,
            "full_name": full_name,
            "first_name": first_name,
            "last_name": last_name,
            "birth_date": _first_non_empty(source_record, "birth_date") or _first_non_empty(
                player_obj, "birth_date", "dob"
            ),
            "nationality": _first_non_empty(source_record, "nationality")
            or _first_non_empty(player_obj, "nationality", "country"),
            "height_cm": _first_non_empty(source_record, "height_cm") or _first_non_empty(player_obj, "height_cm"),
            "weight_kg": _first_non_empty(source_record, "weight_kg") or _first_non_empty(player_obj, "weight_kg"),
            "position": _first_non_empty(source_record, "position") or _first_non_empty(player_obj, "position"),
            "role": _first_non_empty(source_record, "role"),
            "games_played": _first_non_empty(source_record, "games_played", "gp"),
            "minutes_per_game": _first_non_empty(source_record, "minutes_per_game", "mpg"),
            "points_per_game": _first_non_empty(source_record, "points_per_game", "ppg"),
            "rebounds_per_game": _first_non_empty(source_record, "rebounds_per_game", "rpg"),
            "assists_per_game": _first_non_empty(source_record, "assists_per_game", "apg"),
            "steals_per_game": _first_non_empty(source_record, "steals_per_game", "spg"),
            "blocks_per_game": _first_non_empty(source_record, "blocks_per_game", "bpg"),
            "turnovers_per_game": _first_non_empty(source_record, "turnovers_per_game", "tov"),
            "fg_pct": _first_non_empty(source_record, "fg_pct", "fg_percentage"),
            "three_pt_pct": _first_non_empty(
                source_record, "three_pt_pct", "three_point_pct", "3p_pct", "three_pct"
            ),
            "ft_pct": _first_non_empty(source_record, "ft_pct", "ft_percentage"),
        }
        # Every field except `role` is required in this MVP schema.
        missing = [key for key, value in normalized.items() if key != "role" and value in (None, "")]
        if missing:
            raise ExtractorNormalizationError(f"lba row missing required fields: {', '.join(sorted(missing))}")
        # External ids and season labels may arrive as ints or padded strings;
        # emit them as clean strings for stable downstream matching.
        normalized["team_external_id"] = str(normalized["team_external_id"]).strip()
        normalized["player_external_id"] = str(normalized["player_external_id"]).strip()
        normalized["competition_external_id"] = str(normalized["competition_external_id"]).strip()
        normalized["season"] = str(normalized["season"]).strip()
        if self.include_raw_payload:
            normalized["raw_payload"] = source_record
        return normalized

View File

@ -1,9 +1,11 @@
from __future__ import annotations from __future__ import annotations
from .base import BaseSnapshotExtractor, ExtractorConfigError from .base import BaseSnapshotExtractor, ExtractorConfigError
from .lba import LBASnapshotExtractor
from .public_json import PublicJsonSnapshotExtractor from .public_json import PublicJsonSnapshotExtractor
EXTRACTOR_REGISTRY: dict[str, type[BaseSnapshotExtractor]] = { EXTRACTOR_REGISTRY: dict[str, type[BaseSnapshotExtractor]] = {
LBASnapshotExtractor.extractor_name: LBASnapshotExtractor,
PublicJsonSnapshotExtractor.extractor_name: PublicJsonSnapshotExtractor, PublicJsonSnapshotExtractor.extractor_name: PublicJsonSnapshotExtractor,
} }

View File

@ -0,0 +1,61 @@
from __future__ import annotations
from django.core.management.base import BaseCommand, CommandError
from django.utils.dateparse import parse_date
from apps.ingestion.extractors import ExtractorError, create_extractor
class Command(BaseCommand):
    """Management command that runs the LBA extractor in isolation."""

    help = "Run only the LBA extractor and emit an import-ready snapshot JSON."

    def add_arguments(self, parser):
        """Register CLI flags: output location, date override, dry-run, indent."""
        parser.add_argument(
            "--output-path",
            dest="output_path",
            default=None,
            help="Directory or .json path to write output (default incoming dir).",
        )
        parser.add_argument(
            "--snapshot-date",
            dest="snapshot_date",
            default=None,
            help="Override snapshot date in YYYY-MM-DD format.",
        )
        parser.add_argument(
            "--dry-run",
            action="store_true",
            help="Validate without writing output.",
        )
        parser.add_argument(
            "--indent",
            type=int,
            default=2,
            help="JSON indent level for emitted file.",
        )

    def handle(self, *args, **options):
        """Run the extractor, converting extractor failures into CommandError."""
        raw_date = options["snapshot_date"]
        snapshot_date = None
        if raw_date:
            snapshot_date = parse_date(raw_date)
            if snapshot_date is None:
                raise CommandError("--snapshot-date must be YYYY-MM-DD.")
        try:
            result = create_extractor("lba").run(
                output_path=options["output_path"],
                snapshot_date=snapshot_date,
                write_output=not options["dry_run"],
                indent=options["indent"],
            )
        except ExtractorError as exc:
            raise CommandError(str(exc)) from exc
        destination = str(result.output_path) if result.output_path else "<dry-run>"
        summary = (
            f"LBA extractor completed: source={result.source_name} "
            f"date={result.snapshot_date} records={result.records_count} output={destination}"
        )
        self.stdout.write(self.style.SUCCESS(summary))

View File

@ -165,6 +165,10 @@ EXTRACTOR_REQUEST_DELAY_SECONDS = float(os.getenv("EXTRACTOR_REQUEST_DELAY_SECON
EXTRACTOR_PUBLIC_JSON_URL = os.getenv("EXTRACTOR_PUBLIC_JSON_URL", "").strip() EXTRACTOR_PUBLIC_JSON_URL = os.getenv("EXTRACTOR_PUBLIC_JSON_URL", "").strip()
EXTRACTOR_PUBLIC_SOURCE_NAME = os.getenv("EXTRACTOR_PUBLIC_SOURCE_NAME", "public_json_source").strip() EXTRACTOR_PUBLIC_SOURCE_NAME = os.getenv("EXTRACTOR_PUBLIC_SOURCE_NAME", "public_json_source").strip()
EXTRACTOR_INCLUDE_RAW_PAYLOAD = env_bool("EXTRACTOR_INCLUDE_RAW_PAYLOAD", False) EXTRACTOR_INCLUDE_RAW_PAYLOAD = env_bool("EXTRACTOR_INCLUDE_RAW_PAYLOAD", False)
EXTRACTOR_LBA_STATS_URL = os.getenv("EXTRACTOR_LBA_STATS_URL", "").strip()
EXTRACTOR_LBA_SEASON_LABEL = os.getenv("EXTRACTOR_LBA_SEASON_LABEL", "").strip()
EXTRACTOR_LBA_COMPETITION_EXTERNAL_ID = os.getenv("EXTRACTOR_LBA_COMPETITION_EXTERNAL_ID", "lba-serie-a").strip()
EXTRACTOR_LBA_COMPETITION_NAME = os.getenv("EXTRACTOR_LBA_COMPETITION_NAME", "Lega Basket Serie A").strip()
if EXTRACTOR_HTTP_TIMEOUT_SECONDS <= 0: if EXTRACTOR_HTTP_TIMEOUT_SECONDS <= 0:
raise ImproperlyConfigured("EXTRACTOR_HTTP_TIMEOUT_SECONDS must be > 0.") raise ImproperlyConfigured("EXTRACTOR_HTTP_TIMEOUT_SECONDS must be > 0.")

View File

@ -0,0 +1,32 @@
{
"data": [
{
"player": {
"id": "p-001",
"name": "Marco Rossi",
"first_name": "Marco",
"last_name": "Rossi",
"birth_date": "2000-01-05",
"nationality": "IT",
"height_cm": 190,
"weight_kg": 84,
"position": "PG"
},
"team": {
"id": "team-virtus-bologna",
"name": "Virtus Bologna"
},
"gp": 20,
"mpg": 28.3,
"ppg": 15.8,
"rpg": 3.4,
"apg": 5.9,
"spg": 1.4,
"bpg": 0.2,
"tov": 2.1,
"fg_pct": 47.6,
"three_pct": 36.5,
"ft_pct": 84.2
}
]
}

View File

@ -0,0 +1,92 @@
from __future__ import annotations
import json
from datetime import date
from pathlib import Path
import pytest
from django.core.management import call_command
from apps.ingestion.extractors.lba import LBASnapshotExtractor
from apps.ingestion.extractors.registry import create_extractor
def _load_fixture(path: str) -> dict:
    """Read and JSON-decode a fixture stored under this module's fixtures dir."""
    fixture_file = Path(__file__).parent.joinpath("fixtures", path)
    with fixture_file.open(encoding="utf-8") as handle:
        return json.load(handle)
@pytest.mark.django_db
def test_lba_extractor_normalizes_fixture_payload(tmp_path, settings):
    """Full extractor run against a fixture payload produces a normalized snapshot."""
    settings.EXTRACTOR_LBA_STATS_URL = "https://www.legabasket.it/public/stats.json"
    settings.EXTRACTOR_LBA_SEASON_LABEL = "2025-2026"
    settings.EXTRACTOR_LBA_COMPETITION_EXTERNAL_ID = "lba-serie-a"
    settings.EXTRACTOR_LBA_COMPETITION_NAME = "Lega Basket Serie A"
    payload_fixture = _load_fixture("lba/lba_players_stats.json")

    class StubClient:
        # Replaces the HTTP client so no live request is made.
        def get_json(self, *_args, **_kwargs):
            return payload_fixture

    destination = tmp_path / "lba.json"
    extractor = LBASnapshotExtractor(http_client=StubClient())
    outcome = extractor.run(output_path=destination, snapshot_date=date(2026, 3, 13))

    assert (outcome.extractor_name, outcome.source_name) == ("lba", "lba")
    assert outcome.records_count == 1

    written = json.loads(destination.read_text(encoding="utf-8"))
    assert written["source_name"] == "lba"
    assert written["snapshot_date"] == "2026-03-13"

    record = written["records"][0]
    expected_fields = {
        "competition_external_id": "lba-serie-a",
        "competition_name": "Lega Basket Serie A",
        "team_external_id": "team-virtus-bologna",
        "team_name": "Virtus Bologna",
        "player_external_id": "p-001",
        "full_name": "Marco Rossi",
        "minutes_per_game": 28.3,
        "three_pt_pct": 36.5,
    }
    for field, value in expected_fields.items():
        assert record[field] == value
@pytest.mark.django_db
def test_lba_extractor_registry_selection(settings):
    """The registry resolves the name "lba" to the LBA extractor class."""
    settings.EXTRACTOR_LBA_STATS_URL = "https://www.legabasket.it/public/stats.json"
    settings.EXTRACTOR_LBA_SEASON_LABEL = "2025-2026"
    assert isinstance(create_extractor("lba"), LBASnapshotExtractor)
@pytest.mark.django_db
def test_run_lba_extractor_command_writes_snapshot(tmp_path, settings, monkeypatch):
    """The management command writes a dated snapshot file via a patched client."""
    settings.EXTRACTOR_LBA_STATS_URL = "https://www.legabasket.it/public/stats.json"
    settings.EXTRACTOR_LBA_SEASON_LABEL = "2025-2026"
    settings.EXTRACTOR_LBA_COMPETITION_EXTERNAL_ID = "lba-serie-a"
    settings.EXTRACTOR_LBA_COMPETITION_NAME = "Lega Basket Serie A"
    payload_fixture = _load_fixture("lba/lba_players_stats.json")

    class StubClient:
        # Replaces the HTTP client so no live request is made.
        def get_json(self, *_args, **_kwargs):
            return payload_fixture

    # Patch the client class where the extractor module looks it up.
    monkeypatch.setattr(
        "apps.ingestion.extractors.lba.ResponsibleHttpClient",
        lambda **_kwargs: StubClient(),
    )

    call_command(
        "run_lba_extractor",
        "--output-path",
        str(tmp_path),
        "--snapshot-date",
        "2026-03-13",
    )

    matches = list(tmp_path.glob("lba-2026-03-13.json"))
    assert len(matches) == 1
    snapshot = json.loads(matches[0].read_text(encoding="utf-8"))
    assert snapshot["source_name"] == "lba"
    assert len(snapshot["records"]) == 1