From 97913c4a79c1ca5fd37d67e11790535c0e0ae823 Mon Sep 17 00:00:00 2001
From: Alfredo Di Stasio <alfredo.distasio@unina.it>
Date: Fri, 13 Mar 2026 14:28:35 +0100
Subject: [PATCH] feat(v2): add LBA snapshot extractor and command

---
 .env.example                                  |   4 +
 README.md                                     |  24 +++
 apps/ingestion/extractors/__init__.py         |   2 +
 apps/ingestion/extractors/lba.py              | 137 ++++++++++++++++++
 apps/ingestion/extractors/registry.py         |   2 +
 .../management/commands/run_lba_extractor.py  |  61 ++++++++
 config/settings/base.py                       |   4 +
 tests/fixtures/lba/lba_players_stats.json     |  32 ++++
 tests/test_lba_extractor.py                   |  92 ++++++++++++
 9 files changed, 358 insertions(+)
 create mode 100644 apps/ingestion/extractors/lba.py
 create mode 100644 apps/ingestion/management/commands/run_lba_extractor.py
 create mode 100644 tests/fixtures/lba/lba_players_stats.json
 create mode 100644 tests/test_lba_extractor.py

diff --git a/.env.example b/.env.example
index be47342..cbe118b 100644
--- a/.env.example
+++ b/.env.example
@@ -45,6 +45,10 @@ EXTRACTOR_REQUEST_DELAY_SECONDS=0.5
 EXTRACTOR_PUBLIC_JSON_URL=
 EXTRACTOR_PUBLIC_SOURCE_NAME=public_json_source
 EXTRACTOR_INCLUDE_RAW_PAYLOAD=0
+EXTRACTOR_LBA_STATS_URL=
+EXTRACTOR_LBA_SEASON_LABEL=2025-2026
+EXTRACTOR_LBA_COMPETITION_EXTERNAL_ID=lba-serie-a
+EXTRACTOR_LBA_COMPETITION_NAME=Lega Basket Serie A
 
 # Future optional scheduler loop settings (not enabled in base v2 runtime)
 SCHEDULER_ENABLED=0
diff --git a/README.md b/README.md
index 7e6e1bb..e1814a6 100644
--- a/README.md
+++ b/README.md
@@ -184,6 +184,7 @@ Extractor pipeline:
 
 Built-in extractor in this phase:
 - `public_json_snapshot` (generic JSON feed extractor for MVP usage)
+- `lba` (Lega Basket Serie A MVP extractor)
 
 Run extractor:
 
@@ -203,6 +204,12 @@ Dry-run validation (no file write):
 docker compose exec web python manage.py run_extractor public_json_snapshot --dry-run
 ```
 
+Run only the LBA extractor:
+
+```bash
+docker compose exec web python manage.py run_lba_extractor
+```
+
 Extractor environment variables:
 - `EXTRACTOR_USER_AGENT`
 - `EXTRACTOR_HTTP_TIMEOUT_SECONDS`
@@ -212,12 +219,29 @@ Extractor environment variables:
 - `EXTRACTOR_PUBLIC_JSON_URL`
 - `EXTRACTOR_PUBLIC_SOURCE_NAME`
 - `EXTRACTOR_INCLUDE_RAW_PAYLOAD`
+- `EXTRACTOR_LBA_STATS_URL`
+- `EXTRACTOR_LBA_SEASON_LABEL`
+- `EXTRACTOR_LBA_COMPETITION_EXTERNAL_ID`
+- `EXTRACTOR_LBA_COMPETITION_NAME`
 
 Notes:
 - extraction is intentionally low-frequency and uses retries conservatively
 - only public pages/endpoints should be targeted
 - emitted snapshots must match the same schema consumed by `import_snapshots`
 
+### LBA extractor assumptions and limitations (MVP)
+
+- `source_name` is fixed to `lba`
+- the extractor expects one stable public JSON payload that includes player/team/stat rows
+- competition is configured by environment and emitted as:
+  - `competition_external_id` from `EXTRACTOR_LBA_COMPETITION_EXTERNAL_ID`
+  - `competition_name` from `EXTRACTOR_LBA_COMPETITION_NAME`
+- season is configured by `EXTRACTOR_LBA_SEASON_LABEL`
+- parser accepts either a top-level JSON array of rows, or a JSON object holding the rows under one of the keys `records`, `data`, `players`, `items`
+- normalization supports nested `player` and `team` objects with common stat aliases (`gp/mpg/ppg/rpg/apg/spg/bpg/tov`)
+- no BCL support in this task
+- no live HTTP calls in tests; tests use fixtures/mocked responses only
+
 ## Migration and Superuser Commands
 
 ```bash
diff --git a/apps/ingestion/extractors/__init__.py b/apps/ingestion/extractors/__init__.py
index d459cd8..b58d36a 100644
--- a/apps/ingestion/extractors/__init__.py
+++ b/apps/ingestion/extractors/__init__.py
@@ -7,10 +7,12 @@ from .base import (
     ExtractorNormalizationError,
     ExtractorParseError,
 )
+from .lba import LBASnapshotExtractor
 from .registry import available_extractors, create_extractor
 
 __all__ = [
     "BaseSnapshotExtractor",
+    "LBASnapshotExtractor",
     "ExtractionResult",
     "ExtractorError",
     "ExtractorConfigError",
diff --git a/apps/ingestion/extractors/lba.py b/apps/ingestion/extractors/lba.py
new file mode 100644
index 0000000..d2536b0
--- /dev/null
+++ b/apps/ingestion/extractors/lba.py
@@ -0,0 +1,137 @@
+from __future__ import annotations
+
+from typing import Any
+
+from django.conf import settings
+
+from .base import BaseSnapshotExtractor, ExtractorConfigError, ExtractorNormalizationError, ExtractorParseError
+from .http import ResponsibleHttpClient
+
+
+def _first_non_empty(record: dict[str, Any], *keys: str) -> Any:
+    """
+    Return the first value in ``record`` found under ``keys`` that is neither None nor "".
+    Keys are tried in the given order (0 and False count as values); None means nothing matched.
+    """
+    return next((found for found in (record.get(key) for key in keys) if found not in (None, "")), None)
+
+
+class LBASnapshotExtractor(BaseSnapshotExtractor):
+    """
+    LBA (Lega Basket Serie A) MVP extractor.
+
+    Scope is intentionally conservative:
+    - one configured public stats endpoint
+    - one configured season label
+    - normalized player-season rows only
+    """
+
+    extractor_name = "lba"
+    source_name = "lba"
+
+    def __init__(self, *, http_client: ResponsibleHttpClient | None = None):
+        """Read the LBA settings; raise ExtractorConfigError when a required value is blank."""
+        self.url = settings.EXTRACTOR_LBA_STATS_URL.strip()
+        self.season_label = settings.EXTRACTOR_LBA_SEASON_LABEL.strip()
+        self.competition_external_id = settings.EXTRACTOR_LBA_COMPETITION_EXTERNAL_ID.strip()
+        self.competition_name = settings.EXTRACTOR_LBA_COMPETITION_NAME.strip()
+        self.include_raw_payload = settings.EXTRACTOR_INCLUDE_RAW_PAYLOAD
+        self.http_client = http_client or ResponsibleHttpClient(
+            user_agent=settings.EXTRACTOR_USER_AGENT,
+            timeout_seconds=settings.EXTRACTOR_HTTP_TIMEOUT_SECONDS,
+            retries=settings.EXTRACTOR_HTTP_RETRIES,
+            retry_sleep_seconds=settings.EXTRACTOR_RETRY_SLEEP_SECONDS,
+            request_delay_seconds=settings.EXTRACTOR_REQUEST_DELAY_SECONDS,
+        )
+        if not self.url:
+            raise ExtractorConfigError("EXTRACTOR_LBA_STATS_URL is required for lba extractor.")
+        if not self.season_label:
+            raise ExtractorConfigError("EXTRACTOR_LBA_SEASON_LABEL is required for lba extractor.")
+        if not self.competition_external_id:
+            raise ExtractorConfigError("EXTRACTOR_LBA_COMPETITION_EXTERNAL_ID is required.")
+        if not self.competition_name:
+            raise ExtractorConfigError("EXTRACTOR_LBA_COMPETITION_NAME is required.")
+
+    def fetch(self) -> Any:
+        """Return whatever ``http_client.get_json`` yields for the configured stats URL."""
+        return self.http_client.get_json(self.url)
+
+    def parse(self, payload: Any) -> list[dict[str, Any]]:
+        """Return the rows: a top-level list as-is, else the first list found under a supported key."""
+        if isinstance(payload, list):
+            return payload
+        if not isinstance(payload, dict):
+            raise ExtractorParseError("LBA payload must be a JSON object or array.")
+        for key in ("records", "data", "players", "items"):
+            rows = payload.get(key)
+            if isinstance(rows, list):
+                return rows
+        raise ExtractorParseError("LBA payload must contain one of: records, data, players, items.")
+
+    def normalize_record(self, source_record: dict[str, Any]) -> dict[str, Any]:
+        """
+        Map one raw LBA row onto the snapshot record schema.
+
+        Top-level keys take precedence over the nested ``player`` / ``team`` objects. Every field
+        except ``role`` is required; ExtractorNormalizationError names the missing ones.
+        """
+        if not isinstance(source_record, dict):
+            raise ExtractorNormalizationError("lba row must be a JSON object.")
+        raw_player, raw_team = source_record.get("player"), source_record.get("team")
+        player_obj = raw_player if isinstance(raw_player, dict) else {}
+        team_obj = raw_team if isinstance(raw_team, dict) else {}
+        # "team" may be a plain name or a nested object; only the plain form is usable as team_name.
+        flat_team = None if isinstance(raw_team, dict) else raw_team
+
+        full_name = _first_non_empty(source_record, "full_name", "player_name", "name") or _first_non_empty(
+            player_obj, "full_name", "name"
+        )
+        first_name = _first_non_empty(source_record, "first_name") or _first_non_empty(player_obj, "first_name")
+        last_name = _first_non_empty(source_record, "last_name") or _first_non_empty(player_obj, "last_name")
+        player_external_id = _first_non_empty(
+            source_record, "player_external_id", "player_id", "athlete_id"
+        ) or _first_non_empty(player_obj, "id", "player_id")
+        team_external_id = _first_non_empty(source_record, "team_external_id", "team_id") or _first_non_empty(
+            team_obj, "id", "team_id"
+        )
+        team_name = _first_non_empty(source_record, "team_name") or flat_team or _first_non_empty(team_obj, "name")
+
+        normalized = {
+            "competition_external_id": self.competition_external_id,
+            "competition_name": self.competition_name,
+            "season": self.season_label,
+            "team_external_id": team_external_id,
+            "team_name": team_name,
+            "player_external_id": player_external_id,
+            "full_name": full_name,
+            "first_name": first_name,
+            "last_name": last_name,
+            "birth_date": _first_non_empty(source_record, "birth_date") or _first_non_empty(
+                player_obj, "birth_date", "dob"
+            ),
+            "nationality": _first_non_empty(source_record, "nationality")
+            or _first_non_empty(player_obj, "nationality", "country"),
+            "height_cm": _first_non_empty(source_record, "height_cm") or _first_non_empty(player_obj, "height_cm"),
+            "weight_kg": _first_non_empty(source_record, "weight_kg") or _first_non_empty(player_obj, "weight_kg"),
+            "position": _first_non_empty(source_record, "position") or _first_non_empty(player_obj, "position"),
+            "role": _first_non_empty(source_record, "role"),
+            "games_played": _first_non_empty(source_record, "games_played", "gp"),
+            "minutes_per_game": _first_non_empty(source_record, "minutes_per_game", "mpg"),
+            "points_per_game": _first_non_empty(source_record, "points_per_game", "ppg"),
+            "rebounds_per_game": _first_non_empty(source_record, "rebounds_per_game", "rpg"),
+            "assists_per_game": _first_non_empty(source_record, "assists_per_game", "apg"),
+            "steals_per_game": _first_non_empty(source_record, "steals_per_game", "spg"),
+            "blocks_per_game": _first_non_empty(source_record, "blocks_per_game", "bpg"),
+            "turnovers_per_game": _first_non_empty(source_record, "turnovers_per_game", "tov"),
+            "fg_pct": _first_non_empty(source_record, "fg_pct", "fg_percentage"),
+            "three_pt_pct": _first_non_empty(
+                source_record, "three_pt_pct", "three_point_pct", "3p_pct", "three_pct"
+            ),
+            "ft_pct": _first_non_empty(source_record, "ft_pct", "ft_percentage"),
+        }
+        missing = [key for key, value in normalized.items() if key != "role" and value in (None, "")]
+        if missing:
+            raise ExtractorNormalizationError(f"lba row missing required fields: {', '.join(sorted(missing))}")
+        for id_key in ("team_external_id", "player_external_id", "competition_external_id", "season"):
+            normalized[id_key] = str(normalized[id_key]).strip()
+        return {**normalized, "raw_payload": source_record} if self.include_raw_payload else normalized
diff --git a/apps/ingestion/extractors/registry.py b/apps/ingestion/extractors/registry.py
index bd960dc..19e64aa 100644
--- a/apps/ingestion/extractors/registry.py
+++ b/apps/ingestion/extractors/registry.py
@@ -1,9 +1,11 @@
 from __future__ import annotations
 
 from .base import BaseSnapshotExtractor, ExtractorConfigError
+from .lba import LBASnapshotExtractor
 from .public_json import PublicJsonSnapshotExtractor
 
 EXTRACTOR_REGISTRY: dict[str, type[BaseSnapshotExtractor]] = {
+    LBASnapshotExtractor.extractor_name: LBASnapshotExtractor,
     PublicJsonSnapshotExtractor.extractor_name: PublicJsonSnapshotExtractor,
 }
 
diff --git a/apps/ingestion/management/commands/run_lba_extractor.py b/apps/ingestion/management/commands/run_lba_extractor.py
new file mode 100644
index 0000000..6eb7b39
--- /dev/null
+++ b/apps/ingestion/management/commands/run_lba_extractor.py
@@ -0,0 +1,61 @@
+from __future__ import annotations
+
+from django.core.management.base import BaseCommand, CommandError
+from django.utils.dateparse import parse_date
+
+from apps.ingestion.extractors import ExtractorError, create_extractor
+
+
+class Command(BaseCommand):
+    """Management command that runs the ``lba`` extractor and prints a one-line summary."""
+
+    help = "Run only the LBA extractor and emit an import-ready snapshot JSON."
+
+    def add_arguments(self, parser):
+        """Register --output-path, --snapshot-date, --dry-run and --indent."""
+        parser.add_argument(
+            "--output-path",
+            dest="output_path",
+            default=None,
+            help="Directory or .json path to write output (default incoming dir).",
+        )
+        parser.add_argument(
+            "--snapshot-date",
+            dest="snapshot_date",
+            default=None,
+            help="Override snapshot date in YYYY-MM-DD format.",
+        )
+        parser.add_argument("--dry-run", action="store_true", help="Validate without writing output.")
+        parser.add_argument("--indent", type=int, default=2, help="JSON indent level for emitted file.")
+
+    def handle(self, *args, **options):
+        """Run the lba extractor; a bad --snapshot-date or any ExtractorError is re-raised as CommandError."""
+        snapshot_date = None
+        if options["snapshot_date"]:
+            try:
+                snapshot_date = parse_date(options["snapshot_date"])
+            except ValueError:
+                # Django's parse_date raises for well-formed but impossible dates (e.g. 2026-02-30)
+                # instead of returning None; report both cases through the same CommandError.
+                snapshot_date = None
+            if snapshot_date is None:
+                raise CommandError("--snapshot-date must be YYYY-MM-DD.")
+
+        try:
+            extractor = create_extractor("lba")
+            result = extractor.run(
+                output_path=options["output_path"],
+                snapshot_date=snapshot_date,
+                write_output=not options["dry_run"],
+                indent=options["indent"],
+            )
+        except ExtractorError as exc:
+            raise CommandError(str(exc)) from exc
+
+        output = str(result.output_path) if result.output_path else "<dry-run>"
+        self.stdout.write(
+            self.style.SUCCESS(
+                f"LBA extractor completed: source={result.source_name} "
+                f"date={result.snapshot_date} records={result.records_count} output={output}"
+            )
+        )
diff --git a/config/settings/base.py b/config/settings/base.py
index 52015f0..1f9b410 100644
--- a/config/settings/base.py
+++ b/config/settings/base.py
@@ -165,6 +165,10 @@ EXTRACTOR_REQUEST_DELAY_SECONDS = float(os.getenv("EXTRACTOR_REQUEST_DELAY_SECON
 EXTRACTOR_PUBLIC_JSON_URL = os.getenv("EXTRACTOR_PUBLIC_JSON_URL", "").strip()
 EXTRACTOR_PUBLIC_SOURCE_NAME = os.getenv("EXTRACTOR_PUBLIC_SOURCE_NAME", "public_json_source").strip()
 EXTRACTOR_INCLUDE_RAW_PAYLOAD = env_bool("EXTRACTOR_INCLUDE_RAW_PAYLOAD", False)
+EXTRACTOR_LBA_STATS_URL = os.getenv("EXTRACTOR_LBA_STATS_URL", "").strip()
+EXTRACTOR_LBA_SEASON_LABEL = os.getenv("EXTRACTOR_LBA_SEASON_LABEL", "").strip()
+EXTRACTOR_LBA_COMPETITION_EXTERNAL_ID = os.getenv("EXTRACTOR_LBA_COMPETITION_EXTERNAL_ID", "lba-serie-a").strip()
+EXTRACTOR_LBA_COMPETITION_NAME = os.getenv("EXTRACTOR_LBA_COMPETITION_NAME", "Lega Basket Serie A").strip()
 
 if EXTRACTOR_HTTP_TIMEOUT_SECONDS <= 0:
     raise ImproperlyConfigured("EXTRACTOR_HTTP_TIMEOUT_SECONDS must be > 0.")
diff --git a/tests/fixtures/lba/lba_players_stats.json b/tests/fixtures/lba/lba_players_stats.json
new file mode 100644
index 0000000..f07add2
--- /dev/null
+++ b/tests/fixtures/lba/lba_players_stats.json
@@ -0,0 +1,32 @@
+{
+  "data": [
+    {
+      "player": {
+        "id": "p-001",
+        "name": "Marco Rossi",
+        "first_name": "Marco",
+        "last_name": "Rossi",
+        "birth_date": "2000-01-05",
+        "nationality": "IT",
+        "height_cm": 190,
+        "weight_kg": 84,
+        "position": "PG"
+      },
+      "team": {
+        "id": "team-virtus-bologna",
+        "name": "Virtus Bologna"
+      },
+      "gp": 20,
+      "mpg": 28.3,
+      "ppg": 15.8,
+      "rpg": 3.4,
+      "apg": 5.9,
+      "spg": 1.4,
+      "bpg": 0.2,
+      "tov": 2.1,
+      "fg_pct": 47.6,
+      "three_pct": 36.5,
+      "ft_pct": 84.2
+    }
+  ]
+}
diff --git a/tests/test_lba_extractor.py b/tests/test_lba_extractor.py
new file mode 100644
index 0000000..aadc278
--- /dev/null
+++ b/tests/test_lba_extractor.py
@@ -0,0 +1,92 @@
+from __future__ import annotations
+
+import json
+from datetime import date
+from pathlib import Path
+
+import pytest
+from django.core.management import call_command
+
+from apps.ingestion.extractors.lba import LBASnapshotExtractor
+from apps.ingestion.extractors.registry import create_extractor
+
+
+def _load_fixture(path: str) -> dict:
+    """Decode the JSON file at ``path`` inside the ``fixtures`` directory next to this module."""
+    return json.loads((Path(__file__).parent / "fixtures" / path).read_text(encoding="utf-8"))
+
+
+@pytest.mark.django_db
+def test_lba_extractor_normalizes_fixture_payload(tmp_path, settings):
+    settings.EXTRACTOR_LBA_STATS_URL = "https://www.legabasket.it/public/stats.json"
+    settings.EXTRACTOR_LBA_SEASON_LABEL = "2025-2026"
+    settings.EXTRACTOR_LBA_COMPETITION_EXTERNAL_ID = "lba-serie-a"
+    settings.EXTRACTOR_LBA_COMPETITION_NAME = "Lega Basket Serie A"
+    # Purpose: run the extractor end to end against the canned fixture and check the written snapshot.
+    fixture_payload = _load_fixture("lba/lba_players_stats.json")
+    # Stand-in HTTP client: returns the fixture for any get_json call, so no request is made.
+    class FakeClient:
+        def get_json(self, *_args, **_kwargs):
+            return fixture_payload
+
+    extractor = LBASnapshotExtractor(http_client=FakeClient())
+    output_path = tmp_path / "lba.json"
+    result = extractor.run(output_path=output_path, snapshot_date=date(2026, 3, 13))
+    # Metadata reported by the run; the fixture holds exactly one player row.
+    assert result.extractor_name == "lba"
+    assert result.source_name == "lba"
+    assert result.records_count == 1
+    # Contents of the snapshot file written to output_path.
+    payload = json.loads(output_path.read_text(encoding="utf-8"))
+    assert payload["source_name"] == "lba"
+    assert payload["snapshot_date"] == "2026-03-13"
+    row = payload["records"][0]
+    assert row["competition_external_id"] == "lba-serie-a"
+    assert row["competition_name"] == "Lega Basket Serie A"
+    assert row["team_external_id"] == "team-virtus-bologna"  # taken from the nested "team" object
+    assert row["team_name"] == "Virtus Bologna"  # nested team "name", not the team object itself
+    assert row["player_external_id"] == "p-001"
+    assert row["full_name"] == "Marco Rossi"
+    assert row["minutes_per_game"] == 28.3  # fixture supplies this under the "mpg" alias
+    assert row["three_pt_pct"] == 36.5  # fixture supplies this under the "three_pct" alias
+
+
+@pytest.mark.django_db
+def test_lba_extractor_registry_selection(settings):
+    settings.EXTRACTOR_LBA_STATS_URL = "https://www.legabasket.it/public/stats.json"
+    settings.EXTRACTOR_LBA_SEASON_LABEL = "2025-2026"
+    extractor = create_extractor("lba")  # competition id/name settings are not overridden in this test
+    assert isinstance(extractor, LBASnapshotExtractor)
+
+
+@pytest.mark.django_db
+def test_run_lba_extractor_command_writes_snapshot(tmp_path, settings, monkeypatch):
+    settings.EXTRACTOR_LBA_STATS_URL = "https://www.legabasket.it/public/stats.json"
+    settings.EXTRACTOR_LBA_SEASON_LABEL = "2025-2026"
+    settings.EXTRACTOR_LBA_COMPETITION_EXTERNAL_ID = "lba-serie-a"
+    settings.EXTRACTOR_LBA_COMPETITION_NAME = "Lega Basket Serie A"
+    # Canned payload served in place of a live HTTP response.
+    fixture_payload = _load_fixture("lba/lba_players_stats.json")
+
+    class FakeClient:
+        def get_json(self, *_args, **_kwargs):
+            return fixture_payload
+    # The command builds its own extractor, so replace the client class that the lba module instantiates.
+    monkeypatch.setattr(
+        "apps.ingestion.extractors.lba.ResponsibleHttpClient",
+        lambda **_kwargs: FakeClient(),
+    )
+
+    call_command(
+        "run_lba_extractor",
+        "--output-path",
+        str(tmp_path),
+        "--snapshot-date",
+        "2026-03-13",
+    )
+    # With a directory as --output-path, the file is expected to be named lba-<snapshot-date>.json.
+    files = list(tmp_path.glob("lba-2026-03-13.json"))
+    assert len(files) == 1
+    payload = json.loads(files[0].read_text(encoding="utf-8"))
+    assert payload["source_name"] == "lba"
+    assert len(payload["records"]) == 1