From 97913c4a79c1ca5fd37d67e11790535c0e0ae823 Mon Sep 17 00:00:00 2001 From: Alfredo Di Stasio Date: Fri, 13 Mar 2026 14:28:35 +0100 Subject: [PATCH] feat(v2): add LBA snapshot extractor and command --- .env.example | 4 + README.md | 24 +++ apps/ingestion/extractors/__init__.py | 2 + apps/ingestion/extractors/lba.py | 137 ++++++++++++++++++ apps/ingestion/extractors/registry.py | 2 + .../management/commands/run_lba_extractor.py | 61 ++++++++ config/settings/base.py | 4 + tests/fixtures/lba/lba_players_stats.json | 32 ++++ tests/test_lba_extractor.py | 92 ++++++++++++ 9 files changed, 358 insertions(+) create mode 100644 apps/ingestion/extractors/lba.py create mode 100644 apps/ingestion/management/commands/run_lba_extractor.py create mode 100644 tests/fixtures/lba/lba_players_stats.json create mode 100644 tests/test_lba_extractor.py diff --git a/.env.example b/.env.example index be47342..cbe118b 100644 --- a/.env.example +++ b/.env.example @@ -45,6 +45,10 @@ EXTRACTOR_REQUEST_DELAY_SECONDS=0.5 EXTRACTOR_PUBLIC_JSON_URL= EXTRACTOR_PUBLIC_SOURCE_NAME=public_json_source EXTRACTOR_INCLUDE_RAW_PAYLOAD=0 +EXTRACTOR_LBA_STATS_URL= +EXTRACTOR_LBA_SEASON_LABEL=2025-2026 +EXTRACTOR_LBA_COMPETITION_EXTERNAL_ID=lba-serie-a +EXTRACTOR_LBA_COMPETITION_NAME=Lega Basket Serie A # Future optional scheduler loop settings (not enabled in base v2 runtime) SCHEDULER_ENABLED=0 diff --git a/README.md b/README.md index 7e6e1bb..e1814a6 100644 --- a/README.md +++ b/README.md @@ -184,6 +184,7 @@ Extractor pipeline: Built-in extractor in this phase: - `public_json_snapshot` (generic JSON feed extractor for MVP usage) +- `lba` (Lega Basket Serie A MVP extractor) Run extractor: @@ -203,6 +204,12 @@ Dry-run validation (no file write): docker compose exec web python manage.py run_extractor public_json_snapshot --dry-run ``` +Run only the LBA extractor: + +```bash +docker compose exec web python manage.py run_lba_extractor +``` + Extractor environment variables: - `EXTRACTOR_USER_AGENT` - 
`EXTRACTOR_HTTP_TIMEOUT_SECONDS` @@ -212,12 +219,29 @@ Extractor environment variables: - `EXTRACTOR_PUBLIC_JSON_URL` - `EXTRACTOR_PUBLIC_SOURCE_NAME` - `EXTRACTOR_INCLUDE_RAW_PAYLOAD` +- `EXTRACTOR_LBA_STATS_URL` +- `EXTRACTOR_LBA_SEASON_LABEL` +- `EXTRACTOR_LBA_COMPETITION_EXTERNAL_ID` +- `EXTRACTOR_LBA_COMPETITION_NAME` Notes: - extraction is intentionally low-frequency and uses retries conservatively - only public pages/endpoints should be targeted - emitted snapshots must match the same schema consumed by `import_snapshots` +### LBA extractor assumptions and limitations (MVP) + +- `source_name` is fixed to `lba` +- the extractor expects one stable public JSON payload that includes player/team/stat rows +- competition is configured by environment and emitted as: + - `competition_external_id` from `EXTRACTOR_LBA_COMPETITION_EXTERNAL_ID` + - `competition_name` from `EXTRACTOR_LBA_COMPETITION_NAME` +- season is configured by `EXTRACTOR_LBA_SEASON_LABEL` +- parser supports payload keys: `records`, `data`, `players`, `items` +- normalization supports nested `player` and `team` objects with common stat aliases (`gp/mpg/ppg/rpg/apg/spg/bpg/tov`) +- no BCL support in this task +- no live HTTP calls in tests; tests use fixtures/mocked responses only + ## Migration and Superuser Commands ```bash diff --git a/apps/ingestion/extractors/__init__.py b/apps/ingestion/extractors/__init__.py index d459cd8..b58d36a 100644 --- a/apps/ingestion/extractors/__init__.py +++ b/apps/ingestion/extractors/__init__.py @@ -7,10 +7,12 @@ from .base import ( ExtractorNormalizationError, ExtractorParseError, ) +from .lba import LBASnapshotExtractor from .registry import available_extractors, create_extractor __all__ = [ "BaseSnapshotExtractor", + "LBASnapshotExtractor", "ExtractionResult", "ExtractorError", "ExtractorConfigError", diff --git a/apps/ingestion/extractors/lba.py b/apps/ingestion/extractors/lba.py new file mode 100644 index 0000000..d2536b0 --- /dev/null +++ 
b/apps/ingestion/extractors/lba.py @@ -0,0 +1,137 @@ +from __future__ import annotations + +from typing import Any + +from django.conf import settings + +from .base import BaseSnapshotExtractor, ExtractorConfigError, ExtractorNormalizationError, ExtractorParseError +from .http import ResponsibleHttpClient + + +def _first_non_empty(record: dict[str, Any], *keys: str) -> Any: + for key in keys: + value = record.get(key) + if value not in (None, ""): + return value + return None + + +class LBASnapshotExtractor(BaseSnapshotExtractor): + """ + LBA (Lega Basket Serie A) MVP extractor. + + Scope is intentionally conservative: + - one configured public stats endpoint + - one configured season label + - normalized player-season rows only + """ + + extractor_name = "lba" + source_name = "lba" + + def __init__(self, *, http_client: ResponsibleHttpClient | None = None): + self.url = settings.EXTRACTOR_LBA_STATS_URL.strip() + self.season_label = settings.EXTRACTOR_LBA_SEASON_LABEL.strip() + self.competition_external_id = settings.EXTRACTOR_LBA_COMPETITION_EXTERNAL_ID.strip() + self.competition_name = settings.EXTRACTOR_LBA_COMPETITION_NAME.strip() + self.include_raw_payload = settings.EXTRACTOR_INCLUDE_RAW_PAYLOAD + self.http_client = http_client or ResponsibleHttpClient( + user_agent=settings.EXTRACTOR_USER_AGENT, + timeout_seconds=settings.EXTRACTOR_HTTP_TIMEOUT_SECONDS, + retries=settings.EXTRACTOR_HTTP_RETRIES, + retry_sleep_seconds=settings.EXTRACTOR_RETRY_SLEEP_SECONDS, + request_delay_seconds=settings.EXTRACTOR_REQUEST_DELAY_SECONDS, + ) + + if not self.url: + raise ExtractorConfigError("EXTRACTOR_LBA_STATS_URL is required for lba extractor.") + if not self.season_label: + raise ExtractorConfigError("EXTRACTOR_LBA_SEASON_LABEL is required for lba extractor.") + if not self.competition_external_id: + raise ExtractorConfigError("EXTRACTOR_LBA_COMPETITION_EXTERNAL_ID is required.") + if not self.competition_name: + raise ExtractorConfigError("EXTRACTOR_LBA_COMPETITION_NAME 
class LBASnapshotExtractor(BaseSnapshotExtractor):
    """
    LBA (Lega Basket Serie A) MVP extractor.

    Scope is intentionally conservative:
    - one configured public stats endpoint
    - one configured season label
    - normalized player-season rows only
    """

    extractor_name = "lba"
    source_name = "lba"

    def __init__(self, *, http_client: ResponsibleHttpClient | None = None):
        """Read the LBA settings and prepare the HTTP client.

        Args:
            http_client: Optional client exposing ``get_json(url)``. When it
                is omitted (or falsy) a ``ResponsibleHttpClient`` is built
                from the ``EXTRACTOR_*`` HTTP settings.

        Raises:
            ExtractorConfigError: If the stats URL, season label, competition
                external id or competition name is empty after stripping.
        """
        self.url = settings.EXTRACTOR_LBA_STATS_URL.strip()
        self.season_label = settings.EXTRACTOR_LBA_SEASON_LABEL.strip()
        self.competition_external_id = settings.EXTRACTOR_LBA_COMPETITION_EXTERNAL_ID.strip()
        self.competition_name = settings.EXTRACTOR_LBA_COMPETITION_NAME.strip()
        self.include_raw_payload = settings.EXTRACTOR_INCLUDE_RAW_PAYLOAD

        # Validate configuration before building the default HTTP client so a
        # misconfigured extractor fails fast with a configuration error.
        if not self.url:
            raise ExtractorConfigError("EXTRACTOR_LBA_STATS_URL is required for lba extractor.")
        if not self.season_label:
            raise ExtractorConfigError("EXTRACTOR_LBA_SEASON_LABEL is required for lba extractor.")
        if not self.competition_external_id:
            raise ExtractorConfigError("EXTRACTOR_LBA_COMPETITION_EXTERNAL_ID is required.")
        if not self.competition_name:
            raise ExtractorConfigError("EXTRACTOR_LBA_COMPETITION_NAME is required.")

        self.http_client = http_client or ResponsibleHttpClient(
            user_agent=settings.EXTRACTOR_USER_AGENT,
            timeout_seconds=settings.EXTRACTOR_HTTP_TIMEOUT_SECONDS,
            retries=settings.EXTRACTOR_HTTP_RETRIES,
            retry_sleep_seconds=settings.EXTRACTOR_RETRY_SLEEP_SECONDS,
            request_delay_seconds=settings.EXTRACTOR_REQUEST_DELAY_SECONDS,
        )

    def fetch(self) -> Any:
        """Return the JSON payload obtained from the configured stats URL."""
        return self.http_client.get_json(self.url)

    def parse(self, payload: Any) -> list[dict[str, Any]]:
        """Extract the list of source rows from the fetched payload.

        A top-level list is returned as-is. For an object, the first of
        ``records``, ``data``, ``players``, ``items`` holding a list wins.

        Raises:
            ExtractorParseError: If the payload is neither a list nor a dict,
                or if none of the supported keys holds a list.
        """
        if isinstance(payload, list):
            return payload
        if not isinstance(payload, dict):
            raise ExtractorParseError("LBA payload must be a JSON object or array.")

        for key in ("records", "data", "players", "items"):
            rows = payload.get(key)
            if isinstance(rows, list):
                return rows

        raise ExtractorParseError("LBA payload must contain one of: records, data, players, items.")

    def normalize_record(self, source_record: dict[str, Any]) -> dict[str, Any]:
        """Map one source row onto the normalized player-season record.

        Flat keys on the row take precedence; nested ``player`` / ``team``
        objects are used as a fallback. Competition and season values come
        from the extractor configuration, not from the row.

        Raises:
            ExtractorNormalizationError: If any field other than ``role`` is
                ``None`` or an empty string after normalization.
        """
        raw_player = source_record.get("player")
        raw_team = source_record.get("team")
        player_obj = raw_player if isinstance(raw_player, dict) else {}
        team_obj = raw_team if isinstance(raw_team, dict) else {}

        full_name = _first_non_empty(
            source_record,
            "full_name",
            "player_name",
            "name",
        ) or _first_non_empty(player_obj, "full_name", "name")
        first_name = _first_non_empty(source_record, "first_name") or _first_non_empty(player_obj, "first_name")
        last_name = _first_non_empty(source_record, "last_name") or _first_non_empty(player_obj, "last_name")
        player_external_id = _first_non_empty(
            source_record, "player_external_id", "player_id", "athlete_id"
        ) or _first_non_empty(player_obj, "id", "player_id")
        team_external_id = _first_non_empty(source_record, "team_external_id", "team_id") or _first_non_empty(
            team_obj, "id", "team_id"
        )

        # "team" is either a flat display name or a nested object. Only the
        # flat form is a usable name; a nested object must never leak into
        # team_name, so it is masked out here and its "name" is used instead.
        flat_team_fields = {
            "team_name": source_record.get("team_name"),
            "team": None if isinstance(raw_team, dict) else raw_team,
        }
        team_name = _first_non_empty(flat_team_fields, "team_name", "team") or _first_non_empty(team_obj, "name")

        normalized = {
            "competition_external_id": self.competition_external_id,
            "competition_name": self.competition_name,
            "season": self.season_label,
            "team_external_id": team_external_id,
            "team_name": team_name,
            "player_external_id": player_external_id,
            "full_name": full_name,
            "first_name": first_name,
            "last_name": last_name,
            "birth_date": _first_non_empty(source_record, "birth_date") or _first_non_empty(
                player_obj, "birth_date", "dob"
            ),
            "nationality": _first_non_empty(source_record, "nationality")
            or _first_non_empty(player_obj, "nationality", "country"),
            "height_cm": _first_non_empty(source_record, "height_cm") or _first_non_empty(player_obj, "height_cm"),
            "weight_kg": _first_non_empty(source_record, "weight_kg") or _first_non_empty(player_obj, "weight_kg"),
            "position": _first_non_empty(source_record, "position") or _first_non_empty(player_obj, "position"),
            "role": _first_non_empty(source_record, "role"),
            "games_played": _first_non_empty(source_record, "games_played", "gp"),
            "minutes_per_game": _first_non_empty(source_record, "minutes_per_game", "mpg"),
            "points_per_game": _first_non_empty(source_record, "points_per_game", "ppg"),
            "rebounds_per_game": _first_non_empty(source_record, "rebounds_per_game", "rpg"),
            "assists_per_game": _first_non_empty(source_record, "assists_per_game", "apg"),
            "steals_per_game": _first_non_empty(source_record, "steals_per_game", "spg"),
            "blocks_per_game": _first_non_empty(source_record, "blocks_per_game", "bpg"),
            "turnovers_per_game": _first_non_empty(source_record, "turnovers_per_game", "tov"),
            "fg_pct": _first_non_empty(source_record, "fg_pct", "fg_percentage"),
            "three_pt_pct": _first_non_empty(
                source_record, "three_pt_pct", "three_point_pct", "3p_pct", "three_pct"
            ),
            "ft_pct": _first_non_empty(source_record, "ft_pct", "ft_percentage"),
        }

        # "role" is the only optional field; everything else must be present.
        missing = [key for key, value in normalized.items() if key != "role" and value in (None, "")]
        if missing:
            raise ExtractorNormalizationError(f"lba row missing required fields: {', '.join(sorted(missing))}")

        # Identifier-like fields are always emitted as stripped strings, even
        # when the source provides them as numbers.
        normalized["team_external_id"] = str(normalized["team_external_id"]).strip()
        normalized["player_external_id"] = str(normalized["player_external_id"]).strip()
        normalized["competition_external_id"] = str(normalized["competition_external_id"]).strip()
        normalized["season"] = str(normalized["season"]).strip()

        if self.include_raw_payload:
            normalized["raw_payload"] = source_record

        return normalized
class Command(BaseCommand):
    """Management command that runs only the ``lba`` extractor."""

    help = "Run only the LBA extractor and emit an import-ready snapshot JSON."

    def add_arguments(self, parser):
        """Register the output, date, dry-run and indent options."""
        parser.add_argument(
            "--output-path",
            dest="output_path",
            default=None,
            help="Directory or .json path to write output (default incoming dir).",
        )
        parser.add_argument(
            "--snapshot-date",
            dest="snapshot_date",
            default=None,
            help="Override snapshot date in YYYY-MM-DD format.",
        )
        parser.add_argument(
            "--dry-run",
            action="store_true",
            help="Validate without writing output.",
        )
        parser.add_argument(
            "--indent",
            type=int,
            default=2,
            help="JSON indent level for emitted file.",
        )

    def handle(self, *args, **options):
        """Run the extractor and print a one-line summary.

        Raises:
            CommandError: If ``--snapshot-date`` is not a valid YYYY-MM-DD
                date, or if the extractor raises an ``ExtractorError``.
        """
        snapshot_date = None
        raw_snapshot_date = options["snapshot_date"]
        if raw_snapshot_date:
            try:
                snapshot_date = parse_date(raw_snapshot_date)
            except ValueError as exc:
                # parse_date returns None for malformed input but raises for
                # well-formed, impossible dates such as 2026-02-30.
                raise CommandError("--snapshot-date must be YYYY-MM-DD.") from exc
            if snapshot_date is None:
                raise CommandError("--snapshot-date must be YYYY-MM-DD.")

        try:
            extractor = create_extractor("lba")
            result = extractor.run(
                output_path=options["output_path"],
                snapshot_date=snapshot_date,
                write_output=not options["dry_run"],
                indent=options["indent"],
            )
        except ExtractorError as exc:
            raise CommandError(str(exc)) from exc

        # A dry run has no output path, so an empty string is reported instead.
        output = str(result.output_path) if result.output_path else ""
        self.stdout.write(
            self.style.SUCCESS(
                f"LBA extractor completed: source={result.source_name} "
                f"date={result.snapshot_date} records={result.records_count} output={output}"
            )
        )
# Every setting the LBA extractor validates in __init__. All tests set the
# full group so none of them depends on ambient environment defaults.
_LBA_TEST_SETTINGS = {
    "EXTRACTOR_LBA_STATS_URL": "https://www.legabasket.it/public/stats.json",
    "EXTRACTOR_LBA_SEASON_LABEL": "2025-2026",
    "EXTRACTOR_LBA_COMPETITION_EXTERNAL_ID": "lba-serie-a",
    "EXTRACTOR_LBA_COMPETITION_NAME": "Lega Basket Serie A",
}


def _load_fixture(path: str) -> dict:
    """Load a JSON fixture located under ``tests/fixtures/``."""
    fixture_path = Path(__file__).parent / "fixtures" / path
    return json.loads(fixture_path.read_text(encoding="utf-8"))


def _configure_lba_settings(settings) -> None:
    """Apply the complete LBA configuration to the ``settings`` fixture."""
    for name, value in _LBA_TEST_SETTINGS.items():
        setattr(settings, name, value)


class _FixtureClient:
    """HTTP client stand-in that serves a fixed payload (no network access)."""

    def __init__(self, payload):
        self._payload = payload

    def get_json(self, *_args, **_kwargs):
        return self._payload


@pytest.mark.django_db
def test_lba_extractor_normalizes_fixture_payload(tmp_path, settings):
    """The fixture payload is normalized and written as an import-ready snapshot."""
    _configure_lba_settings(settings)
    fixture_payload = _load_fixture("lba/lba_players_stats.json")

    extractor = LBASnapshotExtractor(http_client=_FixtureClient(fixture_payload))
    output_path = tmp_path / "lba.json"
    result = extractor.run(output_path=output_path, snapshot_date=date(2026, 3, 13))

    assert result.extractor_name == "lba"
    assert result.source_name == "lba"
    assert result.records_count == 1

    payload = json.loads(output_path.read_text(encoding="utf-8"))
    assert payload["source_name"] == "lba"
    assert payload["snapshot_date"] == "2026-03-13"
    row = payload["records"][0]
    assert row["competition_external_id"] == "lba-serie-a"
    assert row["competition_name"] == "Lega Basket Serie A"
    assert row["team_external_id"] == "team-virtus-bologna"
    assert row["team_name"] == "Virtus Bologna"
    assert row["player_external_id"] == "p-001"
    assert row["full_name"] == "Marco Rossi"
    assert row["minutes_per_game"] == 28.3
    assert row["three_pt_pct"] == 36.5


@pytest.mark.django_db
def test_lba_extractor_registry_selection(settings):
    """The registry resolves the ``lba`` name to the LBA extractor class."""
    _configure_lba_settings(settings)
    extractor = create_extractor("lba")
    assert isinstance(extractor, LBASnapshotExtractor)


@pytest.mark.django_db
def test_run_lba_extractor_command_writes_snapshot(tmp_path, settings, monkeypatch):
    """The management command writes ``lba-<date>.json`` into the output directory."""
    _configure_lba_settings(settings)
    fixture_payload = _load_fixture("lba/lba_players_stats.json")

    # The command builds its own extractor, so the client class is swapped at
    # the point where the extractor module looks it up.
    monkeypatch.setattr(
        "apps.ingestion.extractors.lba.ResponsibleHttpClient",
        lambda **_kwargs: _FixtureClient(fixture_payload),
    )

    call_command(
        "run_lba_extractor",
        "--output-path",
        str(tmp_path),
        "--snapshot-date",
        "2026-03-13",
    )

    files = list(tmp_path.glob("lba-2026-03-13.json"))
    assert len(files) == 1
    payload = json.loads(files[0].read_text(encoding="utf-8"))
    assert payload["source_name"] == "lba"
    assert len(payload["records"]) == 1