diff --git a/.env.example b/.env.example index cbe118b..86ad4fc 100644 --- a/.env.example +++ b/.env.example @@ -49,6 +49,10 @@ EXTRACTOR_LBA_STATS_URL= EXTRACTOR_LBA_SEASON_LABEL=2025-2026 EXTRACTOR_LBA_COMPETITION_EXTERNAL_ID=lba-serie-a EXTRACTOR_LBA_COMPETITION_NAME=Lega Basket Serie A +EXTRACTOR_BCL_STATS_URL= +EXTRACTOR_BCL_SEASON_LABEL=2025-2026 +EXTRACTOR_BCL_COMPETITION_EXTERNAL_ID=bcl +EXTRACTOR_BCL_COMPETITION_NAME=Basketball Champions League # Future optional scheduler loop settings (not enabled in base v2 runtime) SCHEDULER_ENABLED=0 diff --git a/README.md b/README.md index e1814a6..6235ebf 100644 --- a/README.md +++ b/README.md @@ -185,6 +185,7 @@ Extractor pipeline: Built-in extractor in this phase: - `public_json_snapshot` (generic JSON feed extractor for MVP usage) - `lba` (Lega Basket Serie A MVP extractor) +- `bcl` (Basketball Champions League MVP extractor) Run extractor: @@ -210,6 +211,12 @@ Run only the LBA extractor: docker compose exec web python manage.py run_lba_extractor ``` +Run only the BCL extractor: + +```bash +docker compose exec web python manage.py run_bcl_extractor +``` + Extractor environment variables: - `EXTRACTOR_USER_AGENT` - `EXTRACTOR_HTTP_TIMEOUT_SECONDS` @@ -223,6 +230,10 @@ Extractor environment variables: - `EXTRACTOR_LBA_SEASON_LABEL` - `EXTRACTOR_LBA_COMPETITION_EXTERNAL_ID` - `EXTRACTOR_LBA_COMPETITION_NAME` +- `EXTRACTOR_BCL_STATS_URL` +- `EXTRACTOR_BCL_SEASON_LABEL` +- `EXTRACTOR_BCL_COMPETITION_EXTERNAL_ID` +- `EXTRACTOR_BCL_COMPETITION_NAME` Notes: - extraction is intentionally low-frequency and uses retries conservatively @@ -239,7 +250,18 @@ Notes: - season is configured by `EXTRACTOR_LBA_SEASON_LABEL` - parser supports payload keys: `records`, `data`, `players`, `items` - normalization supports nested `player` and `team` objects with common stat aliases (`gp/mpg/ppg/rpg/apg/spg/bpg/tov`) -- no BCL support in this task +- no live HTTP calls in tests; tests use fixtures/mocked responses only + +### BCL extractor 
assumptions and limitations (MVP)
+
+- `source_name` is fixed to `bcl`
+- the extractor expects one stable public JSON payload that includes player/team/stat rows
+- competition is configured by environment and emitted as:
+  - `competition_external_id` from `EXTRACTOR_BCL_COMPETITION_EXTERNAL_ID`
+  - `competition_name` from `EXTRACTOR_BCL_COMPETITION_NAME`
+- season is configured by `EXTRACTOR_BCL_SEASON_LABEL`
+- parser supports payload keys: `records`, `data`, `players`, `items`
+- normalization supports nested `player` and `team` objects with common stat aliases (`gp/mpg/ppg/rpg/apg/spg/bpg/tov`)
 - no live HTTP calls in tests; tests use fixtures/mocked responses only
 
 ## Migration and Superuser Commands
diff --git a/apps/ingestion/extractors/__init__.py b/apps/ingestion/extractors/__init__.py
index b58d36a..edf2a94 100644
--- a/apps/ingestion/extractors/__init__.py
+++ b/apps/ingestion/extractors/__init__.py
@@ -1,3 +1,4 @@
+from .bcl import BCLSnapshotExtractor
 from .base import (
     BaseSnapshotExtractor,
     ExtractionResult,
@@ -12,6 +13,7 @@ from .registry import available_extractors, create_extractor
 
 __all__ = [
     "BaseSnapshotExtractor",
+    "BCLSnapshotExtractor",
     "LBASnapshotExtractor",
     "ExtractionResult",
     "ExtractorError",
diff --git a/apps/ingestion/extractors/bcl.py b/apps/ingestion/extractors/bcl.py
new file mode 100644
index 0000000..442d291
--- /dev/null
+++ b/apps/ingestion/extractors/bcl.py
@@ -0,0 +1,152 @@
+from __future__ import annotations
+
+from typing import Any
+
+from django.conf import settings
+
+from .base import BaseSnapshotExtractor, ExtractorConfigError, ExtractorNormalizationError, ExtractorParseError
+from .http import ResponsibleHttpClient
+
+
+def _first_non_empty(record: dict[str, Any], *keys: str) -> Any:
+    """Return the first value stored under ``keys`` that is neither None nor an empty string."""
+    for key in keys:
+        value = record.get(key)
+        if value not in (None, ""):
+            return value
+    return None
+
+
+class BCLSnapshotExtractor(BaseSnapshotExtractor):
+    """
+    Basketball Champions League MVP extractor.
+
+    Scope is intentionally conservative:
+    - one configured public stats endpoint
+    - one configured season label
+    - normalized player-season rows only
+    """
+
+    extractor_name = "bcl"
+    source_name = "bcl"
+
+    def __init__(self, *, http_client: ResponsibleHttpClient | None = None):
+        """Load BCL settings; raise ExtractorConfigError if a required one is blank."""
+        self.url = settings.EXTRACTOR_BCL_STATS_URL.strip()
+        self.season_label = settings.EXTRACTOR_BCL_SEASON_LABEL.strip()
+        self.competition_external_id = settings.EXTRACTOR_BCL_COMPETITION_EXTERNAL_ID.strip()
+        self.competition_name = settings.EXTRACTOR_BCL_COMPETITION_NAME.strip()
+        self.include_raw_payload = settings.EXTRACTOR_INCLUDE_RAW_PAYLOAD
+        self.http_client = http_client or ResponsibleHttpClient(
+            user_agent=settings.EXTRACTOR_USER_AGENT,
+            timeout_seconds=settings.EXTRACTOR_HTTP_TIMEOUT_SECONDS,
+            retries=settings.EXTRACTOR_HTTP_RETRIES,
+            retry_sleep_seconds=settings.EXTRACTOR_RETRY_SLEEP_SECONDS,
+            request_delay_seconds=settings.EXTRACTOR_REQUEST_DELAY_SECONDS,
+        )
+
+        if not self.url:
+            raise ExtractorConfigError("EXTRACTOR_BCL_STATS_URL is required for bcl extractor.")
+        if not self.season_label:
+            raise ExtractorConfigError("EXTRACTOR_BCL_SEASON_LABEL is required for bcl extractor.")
+        if not self.competition_external_id:
+            raise ExtractorConfigError("EXTRACTOR_BCL_COMPETITION_EXTERNAL_ID is required.")
+        if not self.competition_name:
+            raise ExtractorConfigError("EXTRACTOR_BCL_COMPETITION_NAME is required.")
+
+    def fetch(self) -> Any:
+        """Request the configured stats URL through the HTTP client and return its JSON."""
+        return self.http_client.get_json(self.url)
+
+    def parse(self, payload: Any) -> list[dict[str, Any]]:
+        """Return the row list: the payload itself, or the first list under a known key."""
+        if isinstance(payload, list):
+            return payload
+        if not isinstance(payload, dict):
+            raise ExtractorParseError("BCL payload must be a JSON object or array.")
+
+        for key in ("records", "data", "players", "items"):
+            rows = payload.get(key)
+            if isinstance(rows, list):
+                return rows
+
+        raise ExtractorParseError("BCL payload must contain one of: records, data, players, items.")
+
+    def normalize_record(self, source_record: dict[str, Any]) -> dict[str, Any]:
+        """Map one source row onto the flat snapshot schema.
+
+        Raises ExtractorNormalizationError if any field other than ``role`` is empty.
+        """
+        player_obj = source_record.get("player") if isinstance(source_record.get("player"), dict) else {}
+        team_obj = source_record.get("team") if isinstance(source_record.get("team"), dict) else {}
+
+        full_name = _first_non_empty(
+            source_record,
+            "full_name",
+            "player_name",
+            "name",
+        ) or _first_non_empty(player_obj, "full_name", "name")
+        first_name = _first_non_empty(source_record, "first_name") or _first_non_empty(player_obj, "first_name")
+        last_name = _first_non_empty(source_record, "last_name") or _first_non_empty(player_obj, "last_name")
+        player_external_id = _first_non_empty(
+            source_record, "player_external_id", "player_id", "athlete_id"
+        ) or _first_non_empty(player_obj, "id", "player_id")
+        team_external_id = _first_non_empty(source_record, "team_external_id", "team_id") or _first_non_empty(
+            team_obj, "id", "team_id"
+        )
+        # "team" holds either a flat name string or the nested team object; only a
+        # string is a usable name, otherwise the whole object would leak into team_name.
+        flat_team = source_record.get("team")
+        team_name = (
+            _first_non_empty(source_record, "team_name")
+            or (flat_team if isinstance(flat_team, str) else None)
+            or _first_non_empty(team_obj, "name")
+        )
+
+        normalized = {
+            "competition_external_id": self.competition_external_id,
+            "competition_name": self.competition_name,
+            "season": self.season_label,
+            "team_external_id": team_external_id,
+            "team_name": team_name,
+            "player_external_id": player_external_id,
+            "full_name": full_name,
+            "first_name": first_name,
+            "last_name": last_name,
+            "birth_date": _first_non_empty(source_record, "birth_date") or _first_non_empty(
+                player_obj, "birth_date", "dob"
+            ),
+            "nationality": _first_non_empty(source_record, "nationality")
+            or _first_non_empty(player_obj, "nationality", "country"),
+            "height_cm": _first_non_empty(source_record, "height_cm") or _first_non_empty(player_obj, "height_cm"),
+            "weight_kg": _first_non_empty(source_record, "weight_kg") or _first_non_empty(player_obj, "weight_kg"),
+            "position": _first_non_empty(source_record, "position") or _first_non_empty(player_obj, "position"),
+            "role": _first_non_empty(source_record, "role"),
+            "games_played": _first_non_empty(source_record, "games_played", "gp"),
+            "minutes_per_game": _first_non_empty(source_record, "minutes_per_game", "mpg"),
+            "points_per_game": _first_non_empty(source_record, "points_per_game", "ppg"),
+            "rebounds_per_game": _first_non_empty(source_record, "rebounds_per_game", "rpg"),
+            "assists_per_game": _first_non_empty(source_record, "assists_per_game", "apg"),
+            "steals_per_game": _first_non_empty(source_record, "steals_per_game", "spg"),
+            "blocks_per_game": _first_non_empty(source_record, "blocks_per_game", "bpg"),
+            "turnovers_per_game": _first_non_empty(source_record, "turnovers_per_game", "tov"),
+            "fg_pct": _first_non_empty(source_record, "fg_pct", "fg_percentage"),
+            "three_pt_pct": _first_non_empty(
+                source_record, "three_pt_pct", "three_point_pct", "3p_pct", "three_pct"
+            ),
+            "ft_pct": _first_non_empty(source_record, "ft_pct", "ft_percentage"),
+        }
+
+        missing = [key for key, value in normalized.items() if key != "role" and value in (None, "")]
+        if missing:
+            raise ExtractorNormalizationError(f"bcl row missing required fields: {', '.join(sorted(missing))}")
+
+        normalized["team_external_id"] = str(normalized["team_external_id"]).strip()
+        normalized["player_external_id"] = str(normalized["player_external_id"]).strip()
+        normalized["competition_external_id"] = str(normalized["competition_external_id"]).strip()
+        normalized["season"] = str(normalized["season"]).strip()
+
+        if self.include_raw_payload:
+            normalized["raw_payload"] = source_record
+
+        return normalized
diff --git a/apps/ingestion/extractors/registry.py b/apps/ingestion/extractors/registry.py
index 19e64aa..6c1ad18 100644
--- a/apps/ingestion/extractors/registry.py
+++ b/apps/ingestion/extractors/registry.py
@@ -1,10 +1,12 @@
 from __future__ import annotations
 
+from .bcl import BCLSnapshotExtractor
 from .base import BaseSnapshotExtractor, ExtractorConfigError
 from .lba import LBASnapshotExtractor
 from .public_json import PublicJsonSnapshotExtractor
 
 EXTRACTOR_REGISTRY: dict[str, 
type[BaseSnapshotExtractor]] = {
+    BCLSnapshotExtractor.extractor_name: BCLSnapshotExtractor,
     LBASnapshotExtractor.extractor_name: LBASnapshotExtractor,
     PublicJsonSnapshotExtractor.extractor_name: PublicJsonSnapshotExtractor,
 }
diff --git a/apps/ingestion/management/commands/run_bcl_extractor.py b/apps/ingestion/management/commands/run_bcl_extractor.py
new file mode 100644
index 0000000..d5af1c3
--- /dev/null
+++ b/apps/ingestion/management/commands/run_bcl_extractor.py
@@ -0,0 +1,67 @@
+from __future__ import annotations
+
+from django.core.management.base import BaseCommand, CommandError
+from django.utils.dateparse import parse_date
+
+from apps.ingestion.extractors import ExtractorError, create_extractor
+
+
+class Command(BaseCommand):
+    help = "Run only the BCL extractor and emit an import-ready snapshot JSON."
+
+    def add_arguments(self, parser):
+        """Register the --output-path, --snapshot-date, --dry-run and --indent options."""
+        parser.add_argument(
+            "--output-path",
+            dest="output_path",
+            default=None,
+            help="Directory or .json path to write output (default incoming dir).",
+        )
+        parser.add_argument(
+            "--snapshot-date",
+            dest="snapshot_date",
+            default=None,
+            help="Override snapshot date in YYYY-MM-DD format.",
+        )
+        parser.add_argument(
+            "--dry-run",
+            action="store_true",
+            help="Validate without writing output.",
+        )
+        parser.add_argument(
+            "--indent",
+            type=int,
+            default=2,
+            help="JSON indent level for emitted file.",
+        )
+
+    def handle(self, *args, **options):
+        """Run the bcl extractor and print a summary; date and extractor errors become CommandError."""
+        snapshot_date = None
+        if options["snapshot_date"]:
+            try:
+                snapshot_date = parse_date(options["snapshot_date"])
+            except ValueError:
+                # parse_date raises for well-formed but impossible dates (e.g. 2026-02-30).
+                snapshot_date = None
+            if snapshot_date is None:
+                raise CommandError("--snapshot-date must be YYYY-MM-DD.")
+
+        try:
+            extractor = create_extractor("bcl")
+            result = extractor.run(
+                output_path=options["output_path"],
+                snapshot_date=snapshot_date,
+                write_output=not options["dry_run"],
+                indent=options["indent"],
+            )
+        except ExtractorError as exc:
+            raise CommandError(str(exc)) from exc
+
+        output = str(result.output_path) if result.output_path else ""
+        self.stdout.write(
+            self.style.SUCCESS(
+                f"BCL extractor completed: source={result.source_name} "
+                f"date={result.snapshot_date} records={result.records_count} output={output}"
+            )
+        )
diff --git a/config/settings/base.py b/config/settings/base.py
index 1f9b410..94b15d4 100644
--- a/config/settings/base.py
+++ b/config/settings/base.py
@@ -169,6 +169,10 @@ EXTRACTOR_LBA_STATS_URL = os.getenv("EXTRACTOR_LBA_STATS_URL", "").strip()
 EXTRACTOR_LBA_SEASON_LABEL = os.getenv("EXTRACTOR_LBA_SEASON_LABEL", "").strip()
 EXTRACTOR_LBA_COMPETITION_EXTERNAL_ID = os.getenv("EXTRACTOR_LBA_COMPETITION_EXTERNAL_ID", "lba-serie-a").strip()
 EXTRACTOR_LBA_COMPETITION_NAME = os.getenv("EXTRACTOR_LBA_COMPETITION_NAME", "Lega Basket Serie A").strip()
+EXTRACTOR_BCL_STATS_URL = os.getenv("EXTRACTOR_BCL_STATS_URL", "").strip()
+EXTRACTOR_BCL_SEASON_LABEL = os.getenv("EXTRACTOR_BCL_SEASON_LABEL", "").strip()
+EXTRACTOR_BCL_COMPETITION_EXTERNAL_ID = os.getenv("EXTRACTOR_BCL_COMPETITION_EXTERNAL_ID", "bcl").strip()
+EXTRACTOR_BCL_COMPETITION_NAME = os.getenv("EXTRACTOR_BCL_COMPETITION_NAME", "Basketball Champions League").strip()
 
 if EXTRACTOR_HTTP_TIMEOUT_SECONDS <= 0:
     raise ImproperlyConfigured("EXTRACTOR_HTTP_TIMEOUT_SECONDS must be > 0.")
diff --git a/tests/fixtures/bcl/bcl_players_stats.json b/tests/fixtures/bcl/bcl_players_stats.json
new file mode 100644
index 0000000..d281f1e
--- /dev/null
+++ b/tests/fixtures/bcl/bcl_players_stats.json
@@ -0,0 +1,32 @@
+{
+  "data": [
+    {
+      "player": {
+        "id": "bcl-player-42",
+        "name": "John Carter",
+        "first_name": "John",
+        "last_name": "Carter",
+        "birth_date": "1999-07-14",
+        "nationality": "US",
+        "height_cm": 198,
+        "weight_kg": 95,
+        "position": "SF"
+      },
+      "team": {
+        "id": "bcl-team-murcia",
+        "name": "UCAM Murcia"
+      },
+      "gp": 12,
+      "mpg": 29.1,
+      "ppg": 16.4,
+      "rpg": 5.8,
+      "apg": 2.7,
+      "spg": 1.5,
+      "bpg": 0.6,
+      "tov": 2.3,
+      "fg_pct": 48.1,
+      "three_pct": 37.2,
+      "ft_pct": 81.4
+    }
+  ]
+}
diff --git a/tests/test_bcl_extractor.py 
b/tests/test_bcl_extractor.py
new file mode 100644
index 0000000..5130d35
--- /dev/null
+++ b/tests/test_bcl_extractor.py
@@ -0,0 +1,96 @@
+from __future__ import annotations
+
+import json
+from datetime import date
+from pathlib import Path
+
+import pytest
+from django.core.management import call_command
+
+from apps.ingestion.extractors.bcl import BCLSnapshotExtractor
+from apps.ingestion.extractors.registry import create_extractor
+
+
+def _load_fixture(path: str) -> dict:
+    """Load and decode the JSON fixture at ``path`` relative to the tests' fixtures directory."""
+    fixture_path = Path(__file__).parent / "fixtures" / path
+    return json.loads(fixture_path.read_text(encoding="utf-8"))
+
+
+@pytest.mark.django_db
+def test_bcl_extractor_normalizes_fixture_payload(tmp_path, settings):
+    """A fixture row with nested player/team objects is normalized and written to the snapshot file."""
+    settings.EXTRACTOR_BCL_STATS_URL = "https://www.championsleague.basketball/public/stats.json"
+    settings.EXTRACTOR_BCL_SEASON_LABEL = "2025-2026"
+    settings.EXTRACTOR_BCL_COMPETITION_EXTERNAL_ID = "bcl"
+    settings.EXTRACTOR_BCL_COMPETITION_NAME = "Basketball Champions League"
+
+    fixture_payload = _load_fixture("bcl/bcl_players_stats.json")
+
+    class FakeClient:
+        def get_json(self, *_args, **_kwargs):
+            return fixture_payload
+
+    extractor = BCLSnapshotExtractor(http_client=FakeClient())
+    output_path = tmp_path / "bcl.json"
+    result = extractor.run(output_path=output_path, snapshot_date=date(2026, 3, 13))
+
+    assert result.extractor_name == "bcl"
+    assert result.source_name == "bcl"
+    assert result.records_count == 1
+
+    payload = json.loads(output_path.read_text(encoding="utf-8"))
+    assert payload["source_name"] == "bcl"
+    assert payload["snapshot_date"] == "2026-03-13"
+    row = payload["records"][0]
+    assert row["competition_external_id"] == "bcl"
+    assert row["competition_name"] == "Basketball Champions League"
+    assert row["team_external_id"] == "bcl-team-murcia"
+    assert row["team_name"] == "UCAM Murcia"
+    assert row["player_external_id"] == "bcl-player-42"
+    assert row["full_name"] == "John Carter"
+    assert row["minutes_per_game"] == 29.1
+    assert row["three_pt_pct"] == 37.2
+
+
+@pytest.mark.django_db
+def test_bcl_extractor_registry_selection(settings):
+    """The registry resolves the "bcl" name to a BCLSnapshotExtractor instance."""
+    settings.EXTRACTOR_BCL_STATS_URL = "https://www.championsleague.basketball/public/stats.json"
+    settings.EXTRACTOR_BCL_SEASON_LABEL = "2025-2026"
+    extractor = create_extractor("bcl")
+    assert isinstance(extractor, BCLSnapshotExtractor)
+
+
+@pytest.mark.django_db
+def test_run_bcl_extractor_command_writes_snapshot(tmp_path, settings, monkeypatch):
+    """The management command, with the HTTP client patched out, writes bcl-<date>.json to the output dir."""
+    settings.EXTRACTOR_BCL_STATS_URL = "https://www.championsleague.basketball/public/stats.json"
+    settings.EXTRACTOR_BCL_SEASON_LABEL = "2025-2026"
+    settings.EXTRACTOR_BCL_COMPETITION_EXTERNAL_ID = "bcl"
+    settings.EXTRACTOR_BCL_COMPETITION_NAME = "Basketball Champions League"
+
+    fixture_payload = _load_fixture("bcl/bcl_players_stats.json")
+
+    class FakeClient:
+        def get_json(self, *_args, **_kwargs):
+            return fixture_payload
+
+    monkeypatch.setattr(
+        "apps.ingestion.extractors.bcl.ResponsibleHttpClient",
+        lambda **_kwargs: FakeClient(),
+    )
+
+    call_command(
+        "run_bcl_extractor",
+        "--output-path",
+        str(tmp_path),
+        "--snapshot-date",
+        "2026-03-13",
+    )
+
+    files = list(tmp_path.glob("bcl-2026-03-13.json"))
+    assert len(files) == 1
+    payload = json.loads(files[0].read_text(encoding="utf-8"))
+    assert payload["source_name"] == "bcl"
+    assert len(payload["records"]) == 1