From 850e4de71b429ed210d3c54f6790ce75494f0717 Mon Sep 17 00:00:00 2001 From: Alfredo Di Stasio Date: Fri, 13 Mar 2026 14:24:54 +0100 Subject: [PATCH] feat(v2): add snapshot extractor framework and run command --- .env.example | 10 + README.md | 49 ++++ apps/ingestion/extractors/__init__.py | 22 ++ apps/ingestion/extractors/base.py | 150 ++++++++++++ apps/ingestion/extractors/http.py | 109 +++++++++ apps/ingestion/extractors/public_json.py | 130 ++++++++++ apps/ingestion/extractors/registry.py | 22 ++ .../management/commands/run_extractor.py | 63 +++++ config/settings/base.py | 19 ++ tests/test_extractors_framework.py | 222 ++++++++++++++++++ 10 files changed, 796 insertions(+) create mode 100644 apps/ingestion/extractors/__init__.py create mode 100644 apps/ingestion/extractors/base.py create mode 100644 apps/ingestion/extractors/http.py create mode 100644 apps/ingestion/extractors/public_json.py create mode 100644 apps/ingestion/extractors/registry.py create mode 100644 apps/ingestion/management/commands/run_extractor.py create mode 100644 tests/test_extractors_framework.py diff --git a/.env.example b/.env.example index d9a4cec..be47342 100644 --- a/.env.example +++ b/.env.example @@ -36,6 +36,16 @@ STATIC_DATASET_INCOMING_DIR=/app/snapshots/incoming STATIC_DATASET_ARCHIVE_DIR=/app/snapshots/archive STATIC_DATASET_FAILED_DIR=/app/snapshots/failed +# Extractor framework (fetch -> parse -> normalize -> emit snapshot) +EXTRACTOR_USER_AGENT=HoopScoutBot/2.0 (+https://younerd.org) +EXTRACTOR_HTTP_TIMEOUT_SECONDS=15 +EXTRACTOR_HTTP_RETRIES=2 +EXTRACTOR_RETRY_SLEEP_SECONDS=1.0 +EXTRACTOR_REQUEST_DELAY_SECONDS=0.5 +EXTRACTOR_PUBLIC_JSON_URL= +EXTRACTOR_PUBLIC_SOURCE_NAME=public_json_source +EXTRACTOR_INCLUDE_RAW_PAYLOAD=0 + # Future optional scheduler loop settings (not enabled in base v2 runtime) SCHEDULER_ENABLED=0 SCHEDULER_INTERVAL_SECONDS=900 diff --git a/README.md b/README.md index d8d0fec..7e6e1bb 100644 --- a/README.md +++ b/README.md @@ -169,6 +169,55 @@ Import 
history is visible in Django admin: - `ImportRun` - `ImportFile` +## Extractor Framework (v2) + +v2 keeps extraction and import as two separate steps: + +1. **Extractors** fetch public source content and emit normalized JSON snapshots. +2. **Importer** (`import_snapshots`) validates and upserts those snapshots into PostgreSQL. + +Extractor pipeline: +- `fetch` (public endpoint/page requests with conservative HTTP behavior) +- `parse` (source-specific structure) +- `normalize` (map to HoopScout snapshot schema) +- `emit` (write JSON file to incoming directory or custom path) + +Built-in extractor in this phase: +- `public_json_snapshot` (generic JSON feed extractor for MVP usage) + +Run extractor: + +```bash +docker compose exec web python manage.py run_extractor public_json_snapshot +``` + +Run extractor with explicit output path (debugging): + +```bash +docker compose exec web python manage.py run_extractor public_json_snapshot --output-path /app/snapshots/incoming +``` + +Dry-run validation (no file write): + +```bash +docker compose exec web python manage.py run_extractor public_json_snapshot --dry-run +``` + +Extractor environment variables: +- `EXTRACTOR_USER_AGENT` +- `EXTRACTOR_HTTP_TIMEOUT_SECONDS` +- `EXTRACTOR_HTTP_RETRIES` +- `EXTRACTOR_RETRY_SLEEP_SECONDS` +- `EXTRACTOR_REQUEST_DELAY_SECONDS` +- `EXTRACTOR_PUBLIC_JSON_URL` +- `EXTRACTOR_PUBLIC_SOURCE_NAME` +- `EXTRACTOR_INCLUDE_RAW_PAYLOAD` + +Notes: +- extraction is intentionally low-frequency and uses retries conservatively +- only public pages/endpoints should be targeted +- emitted snapshots must match the same schema consumed by `import_snapshots` + ## Migration and Superuser Commands ```bash diff --git a/apps/ingestion/extractors/__init__.py b/apps/ingestion/extractors/__init__.py new file mode 100644 index 0000000..d459cd8 --- /dev/null +++ b/apps/ingestion/extractors/__init__.py @@ -0,0 +1,22 @@ +from .base import ( + BaseSnapshotExtractor, + ExtractionResult, + ExtractorConfigError, + 
ExtractorError, + ExtractorFetchError, + ExtractorNormalizationError, + ExtractorParseError, +) +from .registry import available_extractors, create_extractor + +__all__ = [ + "BaseSnapshotExtractor", + "ExtractionResult", + "ExtractorError", + "ExtractorConfigError", + "ExtractorFetchError", + "ExtractorParseError", + "ExtractorNormalizationError", + "available_extractors", + "create_extractor", +] diff --git a/apps/ingestion/extractors/base.py b/apps/ingestion/extractors/base.py new file mode 100644 index 0000000..96c6d84 --- /dev/null +++ b/apps/ingestion/extractors/base.py @@ -0,0 +1,150 @@ +from __future__ import annotations + +import json +import logging +from abc import ABC, abstractmethod +from dataclasses import dataclass +from datetime import date +from pathlib import Path +from typing import Any + +from django.conf import settings + +from apps.ingestion.snapshots import SnapshotSchemaValidator + +logger = logging.getLogger(__name__) + + +class ExtractorError(RuntimeError): + """Base extractor exception.""" + + +class ExtractorConfigError(ExtractorError): + """Raised when extractor configuration is invalid.""" + + +class ExtractorFetchError(ExtractorError): + """Raised when remote/source fetch fails.""" + + +class ExtractorParseError(ExtractorError): + """Raised when fetched content cannot be parsed.""" + + +class ExtractorNormalizationError(ExtractorError): + """Raised when source rows cannot be normalized.""" + + +@dataclass +class ExtractionResult: + extractor_name: str + source_name: str + snapshot_date: date + records_count: int + output_path: Path | None + + +class BaseSnapshotExtractor(ABC): + extractor_name = "base" + source_name = "unknown_source" + + @abstractmethod + def fetch(self) -> Any: + """Fetch source payload from a source endpoint/resource.""" + + @abstractmethod + def parse(self, payload: Any) -> list[dict[str, Any]]: + """Parse fetched payload into source-specific record dictionaries.""" + + @abstractmethod + def normalize_record(self, 
source_record: dict[str, Any]) -> dict[str, Any]: + """Normalize a source record into HoopScout snapshot record shape.""" + + def resolve_snapshot_date(self) -> date: + return date.today() + + def normalize_records(self, source_records: list[dict[str, Any]]) -> list[dict[str, Any]]: + normalized: list[dict[str, Any]] = [] + for idx, row in enumerate(source_records): + if not isinstance(row, dict): + raise ExtractorNormalizationError(f"Parsed record at index {idx} must be an object.") + normalized.append(self.normalize_record(row)) + return normalized + + def build_snapshot(self, records: list[dict[str, Any]], snapshot_date: date) -> dict[str, Any]: + return { + "source_name": self.source_name, + "snapshot_date": snapshot_date.isoformat(), + "records": records, + } + + def default_output_dir(self) -> Path: + return Path(settings.STATIC_DATASET_INCOMING_DIR) + + def snapshot_filename(self, snapshot_date: date) -> str: + return f"{self.extractor_name}-{snapshot_date.isoformat()}.json" + + def emit_snapshot( + self, + snapshot: dict[str, Any], + *, + output_path: str | Path | None = None, + indent: int = 2, + ) -> Path: + if output_path is None: + destination = self.default_output_dir() + destination.mkdir(parents=True, exist_ok=True) + file_path = destination / self.snapshot_filename(date.fromisoformat(snapshot["snapshot_date"])) + else: + target = Path(output_path) + if target.suffix.lower() == ".json": + target.parent.mkdir(parents=True, exist_ok=True) + file_path = target + else: + target.mkdir(parents=True, exist_ok=True) + file_path = target / self.snapshot_filename(date.fromisoformat(snapshot["snapshot_date"])) + + file_path.write_text(json.dumps(snapshot, indent=indent, ensure_ascii=True), encoding="utf-8") + return file_path + + def run( + self, + *, + output_path: str | Path | None = None, + snapshot_date: date | None = None, + write_output: bool = True, + indent: int = 2, + ) -> ExtractionResult: + payload = self.fetch() + source_rows = self.parse(payload) + 
normalized_rows = self.normalize_records(source_rows) + resolved_snapshot_date = snapshot_date or self.resolve_snapshot_date() + snapshot = self.build_snapshot(normalized_rows, resolved_snapshot_date) + validated = SnapshotSchemaValidator.validate(snapshot) + snapshot["records"] = validated.records + + output_file: Path | None = None + if write_output: + output_file = self.emit_snapshot(snapshot, output_path=output_path, indent=indent) + logger.info( + "extractor_snapshot_written extractor=%s source=%s records=%s path=%s", + self.extractor_name, + validated.source_name, + len(validated.records), + output_file, + ) + else: + logger.info( + "extractor_snapshot_validated extractor=%s source=%s records=%s write_output=0", + self.extractor_name, + validated.source_name, + len(validated.records), + ) + + return ExtractionResult( + extractor_name=self.extractor_name, + source_name=validated.source_name, + snapshot_date=validated.snapshot_date, + records_count=len(validated.records), + output_path=output_file, + ) diff --git a/apps/ingestion/extractors/http.py b/apps/ingestion/extractors/http.py new file mode 100644 index 0000000..a25a677 --- /dev/null +++ b/apps/ingestion/extractors/http.py @@ -0,0 +1,109 @@ +from __future__ import annotations + +import logging +import time +from typing import Any + +import requests + +from .base import ExtractorFetchError + +logger = logging.getLogger(__name__) + + +class ResponsibleHttpClient: + """ + Small HTTP helper for public-source extraction: + - explicit User-Agent + - request timeout + - conservative retries + - low-frequency pacing (fixed delay between requests) + """ + + RETRYABLE_STATUS_CODES = {429, 500, 502, 503, 504} + + def __init__( + self, + *, + user_agent: str, + timeout_seconds: float, + retries: int, + retry_sleep_seconds: float, + request_delay_seconds: float, + session: requests.Session | None = None, + ): + self.user_agent = user_agent + self.timeout_seconds = timeout_seconds + self.retries = retries + 
self.retry_sleep_seconds = retry_sleep_seconds + self.request_delay_seconds = request_delay_seconds + self.session = session or requests.Session() + self._last_request_at: float | None = None + + def _pace(self) -> None: + if self.request_delay_seconds <= 0: + return + now = time.monotonic() + if self._last_request_at is None: + self._last_request_at = now + return + elapsed = now - self._last_request_at + remaining = self.request_delay_seconds - elapsed + if remaining > 0: + time.sleep(remaining) + self._last_request_at = time.monotonic() + + def get_json( + self, + url: str, + *, + params: dict[str, Any] | None = None, + headers: dict[str, str] | None = None, + ) -> Any: + merged_headers = {"User-Agent": self.user_agent} + if headers: + merged_headers.update(headers) + + attempts = self.retries + 1 + for attempt in range(1, attempts + 1): + try: + self._pace() + response = self.session.get( + url, + params=params, + headers=merged_headers, + timeout=self.timeout_seconds, + ) + if response.status_code in self.RETRYABLE_STATUS_CODES: + if attempt < attempts: + logger.warning( + "extractor_http_retryable_status status=%s url=%s attempt=%s/%s", + response.status_code, + url, + attempt, + attempts, + ) + time.sleep(self.retry_sleep_seconds) + continue + raise ExtractorFetchError( + f"Retryable status exhausted: status={response.status_code} url={url}" + ) + + response.raise_for_status() + return response.json() + except requests.RequestException as exc: + if attempt < attempts: + logger.warning( + "extractor_http_request_retry error=%s url=%s attempt=%s/%s", + exc, + url, + attempt, + attempts, + ) + time.sleep(self.retry_sleep_seconds) + continue + raise ExtractorFetchError(f"Request failed after retries: {exc}") from exc + except ValueError as exc: + raise ExtractorFetchError(f"Invalid JSON response from {url}: {exc}") from exc + + raise ExtractorFetchError(f"Unexpected retry loop exit for {url}") diff --git a/apps/ingestion/extractors/public_json.py 
b/apps/ingestion/extractors/public_json.py new file mode 100644 index 0000000..acb973d --- /dev/null +++ b/apps/ingestion/extractors/public_json.py @@ -0,0 +1,130 @@ +from __future__ import annotations + +from typing import Any + +from django.conf import settings + +from .base import ( + BaseSnapshotExtractor, + ExtractorConfigError, + ExtractorNormalizationError, + ExtractorParseError, +) +from .http import ResponsibleHttpClient + + +def _first_non_empty(record: dict[str, Any], *keys: str) -> Any: + for key in keys: + if key in record and record[key] not in (None, ""): + return record[key] + return None + + +class PublicJsonSnapshotExtractor(BaseSnapshotExtractor): + """ + Generic public JSON extractor for MVP v2. + + This extractor is intentionally generic and lightweight: + - fetch from one public JSON endpoint + - parse list-like payloads + - normalize into HoopScout snapshot schema + """ + + extractor_name = "public_json_snapshot" + + def __init__( + self, + *, + url: str | None = None, + source_name: str | None = None, + include_raw_payload: bool | None = None, + http_client: ResponsibleHttpClient | None = None, + ): + self.url = (url or settings.EXTRACTOR_PUBLIC_JSON_URL).strip() + self.source_name = (source_name or settings.EXTRACTOR_PUBLIC_SOURCE_NAME).strip() + self.include_raw_payload = ( + settings.EXTRACTOR_INCLUDE_RAW_PAYLOAD if include_raw_payload is None else include_raw_payload + ) + if not self.url: + raise ExtractorConfigError("EXTRACTOR_PUBLIC_JSON_URL is required for public_json_snapshot extractor.") + if not self.source_name: + raise ExtractorConfigError("EXTRACTOR_PUBLIC_SOURCE_NAME must not be empty.") + + self.http_client = http_client or ResponsibleHttpClient( + user_agent=settings.EXTRACTOR_USER_AGENT, + timeout_seconds=settings.EXTRACTOR_HTTP_TIMEOUT_SECONDS, + retries=settings.EXTRACTOR_HTTP_RETRIES, + retry_sleep_seconds=settings.EXTRACTOR_RETRY_SLEEP_SECONDS, + request_delay_seconds=settings.EXTRACTOR_REQUEST_DELAY_SECONDS, + ) + + 
def fetch(self) -> Any: + return self.http_client.get_json(self.url) + + def parse(self, payload: Any) -> list[dict[str, Any]]: + if isinstance(payload, list): + return payload + if not isinstance(payload, dict): + raise ExtractorParseError("Fetched payload must be a JSON object or array.") + + rows = payload.get("records") + if isinstance(rows, list): + return rows + + data_rows = payload.get("data") + if isinstance(data_rows, list): + return data_rows + + raise ExtractorParseError("Payload must contain 'records' or 'data' list.") + + def normalize_record(self, source_record: dict[str, Any]) -> dict[str, Any]: + normalized = { + "competition_external_id": _first_non_empty( + source_record, "competition_external_id", "competition_id", "league_id" + ), + "competition_name": _first_non_empty( + source_record, "competition_name", "competition", "league_name" + ), + "season": _first_non_empty(source_record, "season", "season_label", "season_name"), + "team_external_id": _first_non_empty(source_record, "team_external_id", "team_id"), + "team_name": _first_non_empty(source_record, "team_name", "team"), + "player_external_id": _first_non_empty(source_record, "player_external_id", "player_id"), + "full_name": _first_non_empty(source_record, "full_name", "player_name", "name"), + "first_name": _first_non_empty(source_record, "first_name"), + "last_name": _first_non_empty(source_record, "last_name"), + "birth_date": _first_non_empty(source_record, "birth_date"), + "nationality": _first_non_empty(source_record, "nationality", "nationality_code"), + "height_cm": _first_non_empty(source_record, "height_cm"), + "weight_kg": _first_non_empty(source_record, "weight_kg"), + "position": _first_non_empty(source_record, "position"), + "role": _first_non_empty(source_record, "role"), + "games_played": _first_non_empty(source_record, "games_played", "gp"), + "minutes_per_game": _first_non_empty(source_record, "minutes_per_game", "mpg"), + "points_per_game": 
_first_non_empty(source_record, "points_per_game", "ppg"), + "rebounds_per_game": _first_non_empty(source_record, "rebounds_per_game", "rpg"), + "assists_per_game": _first_non_empty(source_record, "assists_per_game", "apg"), + "steals_per_game": _first_non_empty(source_record, "steals_per_game", "spg"), + "blocks_per_game": _first_non_empty(source_record, "blocks_per_game", "bpg"), + "turnovers_per_game": _first_non_empty(source_record, "turnovers_per_game", "tov"), + "fg_pct": _first_non_empty(source_record, "fg_pct"), + "three_pt_pct": _first_non_empty( + source_record, "three_pt_pct", "three_point_pct", "three_pct", "3p_pct" + ), + "ft_pct": _first_non_empty(source_record, "ft_pct"), + } + + missing = [key for key, value in normalized.items() if key != "role" and value in (None, "")] + if missing: + raise ExtractorNormalizationError( + f"public_json_snapshot row missing required fields: {', '.join(sorted(missing))}" + ) + + normalized["season"] = str(normalized["season"]).strip() + normalized["competition_external_id"] = str(normalized["competition_external_id"]).strip() + normalized["team_external_id"] = str(normalized["team_external_id"]).strip() + normalized["player_external_id"] = str(normalized["player_external_id"]).strip() + + if self.include_raw_payload: + normalized["raw_payload"] = source_record + + return normalized diff --git a/apps/ingestion/extractors/registry.py b/apps/ingestion/extractors/registry.py new file mode 100644 index 0000000..bd960dc --- /dev/null +++ b/apps/ingestion/extractors/registry.py @@ -0,0 +1,22 @@ +from __future__ import annotations + +from .base import BaseSnapshotExtractor, ExtractorConfigError +from .public_json import PublicJsonSnapshotExtractor + +EXTRACTOR_REGISTRY: dict[str, type[BaseSnapshotExtractor]] = { + PublicJsonSnapshotExtractor.extractor_name: PublicJsonSnapshotExtractor, +} + + +def available_extractors() -> list[str]: + return sorted(EXTRACTOR_REGISTRY.keys()) + + +def create_extractor(extractor_name: str) -> 
from __future__ import annotations

from django.core.management.base import BaseCommand, CommandError
from django.utils.dateparse import parse_date

from apps.ingestion.extractors import ExtractorError, available_extractors, create_extractor


class Command(BaseCommand):
    """Run one registered snapshot extractor and report the emitted file."""

    help = "Run a snapshot extractor and emit importable JSON snapshots."

    def add_arguments(self, parser):
        # Restricting choices gives argparse-level validation and a helpful
        # error listing the registered extractors.
        parser.add_argument("extractor_name", choices=available_extractors())
        parser.add_argument(
            "--output-path",
            dest="output_path",
            default=None,
            help="Directory or .json file path where snapshot should be written. Defaults to incoming dir.",
        )
        parser.add_argument(
            "--snapshot-date",
            dest="snapshot_date",
            default=None,
            help="Override snapshot date in YYYY-MM-DD format.",
        )
        parser.add_argument(
            "--dry-run",
            action="store_true",
            help="Validate extraction/normalization without writing snapshot file.",
        )
        parser.add_argument(
            "--indent",
            type=int,
            default=2,
            help="JSON indent level for emitted snapshots.",
        )

    def handle(self, *args, **options):
        snapshot_date = None
        if options["snapshot_date"]:
            # parse_date returns None for malformed input but RAISES
            # ValueError for well-formatted yet impossible dates (e.g.
            # 2026-02-30); surface both as a friendly CommandError instead
            # of an unhandled traceback.
            try:
                snapshot_date = parse_date(options["snapshot_date"])
            except ValueError as exc:
                raise CommandError("--snapshot-date must be YYYY-MM-DD.") from exc
            if snapshot_date is None:
                raise CommandError("--snapshot-date must be YYYY-MM-DD.")

        try:
            extractor = create_extractor(options["extractor_name"])
            result = extractor.run(
                output_path=options["output_path"],
                snapshot_date=snapshot_date,
                write_output=not options["dry_run"],
                indent=options["indent"],
            )
        except ExtractorError as exc:
            # All framework failures (config/fetch/parse/normalize) surface
            # as a clean command error.
            raise CommandError(str(exc)) from exc

        output_path = str(result.output_path) if result.output_path else ""
        self.stdout.write(
            self.style.SUCCESS(
                f"Extractor {result.extractor_name} completed: "
                f"source={result.source_name} date={result.snapshot_date} "
                f"records={result.records_count} output={output_path}"
            )
        )
# Identifies the scraper politely to public endpoints.
EXTRACTOR_USER_AGENT = os.getenv("EXTRACTOR_USER_AGENT", "HoopScoutBot/2.0 (+https://younerd.org)")
# Per-request timeout handed to the HTTP client (seconds).
EXTRACTOR_HTTP_TIMEOUT_SECONDS = float(os.getenv("EXTRACTOR_HTTP_TIMEOUT_SECONDS", "15"))
# Extra attempts after the first request (0 disables retries).
EXTRACTOR_HTTP_RETRIES = int(os.getenv("EXTRACTOR_HTTP_RETRIES", "2"))
# Sleep between retry attempts (seconds).
EXTRACTOR_RETRY_SLEEP_SECONDS = float(os.getenv("EXTRACTOR_RETRY_SLEEP_SECONDS", "1.0"))
# Minimum spacing enforced between consecutive requests (seconds).
EXTRACTOR_REQUEST_DELAY_SECONDS = float(os.getenv("EXTRACTOR_REQUEST_DELAY_SECONDS", "0.5"))
# Public JSON feed consumed by the public_json_snapshot extractor
# (empty string = unconfigured; the extractor raises at construction time).
EXTRACTOR_PUBLIC_JSON_URL = os.getenv("EXTRACTOR_PUBLIC_JSON_URL", "").strip()
# source_name stamped into snapshots emitted by the public JSON extractor.
EXTRACTOR_PUBLIC_SOURCE_NAME = os.getenv("EXTRACTOR_PUBLIC_SOURCE_NAME", "public_json_source").strip()
# When true, each normalized record keeps the original source row under "raw_payload".
EXTRACTOR_INCLUDE_RAW_PAYLOAD = env_bool("EXTRACTOR_INCLUDE_RAW_PAYLOAD", False)

# Fail fast at startup on nonsensical numeric settings rather than at
# extraction time.
if EXTRACTOR_HTTP_TIMEOUT_SECONDS <= 0:
    raise ImproperlyConfigured("EXTRACTOR_HTTP_TIMEOUT_SECONDS must be > 0.")
if EXTRACTOR_HTTP_RETRIES < 0:
    raise ImproperlyConfigured("EXTRACTOR_HTTP_RETRIES must be >= 0.")
if EXTRACTOR_RETRY_SLEEP_SECONDS < 0:
    raise ImproperlyConfigured("EXTRACTOR_RETRY_SLEEP_SECONDS must be >= 0.")
if EXTRACTOR_REQUEST_DELAY_SECONDS < 0:
    raise ImproperlyConfigured("EXTRACTOR_REQUEST_DELAY_SECONDS must be >= 0.")
SCHEDULER_ENABLED = env_bool("SCHEDULER_ENABLED", False) SCHEDULER_INTERVAL_SECONDS = int(os.getenv("SCHEDULER_INTERVAL_SECONDS", "900")) diff --git a/tests/test_extractors_framework.py b/tests/test_extractors_framework.py new file mode 100644 index 0000000..cb0dd74 --- /dev/null +++ b/tests/test_extractors_framework.py @@ -0,0 +1,222 @@ +from __future__ import annotations + +import json +from datetime import date + +import pytest +from django.core.management import call_command + +from apps.ingestion.extractors.base import BaseSnapshotExtractor +from apps.ingestion.extractors.http import ResponsibleHttpClient +from apps.ingestion.extractors.public_json import PublicJsonSnapshotExtractor + + +class DummyExtractor(BaseSnapshotExtractor): + extractor_name = "dummy" + source_name = "dummy_source" + + def fetch(self): + return {"rows": [{"name": "Jane Doe"}]} + + def parse(self, payload): + return payload["rows"] + + def normalize_record(self, source_record): + return { + "competition_external_id": "comp-1", + "competition_name": "League One", + "season": "2025-2026", + "team_external_id": "team-1", + "team_name": "Team One", + "player_external_id": "player-1", + "full_name": source_record["name"], + "first_name": "Jane", + "last_name": "Doe", + "birth_date": "2000-01-01", + "nationality": "US", + "height_cm": 180, + "weight_kg": 75, + "position": "SG", + "games_played": 10, + "minutes_per_game": 30.0, + "points_per_game": 15.0, + "rebounds_per_game": 4.0, + "assists_per_game": 3.0, + "steals_per_game": 1.2, + "blocks_per_game": 0.4, + "turnovers_per_game": 2.0, + "fg_pct": 45.0, + "three_pt_pct": 35.0, + "ft_pct": 82.0, + } + + +class _FakeResponse: + def __init__(self, payload, status_code=200): + self._payload = payload + self.status_code = status_code + + def raise_for_status(self): + if self.status_code >= 400: + raise RuntimeError(f"status={self.status_code}") + + def json(self): + return self._payload + + +@pytest.mark.django_db +def 
test_base_extractor_run_writes_snapshot_file(tmp_path, settings): + settings.STATIC_DATASET_INCOMING_DIR = str(tmp_path / "incoming") + extractor = DummyExtractor() + result = extractor.run(snapshot_date=date(2026, 3, 13)) + + assert result.records_count == 1 + assert result.source_name == "dummy_source" + assert result.output_path is not None + assert result.output_path.exists() + + payload = json.loads(result.output_path.read_text(encoding="utf-8")) + assert payload["source_name"] == "dummy_source" + assert payload["snapshot_date"] == "2026-03-13" + assert payload["records"][0]["full_name"] == "Jane Doe" + + +@pytest.mark.django_db +def test_public_json_extractor_normalizes_common_field_aliases(tmp_path): + class FakeClient: + def get_json(self, *_args, **_kwargs): + return { + "records": [ + { + "competition_id": 99, + "competition_name": "National League", + "season": 2025, + "team_id": 10, + "team_name": "Blue Team", + "player_id": 123, + "player_name": "John Smith", + "first_name": "John", + "last_name": "Smith", + "birth_date": "2001-05-12", + "nationality": "US", + "height_cm": 198, + "weight_kg": 96, + "position": "SF", + "gp": 20, + "mpg": 28.5, + "ppg": 14.2, + "rpg": 5.1, + "apg": 3.2, + "spg": 1.1, + "bpg": 0.5, + "tov": 1.9, + "fg_pct": 47.3, + "three_pct": 36.1, + "ft_pct": 80.0, + } + ] + } + + extractor = PublicJsonSnapshotExtractor( + url="https://example.com/public-feed.json", + source_name="test_public_feed", + http_client=FakeClient(), + ) + output_file = tmp_path / "public.json" + result = extractor.run(output_path=output_file, snapshot_date=date(2026, 3, 13)) + + assert result.records_count == 1 + payload = json.loads(output_file.read_text(encoding="utf-8")) + row = payload["records"][0] + assert row["competition_external_id"] == "99" + assert row["team_external_id"] == "10" + assert row["player_external_id"] == "123" + assert row["full_name"] == "John Smith" + assert row["three_pt_pct"] == 36.1 + + +@pytest.mark.django_db +def 
test_run_extractor_management_command_writes_snapshot(tmp_path, settings): + settings.EXTRACTOR_PUBLIC_JSON_URL = "https://example.com/feed.json" + settings.EXTRACTOR_PUBLIC_SOURCE_NAME = "cmd_test_source" + output_dir = tmp_path / "snapshots" + + class FakeClient: + def get_json(self, *_args, **_kwargs): + return { + "records": [ + { + "competition_external_id": "comp-a", + "competition_name": "Alpha League", + "season": "2025-2026", + "team_external_id": "team-a", + "team_name": "Alpha Team", + "player_external_id": "player-a", + "full_name": "Alpha Player", + "first_name": "Alpha", + "last_name": "Player", + "birth_date": "2000-04-01", + "nationality": "US", + "height_cm": 190, + "weight_kg": 88, + "position": "PG", + "games_played": 12, + "minutes_per_game": 31.0, + "points_per_game": 17.0, + "rebounds_per_game": 4.0, + "assists_per_game": 6.0, + "steals_per_game": 1.3, + "blocks_per_game": 0.1, + "turnovers_per_game": 2.4, + "fg_pct": 44.0, + "three_pt_pct": 37.0, + "ft_pct": 79.0, + } + ] + } + + monkeypatch = pytest.MonkeyPatch() + monkeypatch.setattr( + "apps.ingestion.extractors.public_json.ResponsibleHttpClient", + lambda **_kwargs: FakeClient(), + ) + try: + call_command( + "run_extractor", + "public_json_snapshot", + "--output-path", + str(output_dir), + "--snapshot-date", + "2026-03-13", + ) + finally: + monkeypatch.undo() + + files = list(output_dir.glob("public_json_snapshot-2026-03-13.json")) + assert len(files) == 1 + payload = json.loads(files[0].read_text(encoding="utf-8")) + assert payload["source_name"] == "cmd_test_source" + assert payload["records"][0]["full_name"] == "Alpha Player" + + +def test_http_client_retries_on_retryable_status(monkeypatch): + class FakeSession: + def __init__(self): + self.calls = 0 + + def get(self, *_args, **_kwargs): + self.calls += 1 + if self.calls == 1: + return _FakeResponse({"error": "busy"}, status_code=429) + return _FakeResponse({"records": []}, status_code=200) + + client = ResponsibleHttpClient( + 
user_agent="test-agent", + timeout_seconds=5, + retries=1, + retry_sleep_seconds=0, + request_delay_seconds=0, + session=FakeSession(), + ) + payload = client.get_json("https://example.com/feed.json") + assert payload == {"records": []}