from __future__ import annotations

import logging
import time
from typing import Any

import requests

from .base import ExtractorFetchError

logger = logging.getLogger(__name__)


class ResponsibleHttpClient:
    """
    Small HTTP helper for public-source extraction:
    - explicit User-Agent
    - request timeout
    - conservative retries
    - low-frequency pacing (fixed delay between requests)
    """

    # Statuses worth retrying: rate limiting (429) and transient server errors.
    RETRYABLE_STATUS_CODES = {429, 500, 502, 503, 504}

    def __init__(
        self,
        *,
        user_agent: str,
        timeout_seconds: float,
        retries: int,
        retry_sleep_seconds: float,
        request_delay_seconds: float,
        session: requests.Session | None = None,
    ):
        """
        Args:
            user_agent: value sent in the ``User-Agent`` header on every request.
            timeout_seconds: per-request timeout passed to ``requests``.
            retries: number of retries *after* the first attempt (total attempts = retries + 1).
            retry_sleep_seconds: fixed sleep between retry attempts.
            request_delay_seconds: minimum spacing between consecutive requests
                (pacing); <= 0 disables pacing.
            session: optional pre-configured session; a fresh one is created if omitted.
        """
        self.user_agent = user_agent
        self.timeout_seconds = timeout_seconds
        self.retries = retries
        self.retry_sleep_seconds = retry_sleep_seconds
        self.request_delay_seconds = request_delay_seconds
        self.session = session or requests.Session()
        # Monotonic timestamp of the last request start; None until the first request.
        self._last_request_at: float | None = None

    def _pace(self) -> None:
        """Sleep just long enough to keep at least ``request_delay_seconds``
        between consecutive requests. The first request is never delayed."""
        if self.request_delay_seconds <= 0:
            return
        now = time.monotonic()
        if self._last_request_at is None:
            self._last_request_at = now
            return
        elapsed = now - self._last_request_at
        remaining = self.request_delay_seconds - elapsed
        if remaining > 0:
            time.sleep(remaining)
        self._last_request_at = time.monotonic()

    def get_json(
        self,
        url: str,
        *,
        params: dict[str, Any] | None = None,
        headers: dict[str, str] | None = None,
    ) -> Any:
        """
        GET ``url`` and return the parsed JSON body.

        Retries (with a fixed sleep) on network errors and on the statuses in
        ``RETRYABLE_STATUS_CODES``; invalid JSON fails immediately without retry.

        Raises:
            ExtractorFetchError: when retries are exhausted, a non-OK status is
                returned, or the response body is not valid JSON.
        """
        merged_headers = {"User-Agent": self.user_agent}
        if headers:
            merged_headers.update(headers)

        attempts = self.retries + 1
        for attempt in range(1, attempts + 1):
            try:
                self._pace()
                response = self.session.get(
                    url,
                    params=params,
                    headers=merged_headers,
                    timeout=self.timeout_seconds,
                )
                if response.status_code in self.RETRYABLE_STATUS_CODES:
                    if attempt < attempts:
                        logger.warning(
                            "extractor_http_retryable_status status=%s url=%s attempt=%s/%s",
                            response.status_code,
                            url,
                            attempt,
                            attempts,
                        )
                        time.sleep(self.retry_sleep_seconds)
                        continue
                    raise ExtractorFetchError(
                        f"Retryable status exhausted: status={response.status_code} url={url}"
                    )
                # NOTE: non-retryable 4xx statuses raised here are still caught by the
                # RequestException handler below and retried — preserved as-is, since
                # callers may rely on the current retry count for all HTTP errors.
                response.raise_for_status()
            except requests.RequestException as exc:
                if attempt < attempts:
                    logger.warning(
                        "extractor_http_request_retry error=%s url=%s attempt=%s/%s",
                        exc,
                        url,
                        attempt,
                        attempts,
                    )
                    time.sleep(self.retry_sleep_seconds)
                    continue
                raise ExtractorFetchError(f"Request failed after retries: {exc}") from exc

            # BUGFIX: parse OUTSIDE the retry try-block. Since requests 2.27,
            # Response.json() raises requests.exceptions.JSONDecodeError, which
            # subclasses RequestException — inside the block above, an invalid
            # JSON body was retried instead of failing fast, and the dedicated
            # ValueError handler was dead code. JSONDecodeError also subclasses
            # ValueError (in every requests/stdlib version), so this catch is
            # correct for both old and new requests.
            try:
                return response.json()
            except ValueError as exc:
                raise ExtractorFetchError(f"Invalid JSON response from {url}: {exc}") from exc

        # Defensive: the loop always returns or raises; this guards against
        # future edits breaking that invariant.
        raise ExtractorFetchError(f"Unexpected retry loop exit for {url}")