# file metadata (extraction artifact): 110 lines, 3.6 KiB, Python
from __future__ import annotations
|
|
|
|
import logging
|
|
import time
|
|
from typing import Any
|
|
|
|
import requests
|
|
|
|
from .base import ExtractorFetchError
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class ResponsibleHttpClient:
    """
    Small HTTP helper for public-source extraction:
    - explicit User-Agent
    - request timeout
    - conservative retries (transient failures only)
    - low-frequency pacing (fixed delay between requests)
    """

    # Only rate limiting and transient server errors are worth retrying;
    # everything else (404, 403, ...) is deterministic and fails immediately.
    RETRYABLE_STATUS_CODES = {429, 500, 502, 503, 504}

    def __init__(
        self,
        *,
        user_agent: str,
        timeout_seconds: float,
        retries: int,
        retry_sleep_seconds: float,
        request_delay_seconds: float,
        session: requests.Session | None = None,
    ):
        self.user_agent = user_agent
        self.timeout_seconds = timeout_seconds
        self.retries = retries
        self.retry_sleep_seconds = retry_sleep_seconds
        self.request_delay_seconds = request_delay_seconds
        # A caller-supplied session enables connection pooling and test doubles.
        self.session = session or requests.Session()
        # Monotonic timestamp of the previous request; None until the first call.
        self._last_request_at: float | None = None

    def _pace(self) -> None:
        """Sleep just long enough to keep ``request_delay_seconds`` between requests."""
        if self.request_delay_seconds <= 0:
            return
        now = time.monotonic()
        if self._last_request_at is None:
            # First request ever: record the time, no delay required.
            self._last_request_at = now
            return
        remaining = self.request_delay_seconds - (now - self._last_request_at)
        if remaining > 0:
            time.sleep(remaining)
        self._last_request_at = time.monotonic()

    def _retry_delay(self, response: requests.Response) -> float:
        """Return the delay before retrying *response*.

        Honors a numeric ``Retry-After`` header (seconds form) when present,
        but never sleeps less than the configured fixed delay. The HTTP-date
        form of ``Retry-After`` is ignored and falls back to the fixed delay.
        """
        retry_after = response.headers.get("Retry-After")
        if retry_after is not None:
            try:
                return max(self.retry_sleep_seconds, float(retry_after))
            except ValueError:
                # HTTP-date form or garbage: use the configured delay.
                pass
        return self.retry_sleep_seconds

    def get_json(
        self,
        url: str,
        *,
        params: dict[str, Any] | None = None,
        headers: dict[str, str] | None = None,
    ) -> Any:
        """GET *url* and return the decoded JSON body.

        Transport errors (DNS, connect, timeout) and statuses in
        ``RETRYABLE_STATUS_CODES`` are retried up to ``self.retries`` times.
        Non-retryable HTTP errors and unparsable JSON fail immediately.

        Raises:
            ExtractorFetchError: when retries are exhausted, on a
                non-retryable HTTP status, or on an invalid JSON body.
        """
        merged_headers = {"User-Agent": self.user_agent}
        if headers:
            merged_headers.update(headers)

        attempts = self.retries + 1
        for attempt in range(1, attempts + 1):
            self._pace()
            try:
                response = self.session.get(
                    url,
                    params=params,
                    headers=merged_headers,
                    timeout=self.timeout_seconds,
                )
            except requests.RequestException as exc:
                # Transport-level failure: transient by nature, so retry.
                if attempt < attempts:
                    logger.warning(
                        "extractor_http_request_retry error=%s url=%s attempt=%s/%s",
                        exc,
                        url,
                        attempt,
                        attempts,
                    )
                    time.sleep(self.retry_sleep_seconds)
                    continue
                raise ExtractorFetchError(f"Request failed after retries: {exc}") from exc

            if response.status_code in self.RETRYABLE_STATUS_CODES:
                if attempt < attempts:
                    logger.warning(
                        "extractor_http_retryable_status status=%s url=%s attempt=%s/%s",
                        response.status_code,
                        url,
                        attempt,
                        attempts,
                    )
                    time.sleep(self._retry_delay(response))
                    continue
                raise ExtractorFetchError(
                    f"Retryable status exhausted: status={response.status_code} url={url}"
                )

            try:
                response.raise_for_status()
            except requests.HTTPError as exc:
                # FIX: deterministic errors outside RETRYABLE_STATUS_CODES
                # (e.g. 404) were previously swept into the broad
                # RequestException retry path and pointlessly retried.
                raise ExtractorFetchError(
                    f"Non-retryable status: status={response.status_code} url={url}"
                ) from exc

            try:
                return response.json()
            except ValueError as exc:
                # FIX: isolated from the request try-block because requests'
                # JSONDecodeError subclasses RequestException (requests>=2.27)
                # and was being mis-routed into the retry path instead of
                # raising the invalid-JSON error.
                raise ExtractorFetchError(f"Invalid JSON response from {url}: {exc}") from exc

        # Defensive: every loop iteration returns, continues, or raises.
        raise ExtractorFetchError(f"Unexpected retry loop exit for {url}")
|