feat(v2): add snapshot extractor framework and run command
This commit is contained in:
109
apps/ingestion/extractors/http.py
Normal file
109
apps/ingestion/extractors/http.py
Normal file
@ -0,0 +1,109 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import time
|
||||
from typing import Any
|
||||
|
||||
import requests
|
||||
|
||||
from .base import ExtractorFetchError
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ResponsibleHttpClient:
    """
    Small HTTP helper for public-source extraction:
    - explicit User-Agent
    - request timeout
    - conservative retries
    - low-frequency pacing (fixed delay between requests)
    """

    # Statuses worth retrying: rate limiting (429) and transient server
    # errors. Deterministic 4xx errors are deliberately excluded.
    RETRYABLE_STATUS_CODES = {429, 500, 502, 503, 504}

    def __init__(
        self,
        *,
        user_agent: str,
        timeout_seconds: float,
        retries: int,
        retry_sleep_seconds: float,
        request_delay_seconds: float,
        session: requests.Session | None = None,
    ):
        """
        Args:
            user_agent: value sent as the ``User-Agent`` header on every request.
            timeout_seconds: per-request timeout passed to ``requests``.
            retries: number of retries after the first attempt (total attempts
                are ``retries + 1``).
            retry_sleep_seconds: fixed sleep between retry attempts.
            request_delay_seconds: minimum spacing between consecutive
                requests; ``<= 0`` disables pacing.
            session: optional pre-configured session (e.g. for connection
                pooling or tests); a fresh ``requests.Session`` is created
                when omitted.
        """
        self.user_agent = user_agent
        self.timeout_seconds = timeout_seconds
        self.retries = retries
        self.retry_sleep_seconds = retry_sleep_seconds
        self.request_delay_seconds = request_delay_seconds
        self.session = session or requests.Session()
        # Monotonic timestamp of the most recent paced request; None until
        # the first request has been issued.
        self._last_request_at: float | None = None

    def _pace(self) -> None:
        """Sleep as needed so consecutive requests are at least
        ``request_delay_seconds`` apart. The first request is never delayed;
        it only records the starting timestamp."""
        if self.request_delay_seconds <= 0:
            return
        now = time.monotonic()
        if self._last_request_at is None:
            self._last_request_at = now
            return
        elapsed = now - self._last_request_at
        remaining = self.request_delay_seconds - elapsed
        if remaining > 0:
            time.sleep(remaining)
        self._last_request_at = time.monotonic()

    def get_json(
        self,
        url: str,
        *,
        params: dict[str, Any] | None = None,
        headers: dict[str, str] | None = None,
    ) -> Any:
        """
        GET ``url`` and return the decoded JSON body.

        Caller-supplied ``headers`` are merged on top of the client's
        ``User-Agent``. Retryable statuses and transport errors are retried
        up to ``self.retries`` times with a fixed sleep in between; a
        malformed JSON body fails immediately (it will not improve on retry).

        Raises:
            ExtractorFetchError: when retries are exhausted, a retryable
                status persists, or the response body is not valid JSON.
        """
        merged_headers = {"User-Agent": self.user_agent}
        if headers:
            merged_headers.update(headers)

        attempts = self.retries + 1
        for attempt in range(1, attempts + 1):
            try:
                self._pace()
                response = self.session.get(
                    url,
                    params=params,
                    headers=merged_headers,
                    timeout=self.timeout_seconds,
                )
                if response.status_code in self.RETRYABLE_STATUS_CODES:
                    if attempt < attempts:
                        logger.warning(
                            "extractor_http_retryable_status status=%s url=%s attempt=%s/%s",
                            response.status_code,
                            url,
                            attempt,
                            attempts,
                        )
                        time.sleep(self.retry_sleep_seconds)
                        continue
                    raise ExtractorFetchError(
                        f"Retryable status exhausted: status={response.status_code} url={url}"
                    )

                response.raise_for_status()
                return response.json()
            # BUGFIX: this clause must come BEFORE `requests.RequestException`.
            # Since requests 2.27, `response.json()` raises
            # `requests.exceptions.JSONDecodeError`, which subclasses both
            # RequestException and ValueError. With RequestException checked
            # first, a malformed JSON body was retried as if it were a
            # transient transport failure and this branch was unreachable.
            except ValueError as exc:
                raise ExtractorFetchError(f"Invalid JSON response from {url}: {exc}") from exc
            except requests.RequestException as exc:
                if attempt < attempts:
                    logger.warning(
                        "extractor_http_request_retry error=%s url=%s attempt=%s/%s",
                        exc,
                        url,
                        attempt,
                        attempts,
                    )
                    time.sleep(self.retry_sleep_seconds)
                    continue
                raise ExtractorFetchError(f"Request failed after retries: {exc}") from exc

        # Defensive: every loop path either returns or raises, so this is
        # unreachable unless the loop logic above is changed incorrectly.
        raise ExtractorFetchError(f"Unexpected retry loop exit for {url}")
|
||||
Reference in New Issue
Block a user