from os import environ
from urllib.parse import urlencode

import scrapy


class ProductsSpider(scrapy.Spider):
    """Crawl the paginated product listing of api.example.net.

    Requests are authenticated with a bearer token taken from the
    ``EXAMPLE_API_TOKEN`` environment variable when it is set, and
    pagination follows the ``next`` URL returned in each JSON payload.
    """

    name = "products"
    allowed_domains = ["api.example.net"]
    api_endpoint = "https://api.example.net/v1/products"
    per_page = 100

    # Conservative crawl rate: autothrottle adapts between 1s and 10s,
    # with at most two in-flight requests against the API host.
    custom_settings = {
        "DOWNLOAD_DELAY": 1.0,
        "CONCURRENT_REQUESTS_PER_DOMAIN": 2,
        "AUTOTHROTTLE_ENABLED": True,
        "AUTOTHROTTLE_START_DELAY": 1.0,
        "AUTOTHROTTLE_MAX_DELAY": 10.0,
        "FEED_EXPORT_ENCODING": "utf-8",
    }

    def api_headers(self):
        """Build the request headers, adding auth only when a token exists."""
        token = environ.get("EXAMPLE_API_TOKEN")
        if not token:
            return {"Accept": "application/json"}
        return {
            "Accept": "application/json",
            "Authorization": f"Bearer {token}",
        }

    async def start(self):
        """Issue the initial request for page 1 of the product listing."""
        query = urlencode({"page": 1, "per_page": self.per_page})
        first_page = f"{self.api_endpoint}?{query}"
        yield scrapy.Request(
            url=first_page,
            headers=self.api_headers(),
            callback=self.parse,
        )

    def parse(self, response):
        """Yield one item per product row, then follow the next page if any."""
        payload = response.json()

        for row in payload.get("products", []):
            # Project each row onto the fixed export schema; absent
            # fields come through as None via dict.get.
            yield {
                key: row.get(key)
                for key in ("id", "name", "price", "currency", "url")
            }

        # The API signals further pages via an absolute/relative "next" URL.
        if next_page := payload.get("next"):
            yield response.follow(
                next_page,
                headers=self.api_headers(),
                callback=self.parse,
            )