import json
from urllib.parse import urlencode

import scrapy


class FeedSpider(scrapy.Spider):
    """Crawl a cursor-paginated JSON "scroll" API and yield one dict per item.

    Follows ``next_cursor`` tokens returned by the endpoint until a page
    comes back without one, yielding ``{"id", "title"}`` records along the way.
    """

    name = "feed"
    allowed_domains = ["api.example.net"]
    api_url = "http://api.example.net:8000/api/scroll"
    # Number of items requested per page via the ``limit`` query parameter.
    page_size = 50

    # Be polite to the API: autothrottle adapts the delay between 0.25s and
    # 10s based on observed latency; DOWNLOAD_DELAY is the floor.
    custom_settings = {
        "AUTOTHROTTLE_ENABLED": True,
        "AUTOTHROTTLE_START_DELAY": 0.25,
        "AUTOTHROTTLE_MAX_DELAY": 10.0,
        "DOWNLOAD_DELAY": 0.25,
        "ROBOTSTXT_OBEY": True,
    }

    def _page_url(self, cursor=None):
        """Build the scroll-endpoint URL for one page.

        ``cursor`` is the opaque ``next_cursor`` token from a previous
        response, or ``None`` for the first page.
        """
        params = {"limit": self.page_size}
        if cursor is not None:
            params["cursor"] = cursor
        return f"{self.api_url}?{urlencode(params)}"

    def start_requests(self):
        """Kick off the crawl with the first (cursor-less) page."""
        yield scrapy.Request(url=self._page_url(), callback=self.parse)

    def parse(self, response):
        """Yield item dicts from one page, then request the next page if any."""
        payload = json.loads(response.text)

        for entry in payload.get("items", []):
            yield {
                "id": entry.get("id"),
                "title": entry.get("title"),
            }

        # An absent/empty ``next_cursor`` means the scroll is exhausted.
        next_cursor = payload.get("next_cursor")
        if not next_cursor:
            return
        yield scrapy.Request(url=self._page_url(next_cursor), callback=self.parse)