from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class CatalogCrawlSpider(CrawlSpider):
    """Crawl the internal product catalog and scrape each product page.

    Starting from the catalog index, the spider follows pagination links
    and hands every product-detail page to :meth:`parse_item`.
    """

    name = "catalog_crawl"
    allowed_domains = ["app.internal.example"]
    start_urls = ["https://app.internal.example/products/"]

    rules = (
        # Pagination: keep following numbered listing pages, but only via
        # links found inside the main catalog region of the page.
        Rule(
            LinkExtractor(
                allow=(r"/products/page/\d+/$",),
                restrict_css=("main.catalog",),
            ),
            follow=True,
        ),
        # Product detail pages: any single path segment under /products/
        # (again restricted to the catalog region) is scraped via parse_item.
        Rule(
            LinkExtractor(
                allow=(r"/products/[^/]+/$",),
                restrict_css=("main.catalog",),
            ),
            callback="parse_item",
        ),
    )

    def parse_item(self, response):
        """Yield one record per product page: name, price, and source URL.

        The selectors return ``None`` when an element is absent, so a
        malformed page still produces a (partially empty) record rather
        than raising.
        """
        yield {
            "name": response.css("h1::text").get(),
            "price": response.css("p.price::text").get(),
            "url": response.url,
        }