"""Spider that logs in to quotes.toscrape.com and scrapes quotes behind authentication."""

import scrapy
from scrapy.exceptions import CloseSpider
from scrapy.http import FormRequest


class PrivateQuotesSpider(scrapy.Spider):
    """Log in through the site's login form, then crawl quote pages.

    Usage:
        scrapy crawl private_quotes -a username=... -a password=... [-a max_pages=N]

    Crawls at most ``max_pages`` pages (default 2) and yields one dict per
    quote. Every fetched page is checked for the "Logout" link so a dropped
    session stops the crawl instead of silently yielding anonymous pages.
    """

    name = "private_quotes"
    allowed_domains = ["quotes.toscrape.com"]
    login_url = "https://quotes.toscrape.com/login"
    start_url = "https://quotes.toscrape.com/"

    def __init__(self, username=None, password=None, max_pages="2", *args, **kwargs):
        """Validate credentials passed via -a and normalize max_pages.

        Raises:
            ValueError: if username or password is missing.
        """
        super().__init__(*args, **kwargs)
        # FIX: the original raised CloseSpider here, but per Scrapy's docs
        # CloseSpider only triggers a graceful shutdown when raised from a
        # spider *callback*; inside __init__ it is just an unhandled
        # exception. ValueError is the correct argument-validation error.
        if not username or not password:
            raise ValueError("Pass -a username=... -a password=...")
        self.username = username
        self.password = password
        # Spider arguments arrive as strings from the -a flag; convert once.
        self.max_pages = int(max_pages)

    async def start(self):
        """Entry point (Scrapy 2.13+): fetch the login page first."""
        # dont_filter: login/redirect URLs may be revisited; skip dupe filter.
        yield scrapy.Request(self.login_url, callback=self.parse_login, dont_filter=True)

    def parse_login(self, response):
        """Submit the login form found on the login page with our credentials."""
        yield FormRequest.from_response(
            response,
            formcss="form",  # select the form by CSS rather than by index
            formdata={
                "username": self.username,
                "password": self.password,
            },
            callback=self.after_login,
            dont_filter=True,
        )

    def after_login(self, response):
        """Verify the login succeeded, then kick off the quote crawl.

        Raises:
            CloseSpider: if the post-login page shows no "Logout" link,
                meaning authentication failed.
        """
        # The site renders a Logout link only for authenticated sessions.
        if "Logout" not in response.text:
            raise CloseSpider("Login failed; logout link not found.")
        yield response.follow(
            self.start_url,
            callback=self.parse_quotes,
            cb_kwargs={"page_number": 1},
            dont_filter=True,
        )

    def parse_quotes(self, response, page_number):
        """Yield one item per quote on the page; follow pagination up to max_pages.

        Raises:
            CloseSpider: if the page no longer shows the "Logout" link,
                meaning the authenticated session was lost mid-crawl.
        """
        if "Logout" not in response.text:
            raise CloseSpider("Session lost; page no longer shows Logout.")
        for quote in response.css("div.quote"):
            yield {
                "page": page_number,
                "author": quote.css("small.author::text").get(default="").strip(),
                "text": quote.css("span.text::text").get(default="").strip(),
                "authenticated": True,
                "url": response.url,
            }
        # Stop paginating once max_pages is reached, even if a next link exists.
        next_href = response.css("li.next a::attr(href)").get()
        if next_href and page_number < self.max_pages:
            yield response.follow(
                next_href,
                callback=self.parse_quotes,
                cb_kwargs={"page_number": page_number + 1},
            )