import logging

from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

logger = logging.getLogger(__name__)


class SeleniumDownloaderMiddleware:
    """Scrapy downloader middleware that renders selected requests with Selenium.

    Requests opting in via ``request.meta["selenium"] = True`` are fetched
    with a Chrome WebDriver instead of Scrapy's default downloader, so
    JavaScript-rendered content is available to spiders. An optional
    ``request.meta["selenium_wait_css"]`` CSS selector is waited on before
    the page source is captured.
    """

    def __init__(self, wait_seconds, driver_arguments):
        # Maximum seconds WebDriverWait blocks for the optional CSS selector.
        self.wait_seconds = wait_seconds
        # Extra Chrome CLI arguments (e.g. "--headless=new") from settings.
        self.driver_arguments = driver_arguments
        # Created lazily in spider_opened; None means no live driver.
        self.driver = None

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from crawler settings and wire lifecycle signals.

        Reads ``SELENIUM_WAIT_SECONDS`` (default 10) and
        ``SELENIUM_DRIVER_ARGUMENTS`` from the crawler settings.
        """
        middleware = cls(
            wait_seconds=crawler.settings.getint("SELENIUM_WAIT_SECONDS", 10),
            driver_arguments=crawler.settings.getlist("SELENIUM_DRIVER_ARGUMENTS"),
        )
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def spider_opened(self, spider):
        """Start the Chrome WebDriver when the spider opens."""
        options = Options()
        for argument in self.driver_arguments:
            options.add_argument(argument)
        self.driver = webdriver.Chrome(options=options)

    def spider_closed(self, spider, reason):
        """Quit the WebDriver cleanly when the spider closes."""
        if self.driver is not None:
            self.driver.quit()
            self.driver = None

    def process_request(self, request, spider):
        """Render *request* with Selenium when it opts in via ``meta["selenium"]``.

        Returns ``None`` for requests that did not opt in, letting Scrapy's
        regular downloader handle them; otherwise returns an
        :class:`HtmlResponse` built from the rendered page.

        Raises:
            RuntimeError: if called before the driver was initialized
                (i.e. before the ``spider_opened`` signal fired).

        NOTE(fix): Scrapy invokes downloader middleware as
        ``process_request(request, spider)`` — the original signature was
        missing ``spider`` and would raise ``TypeError`` on every request.
        """
        if not request.meta.get("selenium"):
            return None
        if self.driver is None:
            raise RuntimeError("Selenium WebDriver is not initialized.")
        self.driver.get(request.url)
        wait_css = request.meta.get("selenium_wait_css")
        if wait_css:
            try:
                WebDriverWait(self.driver, self.wait_seconds).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, wait_css))
                )
            except TimeoutException:
                # Best effort: log and return whatever has rendered so far.
                logger.warning("Timed out waiting for selector: %s", wait_css)
        return HtmlResponse(
            url=self.driver.current_url,
            body=self.driver.page_source,
            encoding="utf-8",
            request=request,
        )