import scrapy


def join_clean_text(texts):
    """Join text fragments into a single space-separated string.

    Each fragment is stripped of surrounding whitespace; fragments that are
    empty after stripping are dropped.

    Args:
        texts: Iterable of strings.

    Returns:
        The remaining fragments joined by a single space ("" if none remain).
    """
    stripped = (text.strip() for text in texts)
    return " ".join(part for part in stripped if part)


def _first_match(response, *queries):
    """Return the first non-empty result of trying CSS queries in order.

    A comma-separated selector group yields matches in document order, not
    in the order the alternatives are written, so preference between
    alternatives has to be expressed by querying them one at a time.

    Args:
        response: Object exposing ``css(query).get()``.
        *queries: CSS queries, most preferred first.

    Returns:
        The first truthy extracted value, or None if no query produced one.
    """
    for query in queries:
        value = response.css(query).get()
        if value:
            return value
    return None


class EventsSpider(scrapy.Spider):
    """Crawl the events calendar listing and yield one item per event page."""

    name = "events"
    allowed_domains = ["events.example.net"]
    start_urls = ["https://events.example.net/calendar/"]

    def parse(self, response):
        """Follow every event link on a listing page, then the next page.

        Yields:
            Requests for each event card link (handled by ``parse_event``)
            and, when a next-page link is found, a request for it handled by
            this method again.
        """
        event_hrefs = response.css(
            "article.event-card a.event-link::attr(href)"
        ).getall()
        for href in event_hrefs:
            yield response.follow(href, callback=self.parse_event)

        # Prefer the rel="next" link; fall back to the class-based one.
        next_href = _first_match(
            response,
            "a[rel='next']::attr(href)",
            "a.next::attr(href)",
        )
        if next_href:
            yield response.follow(next_href, callback=self.parse)

    def parse_event(self, response):
        """Extract a single event item from an event detail page.

        Yields:
            A dict with the keys ``title``, ``start_time``, ``location``,
            ``summary``, ``registration_url`` and ``url``. Missing text
            fields are emitted as empty strings.
        """
        # Prefer the dedicated register link over any link whose href merely
        # contains "register", regardless of where each sits in the page.
        registration_href = _first_match(
            response,
            "a.register::attr(href)",
            "a[href*='register']::attr(href)",
        )
        if registration_href:
            registration_url = response.urljoin(registration_href)
        else:
            registration_url = ""

        yield {
            "title": response.css("h1::text").get(default="").strip(),
            "start_time": (
                response.css("time::attr(datetime)").get(default="").strip()
            ),
            "location": (
                response.css(".event-location::text").get(default="").strip()
            ),
            "summary": join_clean_text(
                response.css(".event-summary *::text").getall()
            ),
            "registration_url": registration_url,
            "url": response.url,
        }