"""Spider that uses an ItemLoader to extract labelled image links."""

import scrapy
from itemloaders.processors import MapCompose, TakeFirst
from scrapy.loader import ItemLoader


def normalize_label(value: str) -> str:
    """Return *value* without a leading ``"Name:"`` and outer whitespace."""
    without_prefix = value.removeprefix("Name:")
    return without_prefix.strip()


class ImageLinkItem(scrapy.Item):
    """Item holding the label and target URL of one link."""

    # Text of the link, filled from the ``label`` loader field.
    label = scrapy.Field()
    # Link target, filled from the ``href`` loader field.
    href = scrapy.Field()


class ImageLinkLoader(ItemLoader):
    """Item loader that strips inputs and outputs a single value per field."""

    # Applied to every field that has no dedicated ``<field>_in`` processor.
    default_input_processor = MapCompose(str.strip)
    # Collapses the collected values of each field into one value.
    default_output_processor = TakeFirst()

    # ``label`` values are stripped first so that ``normalize_label`` sees
    # the "Name:" prefix at the very start of the string.
    label_in = MapCompose(str.strip, normalize_label)


class LoaderSpider(scrapy.Spider):
    """Crawl the start URL and emit one item per ``#images a`` element."""

    name = "loader"
    custom_settings = {
        "ROBOTSTXT_OBEY": False,
    }
    start_urls = [
        (
            "https://docs.scrapy.org/en/latest/_static/"
            "selectors-sample1.html"
        ),
    ]

    def parse(self, response):
        """Yield a loaded ``ImageLinkItem`` for each link under ``#images``.

        The ``href`` values are passed through ``response.urljoin`` before
        the loader's own input processing runs.
        """
        # The URL-joining processor only depends on the response, so a
        # single instance serves every link.
        to_absolute = MapCompose(response.urljoin)

        for anchor in response.css("#images a"):
            item_loader = ImageLinkLoader(
                item=ImageLinkItem(),
                selector=anchor,
            )
            item_loader.add_css("label", "::text")
            item_loader.add_css("href", "::attr(href)", to_absolute)
            yield item_loader.load_item()