I am trying to figure out whether my Scrapy spider is correctly hitting the product_link in the request callback – 'yield scrapy.Request(product_link, callback=self.parse_new_item)'. product_link should be 'https://www.antaira.com/products/10-100Mbps/LNX-500A', but I have not been able to confirm that my program actually jumps into the parse_new_item step so that I can retrieve the correct yielded item. Thank you!
# Import the required libraries
import scrapy

# Import the Item class with fields
# mentioned in the items.py file
from ..items import AntairaItem


# Spider class name
class productJumper(scrapy.Spider):

    # Name of the spider
    name = 'productJumper'

    # The domain to be scraped
    allowed_domains = ['antaira.com']

    # The URLs to be scraped from the domain
    start_urls = ['https://www.antaira.com/products/10-100Mbps']
    #target_url = ['https://www.antaira.com/products/10-100Mbps/LNX-500A']

    # First Step: Find every div with the class 'product-container' and step into the links
    def parse(self, response):
        #product_link = response.urljoin(rel_product_link)
        # creating items dictionary
        items = AntairaItem()
        rel_product_link = response.css('div.center767')
        for url in rel_product_link:
            rel_product_link = response.xpath('//div[@class="product-container"]//a/@href').get(),
            product_link = response.urljoin('rel_product_link'),
            items['rel_product_link'] = rel_product_link,
            items['product_link'] = product_link
            #yield items

            # 2nd Step: Return a list of all the product links that will be scraped
            #yield {
            #    # take the first relative product link
            #    'rel_product_link' : rel_product_link,
            #    'product_link' : product_link,
            #}
            yield scrapy.Request(product_link, callback=self.parse_new_item)

    # Final Step: Run through each product and yield the results
    def parse_new_item(self, response):
        for product in response.css('main.products'):
            name = product.css(('h1.product-name::text').strip(' tnr')).get()
            features = product.css('section.features h3 + ul').getall()
            overview = product.css('.products .product-overview::text').getall()
            main_image = product.css('div.selectors img::attr(src)').get()
            rel_links = product.xpath("//script/@src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()

            items['name'] = name,
            items['features'] = features,
            items['overview'] = overview,
            items['main_image'] = main_image,
            items['rel_links'] = rel_links,
            yield items
Answer
You have a couple of issues:
Scrapy items are essentially dictionaries and are therefore mutable. You need to create a unique item for each and every yield statement.
Your second parse callback references a variable, items, that it does not have access to, because it was defined in your first parse callback (one way around this is to pass the item between callbacks with cb_kwargs, sketched below).
In your urljoin call you are using the string literal 'rel_product_link' instead of the variable rel_product_link.
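To illustrate the first two points, here is a minimal sketch (assuming the AntairaItem fields from your items.py and Scrapy 1.7+ for cb_kwargs) of creating a fresh item for every link and handing it to the next callback with cb_kwargs instead of relying on a shared items variable. It is an alternative to simply recreating the item in the second callback, which is what the full example further below does:

import scrapy
from ..items import AntairaItem


class ProductJumper(scrapy.Spider):
    name = 'productJumper'
    allowed_domains = ['antaira.com']
    start_urls = ['https://www.antaira.com/products/10-100Mbps']

    def parse(self, response):
        for url in response.xpath('//div[@class="product-container"]//a/@href').getall():
            item = AntairaItem()                      # fresh item for every link
            item['rel_product_link'] = url
            item['product_link'] = response.urljoin(url)
            # pass the item to the next callback instead of sharing a variable
            yield scrapy.Request(item['product_link'],
                                 callback=self.parse_new_item,
                                 cb_kwargs={'item': item})

    def parse_new_item(self, response, item):
        # the item started in parse() arrives here as a keyword argument
        item['name'] = response.css('h1.product-name::text').get(default='').strip()
        yield item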
In the example below I fixed those issues and made some additional notes:
import scrapy
from ..items import AntairaItem


class ProductJumper(scrapy.Spider):  # classes should be TitleCase

    name = 'productJumper'
    allowed_domains = ['antaira.com']
    start_urls = ['https://www.antaira.com/products/10-100Mbps']

    def parse(self, response):
        # iterate through each of the relative urls
        for url in response.xpath('//div[@class="product-container"]//a/@href').getall():
            product_link = response.urljoin(url)  # use variable
            yield scrapy.Request(product_link, callback=self.parse_new_item)

    def parse_new_item(self, response):
        for product in response.css('main.products'):
            items = AntairaItem()  # Unique item for each iteration
            items['product_link'] = response.url  # get the product link from response
            name = product.css('h1.product-name::text').get().strip()
            features = product.css('section.features h3 + ul').getall()
            overview = product.css('.products .product-overview::text').getall()
            main_image = product.css('div.selectors img::attr(src)').get()
            rel_links = product.xpath("//script/@src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
            items['name'] = name
            items['features'] = features
            items['overview'] = overview
            items['main_image'] = main_image
            items['rel_links'] = rel_links
            yield items
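As for confirming that the request actually reaches parse_new_item: a quick check is to log from inside the callback (a sketch; the message text is arbitrary). The line shows up in the crawl log once per product page fetched:

    def parse_new_item(self, response):
        # confirms the callback is being reached, once per fetched product page
        self.logger.info('parse_new_item reached: %s', response.url)
        # ... rest of the callback unchanged ...

You can also run a single product URL straight through the callback from the command line with Scrapy's parse command, which prints whatever the callback yields:

scrapy parse "https://www.antaira.com/products/10-100Mbps/LNX-500A" --spider=productJumper -c parse_new_item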