Skip to content
Advertisement

Trying to add multiple yields into a single json file using Scrapy

I am trying to figure out whether my Scrapy spider is correctly following product_link in the request callback – ‘yield scrapy.Request(product_link, callback=self.parse_new_item)’. product_link should be ‘https://www.antaira.com/products/10-100Mbps/LNX-500A’, but I have not been able to confirm whether my program reaches the next step so that I can retrieve the correct yielded result. Thank you!

# Import the required libraries
import scrapy

# Import the Item class with fields
# mentioned in the items.py file
from ..items import AntairaItem
  
# Spider class name
class productJumper(scrapy.Spider):
    
    # Name of the spider
    name = 'productJumper'

    # The domain to be scraped
    allowed_domains = ['antaira.com']
      
    # The URLs to be scraped from the domain
    start_urls = ['https://www.antaira.com/products/10-100Mbps']
    #target_url = ['https://www.antaira.com/products/10-100Mbps/LNX-500A']
  
    # First Step: Find every div with the class 'product-container' and step into the links
    def parse(self, response):
        #product_link = response.urljoin(rel_product_link)
        
        # creating items dictionary
        # NOTE(review): a single item instance is created here and reused
        # by every iteration of the loop below.
        items = AntairaItem()
        
        # Select the 'div.center767' elements; the loop variable `url` is
        # never used inside the loop body.
        rel_product_link = response.css('div.center767')
        for url in rel_product_link:
            # .get() returns only the first matching href; the trailing
            # comma wraps that value in a 1-tuple.
            rel_product_link = response.xpath('//div[@class="product-container"]//a/@href').get(),
            # NOTE(review): the quoted 'rel_product_link' is a string
            # literal, not the variable assigned above; the trailing comma
            # again makes product_link a 1-tuple.
            product_link = response.urljoin('rel_product_link'),
            # Trailing comma: stores a tuple containing the tuple above.
            items['rel_product_link'] = rel_product_link,
            items['product_link'] = product_link
            
            #yield items

    # 2nd Step: Return a list of the all products-links that will be scrapped
            #yield {
            #       take the first relative product link
            #        'rel_product_link' : rel_product_link,
            #        'product_link'  :   product_link,
            #}
            
            # NOTE(review): product_link is a tuple at this point, not a
            # URL string.
            yield scrapy.Request(product_link, callback=self.parse_new_item)
    
    # Final Step: Run through each product and Yield the results
        # NOTE(review): as indented, this def is nested inside parse(), so
        # it is a local function of parse rather than a method of the
        # class, while the request above refers to self.parse_new_item.
        def parse_new_item(self, response):
            for product in response.css('main.products'):
    
                # NOTE(review): .strip(' tnr') is applied to the selector
                # string itself, before .css() is called, not to the
                # extracted text; presumably whitespace stripping of the
                # result was intended -- confirm.
                name = product.css(('h1.product-name::text').strip(' tnr')).get()
                features = product.css('section.features h3 + ul').getall()
                overview =   product.css('.products .product-overview::text').getall()
                main_image = product.css('div.selectors img::attr(src)').get()
                # The XPath starts with '//', so it searches the whole
                # document rather than only inside `product`.
                rel_links = product.xpath("//script/@src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
            
                # Each trailing comma below stores the value wrapped in a
                # 1-tuple. `items` is not created in this function; it is
                # the name assigned in parse().
                items['name'] = name,
                items['features'] = features,
                items['overview'] = overview,
                items['main_image'] = main_image,
                items['rel_links'] = rel_links,
            
                yield items

Advertisement

Answer

You have a couple of issues:

  1. scrapy items are essentially dictionaries and are therefore mutable. You need to create a unique item for each and every yield statement.

  2. Your second parse callback references a variable, items, that it doesn’t have access to because it was defined in your first parse callback.

  3. In your urljoin call you are passing the string literal 'rel_product_link' instead of the variable rel_product_link.

In the example below I fixed those issues and made some additional notes

import scrapy
from ..items import AntairaItem


class ProductJumper(scrapy.Spider):  # classes should be TitleCase
    """Crawl the 10/100Mbps product listing and scrape every product page."""

    name = 'productJumper'
    allowed_domains = ['antaira.com']
    start_urls = ['https://www.antaira.com/products/10-100Mbps']

    def parse(self, response):
        """Yield one request per product link found on the listing page."""
        # iterate through each of the relative urls
        for url in response.xpath('//div[@class="product-container"]//a/@href').getall():
            product_link = response.urljoin(url)  # use variable
            yield scrapy.Request(product_link, callback=self.parse_new_item)

    def parse_new_item(self, response):
        """Yield one AntairaItem per ``main.products`` element of a product page."""
        for product in response.css('main.products'):
            items = AntairaItem()  # Unique item for each iteration
            items['product_link'] = response.url  # get the product link from response
            # .get() returns None when nothing matches, so fall back to ''
            # before stripping the surrounding whitespace.
            items['name'] = product.css('h1.product-name::text').get(default='').strip()
            # No trailing commas on these assignments: a trailing comma
            # would store each value wrapped in a 1-tuple.
            items['features'] = product.css('section.features h3 + ul').getall()
            items['overview'] = product.css('.products .product-overview::text').getall()
            items['main_image'] = product.css('div.selectors img::attr(src)').get()
            # The leading '//' makes this XPath search the whole document,
            # not only the current product element.
            items['rel_links'] = product.xpath(
                "//script/@src[contains(., '/app/site/hosting/scriptlet.nl')]"
            ).getall()
            yield items
User contributions licensed under: CC BY-SA
4 people found this helpful
Advertisement