Skip to content
Advertisement

During recursive scraping in Scrapy, how do I extract info from multiple nodes of a parent URL together with their associated child URLs?

The parent URL contains multiple nodes (quotes), and each node links to a child URL (author info). I am having trouble linking each quote to its author info, apparently due to the asynchronous nature of Scrapy.

How can I fix this issue? Here's the code so far; I added # <--- comments to the relevant lines to make them easy to spot.

import scrapy 

class AuthorSpider(scrapy.Spider):
    """Buggy spider from the question.

    A single class-level attribute ``var`` is used to hand the quote text
    over to ``parse_author``. Scrapy schedules requests asynchronously, so
    the ``for`` loop in ``parse`` typically finishes (overwriting ``var``
    on every iteration) before any author callback runs — which is why every
    yielded item shows the same quote in the output below.
    """

    name = 'quotes1'
    # Shared, class-level slot — the root cause of the duplicated quotes.
    var = None # <----

    def start_requests(self):
        # Kick off the crawl from the quotes front page.
        start_urls = ['http://quotes.toscrape.com/']
        yield scrapy.Request(url=start_urls[0], callback=self.parse)

    def parse(self, response):
        """Extract each quote and schedule a request for its author page."""

        for quote in response.css('div.quote'):
            # BUG: overwrites the one shared slot each iteration; the author
            # callbacks run later and all read the final value.
            AuthorSpider.var = quote.css('div span.text::text').get() # <----

            authShortLink = quote.css('small.author + a::attr(href)').get()
            authFullLink = response.urljoin(authShortLink)
            yield scrapy.Request(url=authFullLink, callback=self.parse_author)

        # # looping through next pages
        # nextPage = response.css('li.next a::attr(href)').get()
        # if nextPage is not None:
        #     nextPage = response.urljoin(nextPage)
        #     yield scrapy.Request(url=nextPage, callback=self.parse)

    def parse_author(self, response):
        """Yield author details plus whatever quote ``var`` holds right now."""
        def extract_with_css(query):
            # default='' avoids None.strip() when a selector misses.
            return response.css(query).get(default='').strip()

        yield {
            'name': extract_with_css('h3.author-title::text'),
            'birthdate': extract_with_css('.author-born-date::text'),
            'quote' : AuthorSpider.var
        }

Please note that, in order to allow duplicate requests, I added DUPEFILTER_CLASS = 'scrapy.dupefilters.BaseDupeFilter' to settings.py.

Output I am getting presently-

[
{"name": "Albert Einstein", "birthdate": "March 14, 1879", "quote": "u201cA day without sunshine is like, you know, night.u201d"},
{"name": "Marilyn Monroe", "birthdate": "June 01, 1926", "quote": "u201cA day without sunshine is like, you know, night.u201d"},
{"name": "Jane Austen", "birthdate": "December 16, 1775", "quote": "u201cA day without sunshine is like, you know, night.u201d"},
{"name": "Albert Einstein", "birthdate": "March 14, 1879", "quote": "u201cA day without sunshine is like, you know, night.u201d"},
{"name": "J.K. Rowling", "birthdate": "July 31, 1965", "quote": "u201cA day without sunshine is like, you know, night.u201d"},
{"name": "Albert Einstein", "birthdate": "March 14, 1879", "quote": "u201cA day without sunshine is like, you know, night.u201d"},
{"name": "Steve Martin", "birthdate": "August 14, 1945", "quote": "u201cA day without sunshine is like, you know, night.u201d"},
{"name": "Eleanor Roosevelt", "birthdate": "October 11, 1884", "quote": "u201cA day without sunshine is like, you know, night.u201d"},
{"name": "Thomas A. Edison", "birthdate": "February 11, 1847", "quote": "u201cA day without sunshine is like, you know, night.u201d"},
{"name": "Andru00e9 Gide", "birthdate": "November 22, 1869", "quote": "u201cA day without sunshine is like, you know, night.u201d"}
]

Thanks in advance!

Advertisement

Answer

Here is a minimal working solution. Both types of pagination work, and I use the meta keyword to transfer the quote text from one response to another.

import scrapy
class AuthorSpider(scrapy.Spider):
    """Working spider: each quote travels with its own request via ``meta``.

    Instead of a shared class attribute, the quote text extracted in
    ``parse`` is attached to the author-page request through ``meta``, so
    every ``parse_author`` callback reads exactly the quote that scheduled
    it — no cross-request clobbering.

    Pagination is handled up front by listing pages 1-10 in ``start_urls``.
    """

    name = 'quotes1'
    # Plain f-string: the original appended a no-op ``.format(x)`` to an
    # already-interpolated f-string.
    start_urls = [f'https://quotes.toscrape.com/page/{x}/' for x in range(1, 11)]

    def parse(self, response):
        """Extract every quote on the page and request its author page."""
        for quote in response.css('div.quote'):
            # Renamed from ``Author`` — this selector yields the quote TEXT,
            # not the author name.
            quote_text = quote.css('span.text::text').get()

            author_href = quote.css('small.author + a::attr(href)').get()
            author_url = response.urljoin(author_href)
            # Carry the quote text to the callback via request meta so each
            # item keeps its own quote (no shared state between requests).
            yield scrapy.Request(url=author_url, callback=self.parse_author,
                                 meta={'quote_text': quote_text})

    def parse_author(self, response):
        """Yield one item combining the author's details with the carried quote."""
        yield {
            # default='' prevents AttributeError (None.strip()) when the
            # title selector finds nothing.
            'Name': response.css('h3.author-title::text').get(default='').strip(),
            'Date of birth': response.css('span.author-born-date::text').get(),
            'Quote': response.meta.get('quote_text'),
            'url': response.url,
        }
Advertisement