The parent URL yields multiple nodes (quotes), and each quote node has a child URL (the author page). I am having trouble linking each quote to its author info, presumably due to the asynchronous nature of Scrapy.
How can I fix this? Here's the code so far; I've added # <--- comments to mark the relevant lines.
import scrapy


class AuthorSpider(scrapy.Spider):
    name = 'quotes1'
    var = None  # <----

    def start_requests(self):
        start_urls = ['http://quotes.toscrape.com/']
        yield scrapy.Request(url=start_urls[0], callback=self.parse)

    def parse(self, response):
        for quote in response.css('div.quote'):
            AuthorSpider.var = quote.css('div span.text::text').get()  # <----

            authShortLink = quote.css('small.author + a::attr(href)').get()
            authFullLink = response.urljoin(authShortLink)
            yield scrapy.Request(url=authFullLink, callback=self.parse_author)

        # # looping through next pages
        # nextPage = response.css('li.next a::attr(href)').get()
        # if nextPage is not None:
        #     nextPage = response.urljoin(nextPage)
        #     yield scrapy.Request(url=nextPage, callback=self.parse)

    def parse_author(self, response):
        def extract_with_css(query):
            return response.css(query).get(default='').strip()

        yield {
            'name': extract_with_css('h3.author-title::text'),
            'birthdate': extract_with_css('.author-born-date::text'),
            'quote': AuthorSpider.var,
        }
Please note that, in order to allow duplicate requests, I added DUPEFILTER_CLASS = 'scrapy.dupefilters.BaseDupeFilter' to settings.py.
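For context, that is just a single line in the project's settings.py; a minimal sketch (the rest of the file is untouched):

# settings.py
# Disables Scrapy's default duplicate-request filter, so the same author
# URL can be requested once per quote instead of being skipped.
DUPEFILTER_CLASS = 'scrapy.dupefilters.BaseDupeFilter'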
The output I am currently getting:
[ {"name": "Albert Einstein", "birthdate": "March 14, 1879", "quote": "u201cA day without sunshine is like, you know, night.u201d"}, {"name": "Marilyn Monroe", "birthdate": "June 01, 1926", "quote": "u201cA day without sunshine is like, you know, night.u201d"}, {"name": "Jane Austen", "birthdate": "December 16, 1775", "quote": "u201cA day without sunshine is like, you know, night.u201d"}, {"name": "Albert Einstein", "birthdate": "March 14, 1879", "quote": "u201cA day without sunshine is like, you know, night.u201d"}, {"name": "J.K. Rowling", "birthdate": "July 31, 1965", "quote": "u201cA day without sunshine is like, you know, night.u201d"}, {"name": "Albert Einstein", "birthdate": "March 14, 1879", "quote": "u201cA day without sunshine is like, you know, night.u201d"}, {"name": "Steve Martin", "birthdate": "August 14, 1945", "quote": "u201cA day without sunshine is like, you know, night.u201d"}, {"name": "Eleanor Roosevelt", "birthdate": "October 11, 1884", "quote": "u201cA day without sunshine is like, you know, night.u201d"}, {"name": "Thomas A. Edison", "birthdate": "February 11, 1847", "quote": "u201cA day without sunshine is like, you know, night.u201d"}, {"name": "Andru00e9 Gide", "birthdate": "November 22, 1869", "quote": "u201cA day without sunshine is like, you know, night.u201d"} ]
Thanks in advance!
Answer
Here is a minimal working solution. Both types of pagination work, and I use the meta keyword to transfer the quote item from one response to the next.
import scrapy


class AuthorSpider(scrapy.Spider):
    name = 'quotes1'
    start_urls = [f'https://quotes.toscrape.com/page/{x}/' for x in range(1, 11)]

    def parse(self, response):
        for quote in response.css('div.quote'):
            Author = quote.css('span.text::text').get()  # <----
            authShortLink = quote.css('small.author + a::attr(href)').get()
            authFullLink = response.urljoin(authShortLink)
            # Pass the quote text along with the request via meta so it is
            # available in the author callback.
            yield scrapy.Request(url=authFullLink, callback=self.parse_author,
                                 meta={'Author': Author})

        # # looping through next pages
        # nextPage = response.css('li.next a::attr(href)').get()
        # abs_url = f'http://quotes.toscrape.com/{nextPage}'
        # yield scrapy.Request(url=abs_url, callback=self.parse)

    def parse_author(self, response):
        quote = response.meta.get('Author')
        yield {
            'Name': response.css('h3.author-title::text').get().strip(),
            'Date of birth': response.css('span.author-born-date::text').get(),
            'Quote': quote,
            'url': response.url,
        }
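As a side note, Scrapy 1.7+ also offers cb_kwargs for passing data between callbacks, which delivers the value as a keyword argument of the callback instead of via response.meta. A rough sketch of the same spider using that approach (the quote_text argument name is arbitrary):

import scrapy


class AuthorSpider(scrapy.Spider):
    name = 'quotes1'
    start_urls = [f'https://quotes.toscrape.com/page/{x}/' for x in range(1, 11)]

    def parse(self, response):
        for quote in response.css('div.quote'):
            text = quote.css('span.text::text').get()
            author_url = response.urljoin(
                quote.css('small.author + a::attr(href)').get())
            # cb_kwargs hands the quote text straight to parse_author
            # as a keyword argument.
            yield scrapy.Request(author_url, callback=self.parse_author,
                                 cb_kwargs={'quote_text': text})

    def parse_author(self, response, quote_text):
        yield {
            'Name': response.css('h3.author-title::text').get().strip(),
            'Date of birth': response.css('span.author-born-date::text').get(),
            'Quote': quote_text,
            'url': response.url,
        }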