The parent URL yields multiple nodes (quotes), and each quote node has a child URL (the author info page). I am having trouble linking each quote to its author info, presumably because of Scrapy's asynchronous nature.
How can I fix this? Here is the code so far; I added a # <---- comment for easy spotting.
import scrapy


class AuthorSpider(scrapy.Spider):
    name = 'quotes1'
    var = None  # <----

    def start_requests(self):
        start_urls = ['http://quotes.toscrape.com/']
        yield scrapy.Request(url=start_urls[0], callback=self.parse)

    def parse(self, response):
        for quote in response.css('div.quote'):
            AuthorSpider.var = quote.css('div span.text::text').get()  # <----
            authShortLink = quote.css('small.author + a::attr(href)').get()
            authFullLink = response.urljoin(authShortLink)
            yield scrapy.Request(url=authFullLink, callback=self.parse_author)

        # # looping through next pages
        # nextPage = response.css('li.next a::attr(href)').get()
        # if nextPage is not None:
        #     nextPage = response.urljoin(nextPage)
        #     yield scrapy.Request(url=nextPage, callback=self.parse)

    def parse_author(self, response):
        def extract_with_css(query):
            return response.css(query).get(default='').strip()

        yield {
            'name': extract_with_css('h3.author-title::text'),
            'birthdate': extract_with_css('.author-born-date::text'),
            'quote': AuthorSpider.var
        }
Please note that, in order to allow duplicate requests (several quotes point to the same author page), I added DUPEFILTER_CLASS = 'scrapy.dupefilters.BaseDupeFilter' to settings.py.
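(For reference, a per-request alternative, sketched here and not part of the original post, is to pass dont_filter=True on the author request instead of disabling the dupe filter globally:)

    # Sketch: skip Scrapy's duplicate filter for this request only,
    # instead of setting DUPEFILTER_CLASS in settings.py.
    yield scrapy.Request(
        url=authFullLink,
        callback=self.parse_author,
        dont_filter=True,
    )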
The output I am getting at present:
[
{"name": "Albert Einstein", "birthdate": "March 14, 1879", "quote": "u201cA day without sunshine is like, you know, night.u201d"},
{"name": "Marilyn Monroe", "birthdate": "June 01, 1926", "quote": "u201cA day without sunshine is like, you know, night.u201d"},
{"name": "Jane Austen", "birthdate": "December 16, 1775", "quote": "u201cA day without sunshine is like, you know, night.u201d"},
{"name": "Albert Einstein", "birthdate": "March 14, 1879", "quote": "u201cA day without sunshine is like, you know, night.u201d"},
{"name": "J.K. Rowling", "birthdate": "July 31, 1965", "quote": "u201cA day without sunshine is like, you know, night.u201d"},
{"name": "Albert Einstein", "birthdate": "March 14, 1879", "quote": "u201cA day without sunshine is like, you know, night.u201d"},
{"name": "Steve Martin", "birthdate": "August 14, 1945", "quote": "u201cA day without sunshine is like, you know, night.u201d"},
{"name": "Eleanor Roosevelt", "birthdate": "October 11, 1884", "quote": "u201cA day without sunshine is like, you know, night.u201d"},
{"name": "Thomas A. Edison", "birthdate": "February 11, 1847", "quote": "u201cA day without sunshine is like, you know, night.u201d"},
{"name": "Andru00e9 Gide", "birthdate": "November 22, 1869", "quote": "u201cA day without sunshine is like, you know, night.u201d"}
]
Thanks in advance!
Answer
Here is a minimal working solution. Both types of pagination work, and I use the meta keyword to transfer the quote from one response to the other. (Your class variable is overwritten on every loop iteration before the author requests are processed, which is why every item ends up with the last quote scraped; attaching the quote to each request avoids that.)
import scrapy


class AuthorSpider(scrapy.Spider):
    name = 'quotes1'
    start_urls = [f'https://quotes.toscrape.com/page/{x}/' for x in range(1, 11)]

    def parse(self, response):
        for quote in response.css('div.quote'):
            Author = quote.css('span.text::text').get()  # <---- the quote text
            authShortLink = quote.css('small.author + a::attr(href)').get()
            authFullLink = response.urljoin(authShortLink)
            yield scrapy.Request(url=authFullLink, callback=self.parse_author,
                                 meta={'Author': Author})

        # # looping through next pages
        # nextPage = response.css('li.next a::attr(href)').get()
        # abs_url = f'http://quotes.toscrape.com/{nextPage}'
        # yield scrapy.Request(url=abs_url, callback=self.parse)

    def parse_author(self, response):
        quote = response.meta.get('Author')
        yield {
            'Name': response.css('h3.author-title::text').get().strip(),
            'Date of birth': response.css('span.author-born-date::text').get(),
            'Quote': quote,
            'url': response.url,
        }
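As a side note (a sketch, not part of the original answer): on Scrapy 1.7+ the docs recommend cb_kwargs over meta for passing data to callbacks, since meta is also used by middlewares for internal keys. The same hand-off with cb_kwargs, assuming the rest of the spider above stays unchanged, would look roughly like this:

    # Sketch: pass the quote via cb_kwargs; it arrives in parse_author
    # as a regular keyword argument.
    def parse(self, response):
        for quote in response.css('div.quote'):
            text = quote.css('span.text::text').get()
            authFullLink = response.urljoin(
                quote.css('small.author + a::attr(href)').get())
            yield scrapy.Request(url=authFullLink, callback=self.parse_author,
                                 cb_kwargs={'quote_text': text})

    def parse_author(self, response, quote_text):
        yield {
            'Name': response.css('h3.author-title::text').get().strip(),
            'Date of birth': response.css('span.author-born-date::text').get(),
            'Quote': quote_text,
            'url': response.url,
        }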