The parent URL contains multiple nodes (quotes), and each quote node links to a child URL (the author's info page). I am having trouble linking each quote to its author info. Is this due to the asynchronous nature of Scrapy?
How can I fix this issue? Here's the code so far; I added `# <----` comments to make the relevant lines easy to spot.
Python
x
36
36
1
import scrapy
2
3
class AuthorSpider(scrapy.Spider):
    """Visit the author page behind every quote on the first listing page.

    Known flaw (the subject of the question): the quote text is kept in the
    single class attribute ``var``. ``parse`` reassigns it for each quote,
    while ``parse_author`` reads it only when an author response is handled,
    so, as the posted output shows, every item ends up with the same quote.
    """

    name = 'quotes1'
    var = None  # <----

    def start_requests(self):
        """Seed the crawl with the first listing page."""
        yield scrapy.Request(url='http://quotes.toscrape.com/', callback=self.parse)

    def parse(self, response):
        """Request the author page linked from each quote block."""
        for quote_block in response.css('div.quote'):
            # One slot shared by all requests: each iteration replaces the
            # value stored by the previous one.
            AuthorSpider.var = quote_block.css('div span.text::text').get()  # <----

            author_href = quote_block.css('small.author + a::attr(href)').get()
            yield scrapy.Request(
                url=response.urljoin(author_href),
                callback=self.parse_author,
            )

        # Pagination is currently switched off:
        # nextPage = response.css('li.next a::attr(href)').get()
        # if nextPage is not None:
        #     nextPage = response.urljoin(nextPage)
        #     yield scrapy.Request(url=nextPage, callback=self.parse)

    def parse_author(self, response):
        """Emit the author's name and birth date plus the shared quote slot."""
        author_name = response.css('h3.author-title::text').get(default='').strip()
        born_on = response.css('.author-born-date::text').get(default='').strip()

        yield {
            'name': author_name,
            'birthdate': born_on,
            # Read from the class attribute, not from anything tied to this
            # particular response.
            'quote': AuthorSpider.var,
        }
35
36
Please note that, in order to allow duplicate requests, I added DUPEFILTER_CLASS = 'scrapy.dupefilters.BaseDupeFilter'
to settings.py.
This is the output I am currently getting:
JSON
1
13
13
1
[
2
{"name": "Albert Einstein", "birthdate": "March 14, 1879", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"},
3
{"name": "Marilyn Monroe", "birthdate": "June 01, 1926", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"},
4
{"name": "Jane Austen", "birthdate": "December 16, 1775", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"},
5
{"name": "Albert Einstein", "birthdate": "March 14, 1879", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"},
6
{"name": "J.K. Rowling", "birthdate": "July 31, 1965", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"},
7
{"name": "Albert Einstein", "birthdate": "March 14, 1879", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"},
8
{"name": "Steve Martin", "birthdate": "August 14, 1945", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"},
9
{"name": "Eleanor Roosevelt", "birthdate": "October 11, 1884", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"},
10
{"name": "Thomas A. Edison", "birthdate": "February 11, 1847", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"},
11
{"name": "Andr\u00e9 Gide", "birthdate": "November 22, 1869", "quote": "\u201cA day without sunshine is like, you know, night.\u201d"}
12
]
13
Thanks in advance!
Advertisement
Answer
Here is a minimal working solution. Both types of pagination work, and I use the `meta` keyword argument to pass the quote from one response to the next.
Python
1
28
28
1
import scrapy
2
class AuthorSpider(scrapy.Spider):
    """Scrape each quote together with details from its author's page.

    The quote text is attached to the author-page request through
    ``Request.meta``, so it travels with that specific request/response pair
    instead of living in state shared by all callbacks.
    """

    name = 'quotes1'
    # Listing pages 1-10 are requested up front.
    start_urls = [f'https://quotes.toscrape.com/page/{page}/' for page in range(1, 11)]

    def parse(self, response):
        """Yield one author-page request per quote found on a listing page.

        Args:
            response: Response for a quotes listing page.

        Yields:
            scrapy.Request: Request for the quote's author page, carrying the
            quote text in ``meta['Author']``.
        """
        for quote in response.css('div.quote'):
            quote_text = quote.css('span.text::text').get()
            author_href = quote.css('small.author + a::attr(href)').get()
            if author_href is None:
                # Without a link there is no author page to pair the quote with.
                self.logger.warning('No author link for a quote on %s', response.url)
                continue

            yield scrapy.Request(
                url=response.urljoin(author_href),
                callback=self.parse_author,
                # The key is named 'Author', but the value is the quote text.
                meta={'Author': quote_text},
                # Several quotes can point at the same author page; the
                # default duplicate filter would drop the repeated requests
                # and those quotes would be lost.
                dont_filter=True,
            )

        # Alternative pagination: start from the first page only and follow
        # the "next" link instead of listing every page in start_urls.
        # next_page = response.css('li.next a::attr(href)').get()
        # if next_page is not None:
        #     yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)

    def parse_author(self, response):
        """Build the final item from an author page and the carried quote.

        Args:
            response: Response for an author page; ``response.meta['Author']``
                holds the quote text set by ``parse``.

        Yields:
            dict: Item with the keys ``Name``, ``Date of birth``, ``Quote``
            and ``url``.
        """
        # default='' keeps .strip() from failing when the title node is absent.
        author_name = response.css('h3.author-title::text').get(default='').strip()

        yield {
            'Name': author_name,
            'Date of birth': response.css('span.author-born-date::text').get(),
            'Quote': response.meta.get('Author'),
            'url': response.url,
        }
27
28