I’m quite new to web scraping. I’m trying to crawl a novel reader website to get the novel info and the chapter content, so I do it by creating two spiders: one to fetch the novel information and another to fetch the content of each chapter.
import scrapy


class BookSpider(scrapy.Spider):
    name = "book"

    def __init__(self, books=[], **kwargs):
        if isinstance(books, str):
            books = [books]
        self.start_urls = [f'https://daonovel.com/novel/{book}/' for book in sorted(books)]
        super().__init__(**kwargs)

    def parse(self, response):
        # self.remove_content(response.css("div.post-title h1 span"))
        fullurl = response.url
        url = fullurl.split("/")[-2]

        title = response.css("div.post-title h1::text").extract()
        title = title[len(title) - 1].strip()

        authors = response.css('div.author-content a::text').getall()
        genres = response.css('div.genres-content a::text').getall()
        release = response.css('div.post-status div.post-content_item:nth-child(1) div.summary-content::text').get().strip()
        status = response.css('div.post-status div.post-content_item:nth-child(2) div.summary-content::text').get().strip()
        summary = response.css('div.summary__content p').getall()

        chapters = response.css('ul.version-chap li a::attr(href)').extract()
        chapters.reverse()

        return {
            'fullurl': fullurl,
            'url': url,
            'title': title,
            'authors': authors,
            'genres': genres,
            'status': status,
            'release': release,
            'summary': summary,
            'chapters': chapters
        }


class ChapterSpider(scrapy.Spider):
    name = "chapter"

    def __init__(self, book="", chapters=[], **kwargs):
        if isinstance(chapters, str):
            chapters = [chapters]
        self.book = book
        self.start_urls = [f'https://daonovel.com/novel/{book}/{chapter}/' for chapter in chapters]
        super().__init__(**kwargs)

    def parse(self, response):
        title = response.css("ol.breadcrumb li.active::text").get().strip()
        container = response.css("div.cha-words p").getall() if response.css("div.cha-words p").getall() else response.css("div.text-left p").getall()
        content = []
        for p in container:
            content.append(str(p))

        return {
            'title': title,
            'content': content,
            'book_url': self.book,
            'url': response.url.split("/")[-2]
        }
After that, I created a collector to collect and process all of the data from the spiders.
from scrapy import signals


class Collector():
    def __init__(self, process, books=[]):
        self.process = process
        if isinstance(books, str):
            books = [books]
        self.books = books
        self.books_data = []

    def create_crawler(self, spider, function, **kwargs):
        # we need a Crawler instance to access signals
        crawler = self.process.create_crawler(spider)
        crawler.signals.connect(function, signal=signals.item_scraped)
        x = self.process.crawl(crawler, **kwargs)
        return x

    def process_book_data(self, item, response, spider):
        item['authors'] = [author.strip() for author in item['authors']]
        item['genres'] = [genre.strip() for genre in item['genres']]

        summary = [line for line in item['summary'] if not any(word in line.lower() for word in ("wuxiaworld", "disclaimer"))]
        item['summary'] = "\n".join(summary)

        item['chapters'] = [chapter.replace(item['fullurl'], '').replace('/', '') for chapter in item['chapters']]
        self.books_data.append(item)

    def process_chapter_data(self, item, response, spider):
        item['content'] = "\n".join(item['content'])
        for book in self.books_data:
            if book['url'] == item['book_url']:
                book['chapters'][book['chapters'].index(item['url'])] = item

    def crawl_books(self):
        return self.create_crawler(BookSpider, self.process_book_data, books=self.books)

    def crawl_chapters(self, book, chapters):
        return self.create_crawler(ChapterSpider, self.process_chapter_data, book=book, chapters=chapters)
If I put the chapters in manually before process.start(), like this:
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess()
collector = Collector(process, books="a-stay-at-home-dads-restaurant-in-an-alternate-world")
collector.crawl_books()
collector.crawl_chapters("a-stay-at-home-dads-restaurant-in-an-alternate-world",
                         ['chapter-1', 'chapter-2', 'chapter-3', 'chapter-4', 'chapter-5'])  # chapters put in manually
process.start()

for book in collector.books_data:
    for k, v in book.items():
        print(k, v)
it works, but that isn’t the purpose of this script.
Now my question is: how do I make the chapter spider run after the book spider has finished collecting its data? Here is my attempt, which didn’t work:
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess()
collector = Collector(process, books="a-stay-at-home-dads-restaurant-in-an-alternate-world")
collector.crawl_books()
process.start()

print(collector.books_data)  # this works

for book in collector.books_data:
    collector.crawl_chapters(book['url'], book['chapters'])  # this didn't work

print("Chapters ==>", collector.books_data)
If I add another process.start() before the final print("Chapters ==>", collector.books_data), it raises twisted.internet.error.ReactorNotRestartable.
I’ve read the SO question Scrapy – Reactor not Restartable, but I didn’t know how to apply it to my code.
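For reference, the approach suggested there is to use CrawlerRunner and chain the crawls with Twisted deferreds instead of starting the reactor twice. This is a minimal sketch of how I understand it, with the chapter list still hard-coded, because I don't see how to feed the data collected by the first spider into the second crawl:

from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

configure_logging()
runner = CrawlerRunner()

@defer.inlineCallbacks
def crawl():
    # each yield waits for the previous crawl to finish before the next one starts
    yield runner.crawl(BookSpider, books="a-stay-at-home-dads-restaurant-in-an-alternate-world")
    yield runner.crawl(ChapterSpider, book="a-stay-at-home-dads-restaurant-in-an-alternate-world",
                       chapters=['chapter-1', 'chapter-2'])
    reactor.stop()

crawl()
reactor.run()  # the script blocks here until both crawls are finished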
Answer
I’d suggest changing the spider architecture, since Scrapy isn’t meant to chain spiders (it’s possible, of course, but it’s bad practice in general); it’s meant to chain requests within the same spider.
Your problem is caused by the fact that Scrapy is designed to produce a flat list of items, while you need a nested one, like book = {'title': ..., 'chapters': [{some chapter data}, ...]}.
I’d suggest the following architecture for your spider:
def parse(self, response):
    # parse book data here
    book_item = {
        'fullurl': fullurl,
        'url': url,
        'title': title,
        'authors': authors,
        'genres': genres,
        'status': status,
        'release': release,
        'summary': summary,
        'chapters': []
    }

    chapter_urls = [...]  # list of the book's chapter urls goes here
    chapter_url = chapter_urls.pop()
    yield Request(
        url=chapter_url,
        callback=self.parse_chapter,
        meta={'book': book_item, 'chapter_urls': chapter_urls}
    )

def parse_chapter(self, response):
    book = response.meta['book']
    chapter_urls = response.meta['chapter_urls']

    # parse chapter data here
    chapter = {
        'title': title,
        'content': content,
        'book_url': book['url'],
        'url': response.url.split("/")[-2]
    }
    book['chapters'].append(chapter)

    if not chapter_urls:
        yield book
    else:
        chapter_url = chapter_urls.pop()
        yield Request(
            url=chapter_url,
            callback=self.parse_chapter,
            meta={'book': book, 'chapter_urls': chapter_urls}
        )
This will produce book entities with the chapters nested inside.
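In case it helps, here is a rough sketch of how such a spider could be driven and its nested items collected, reusing the item_scraped signal the same way your Collector does. The class name NovelSpider is just a placeholder for a spider that contains the parse/parse_chapter methods above and accepts a books argument like your BookSpider:

from scrapy import signals
from scrapy.crawler import CrawlerProcess

books_data = []

def collect_book(item, response, spider):
    # each scraped item is already a complete book with its chapters nested inside
    books_data.append(item)

process = CrawlerProcess()
crawler = process.create_crawler(NovelSpider)  # placeholder spider holding parse/parse_chapter from above
crawler.signals.connect(collect_book, signal=signals.item_scraped)
process.crawl(crawler, books="a-stay-at-home-dads-restaurant-in-an-alternate-world")
process.start()

for book in books_data:
    print(book['title'], len(book['chapters']))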
Hope it helps, even though it’s not an exact answer to your question. Good luck (:
Second edit:
class YourSpider(Spider):
    books = {}

    ...

    def parse(self, response):
        # Get book info here.
        book_item = {
            'fullurl': fullurl,
            'url': url,
            'title': title,
            'authors': authors,
            'genres': genres,
            'status': status,
            'release': release,
            'summary': summary,
            'chapters': []
        }
        self.books[book_item['title']] = book_item

        chapter_urls = [...]  # list of chapter urls
        # This will trigger multiple requests asynchronously
        for chapter_url in chapter_urls:
            yield scrapy.Request(
                url=chapter_url,
                callback=self.parse_chapter,
                meta={'book_title': book_item['title']}
            )

    def parse_chapter(self, response):
        book_title = response.meta['book_title']

        # parse chapter data here
        chapter = {
            'title': title,
            'content': content,
            'book_url': self.books[book_title]['url'],
            'url': response.url.split("/")[-2]
        }
        self.books[book_title]['chapters'].append(chapter)
        yield self.books[book_title]
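One caveat with this second version: parse_chapter yields the book after every chapter, so the same, still partially filled, book item gets emitted several times. If you only want the complete book once, one option is to count the outstanding chapter requests and yield the item when the last one arrives. A sketch of that variation, where the pending dict is my addition and the ... placeholders stand for the same parsing code as above:

import scrapy


class YourSpider(scrapy.Spider):
    name = "novel"
    books = {}
    pending = {}  # number of chapters still to be scraped, per book title

    def parse(self, response):
        book_item = {'title': ..., 'chapters': []}  # build the full book dict as above
        self.books[book_item['title']] = book_item

        chapter_urls = [...]  # list of chapter urls
        self.pending[book_item['title']] = len(chapter_urls)

        for chapter_url in chapter_urls:
            yield scrapy.Request(
                url=chapter_url,
                callback=self.parse_chapter,
                meta={'book_title': book_item['title']}
            )

    def parse_chapter(self, response):
        book_title = response.meta['book_title']
        chapter = {'title': ..., 'url': response.url.split("/")[-2]}  # build the chapter dict as above
        self.books[book_title]['chapters'].append(chapter)

        self.pending[book_title] -= 1
        if self.pending[book_title] == 0:
            # the last chapter of this book has arrived; emit the complete item once
            yield self.books[book_title]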