I am trying to run a Scrapy script with Splash in order to scrape a JavaScript-based web page, but with no results. When I execute the script with the python command, I get this error: crochet._eventloop.TimeoutError. In addition, the print statement in the parse method is never executed, so I suspect something is wrong with the SplashRequest. The code I wrote to implement this is:
import logging
import scrapy
from scrapy import signals
from scrapy.crawler import CrawlerRunner
from scrapy.item import Item, Field
from scrapy.signalmanager import dispatcher
from scrapy_splash import SplashRequest
from crochet import setup, wait_for

setup()
# logging.getLogger('scrapy').propagate = False


class GooglePatentsSpider(scrapy.spiders.Spider):
    name = "google_patents_spider"
    allowed_domains = ['patents.google.com']

    script = '''
    function main(splash, args)
        splash.private_mode_enabled = false
        assert(splash:go(args.url))
        splash:wait(5)
        return splash:html()
    end
    '''

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(GooglePatentsSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.item_scraped, signal=signals.item_scraped)
        return spider

    def item_scraped(self, item):
        return item

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(
                url=url,
                callback=self.parse,
                endpoint='execute',
                args={'lua_source': self.script}
            )

    def parse(self, response):
        print('from parse')
        item = {}
        item['status'] = 'Hello world'
        return item


@wait_for(timeout=50.0)
async def run_spider():
    """Returns all the scraped items of the provided publication number"""
    results = []

    def crawler_results(signal, sender, item, response, spider):
        results.append(item)

    dispatcher.connect(crawler_results, signal=signals.item_scraped)

    runner = CrawlerRunner(settings={
        'BOT_NAME': 'web_page_crawler',
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
        'ROBOTSTXT_OBEY': False,
        'SPLASH_URL': 'http://192.168.59.103:8050',
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
        },
        'SPIDER_MIDDLEWARES': {
            'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
        },
        'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
        'HTTPCACHE_STORAGE': 'scrapy_splash.SplashAwareFSCacheStorage'
    })

    await runner.crawl(GooglePatentsSpider, start_urls=[f'https://patents.google.com/?q=CL%3dgenistein'])

    if results:
        return results[0]
    else:
        return 'This publication number cannot be retrieved'


run_spider()
The full traceback:
Traceback (most recent call last):
  File "hits_scraper.py", line 89, in <module>
    run_spider()
  File "/home/shared/projects/siftlink/scrapers/.scrapers-api/lib/python3.8/site-packages/crochet/_eventloop.py", line 461, in wrapper
    return eventual_result.wait(timeout)
  File "/home/shared/projects/siftlink/scrapers/.scrapers-api/lib/python3.8/site-packages/crochet/_eventloop.py", line 196, in wait
    result = self._result(timeout)
  File "/home/shared/projects/siftlink/scrapers/.scrapers-api/lib/python3.8/site-packages/crochet/_eventloop.py", line 175, in _result
    raise TimeoutError()
crochet._eventloop.TimeoutError
Answer
I got the same error when I didn't start Splash before running the code.
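For reference, Splash is usually started with the standard command from its documentation (the 8050 port mapping here matches the default port used in 'SPLASH_URL'):

docker run -it -p 8050:8050 --rm scrapinghub/splash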
If I run Splash (as a Docker image), I also get this error when the container has a different IP than the one set in 'SPLASH_URL', but with the correct IP it works.
On Linux I got the IP of the running container using
docker inspect --format '{{ .NetworkSettings.IPAddress }}' $(docker ps -q)
but it seems the code also works with the wildcard address 0.0.0.0:
'SPLASH_URL': 'http://0.0.0.0:8050'
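Applied to the script above, only the 'SPLASH_URL' entry needs to change. A minimal sketch (all other settings stay exactly as in the question):

runner = CrawlerRunner(settings={
    # ... keep all other settings from the question unchanged ...
    'SPLASH_URL': 'http://0.0.0.0:8050',  # or the IP reported by docker inspect
})

Once the URL points at a running Splash instance, you can sanity-check it by opening that address in a browser: a running instance serves Splash's web UI there.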