I am still learning Scrapy and trying to work with pipelines and ItemLoader. However, when I run the spider it reports that items.py does not exist. What exactly am I doing wrong, and why is no data getting from the spider into my pipeline? Running the spider without importing the items works fine, and the pipeline is activated in settings.py.
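(For reference, "activated in settings.py" would typically mean an `ITEM_PIPELINES` entry like the sketch below; the exact dotted path is an assumption based on the project name in the tracebacks:)

```python
# settings.py -- hypothetical entry, assuming the project package is "watches"
ITEM_PIPELINES = {
    "watches.pipelines.WatchesPipeline": 300,  # lower values run earlier
}
```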
My error log is the following:
```
Traceback (most recent call last):
  File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\Scripts\scrapy.exe\__main__.py", line 7, in <module>
  File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\cmdline.py", line 144, in execute
    cmd.crawler_process = CrawlerProcess(settings)
  File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\crawler.py", line 280, in __init__
    super().__init__(settings)
  File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\crawler.py", line 152, in __init__
    self.spider_loader = self._get_spider_loader(settings)
  File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\crawler.py", line 146, in _get_spider_loader
    return loader_cls.from_settings(settings.frozencopy())
  File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\spiderloader.py", line 67, in from_settings
    return cls(settings)
  File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\spiderloader.py", line 24, in __init__
    self._load_all_spiders()
  File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\spiderloader.py", line 51, in _load_all_spiders
    for module in walk_modules(name):
  File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\utils\misc.py", line 88, in walk_modules
    submod = import_module(fullpath)
  File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\importlib\__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 688, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 883, in exec_module
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "C:\Users\Syrix\WebCrawler\watches\watches\spiders\watchbot.py", line 5, in <module>
    from watches.watches.items import WatchesItem
ModuleNotFoundError: No module named 'watches.watches'
```
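The last frame of the traceback points at the import itself. Scrapy puts the project root (the folder containing `scrapy.cfg`) on the import path and loads the spider as `watches.spiders.watchbot`, so there is no `watches.watches` package to import from. Assuming the standard `startproject` layout, the spider's import would presumably just be:

```python
# watchbot.py: with the project root on sys.path, the inner "watches"
# package is imported directly, not doubled
from watches.items import WatchesItem
```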
My spider looks like this:
```python
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
from watches.watches.items import WatchesItem
from scrapy.exceptions import DropItem


class WatchbotSpider(scrapy.Spider):
    name = 'watchbot'
    start_urls = ['https://www.watch.de/english/rolex.html']

    def parse(self, response, **kwargs):
        for link in response.css('div.product-item-link a::attr(href)'):
            url = link.get()
            yield scrapy.Request(url, callback=self.parse_categories)

    def parse_categories(self, response):
        for product in response.xpath('//*[@id="main"]/div[2]/div[1]'):
            l = ItemLoader(item=WatchesItem(), selector=product)
            l.add_xpath('name', '//span[@itemprop="sku"]/text()')
            l.add_xpath('reference', '//span[@itemprop="sku"]/text()')
            l.add_xpath('year', '//div[@class="product-option baujahr"]/div[@class="product-option-value"]/text()')
            yield l.load_item()
```
items.py:
```python
import scrapy
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst, MapCompose
from w3lib.html import remove_tags


class WatchesItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field(input_processor=MapCompose(remove_tags, output_processor=TakeFirst()))
    reference = scrapy.Field(input_processor=MapCompose(remove_tags, output_processor=TakeFirst()))
    year = scrapy.Field(input_processor=MapCompose(remove_tags, output_processor=TakeFirst()))
```
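(A side note, separate from the import error: `output_processor=TakeFirst()` appears to be passed into `MapCompose` rather than into `scrapy.Field`. If that is the intent gone wrong, the fields were presumably meant to read as below. Note that once `TakeFirst` actually applies, loaded values become plain strings, so the pipeline's `item['name'][0]` would then pick out a single character rather than the first list element.)

```python
import scrapy
from itemloaders.processors import MapCompose, TakeFirst
from w3lib.html import remove_tags


class WatchesItem(scrapy.Item):
    # output_processor is a Field keyword, not a MapCompose argument
    name = scrapy.Field(input_processor=MapCompose(remove_tags),
                        output_processor=TakeFirst())
    reference = scrapy.Field(input_processor=MapCompose(remove_tags),
                             output_processor=TakeFirst())
    year = scrapy.Field(input_processor=MapCompose(remove_tags),
                        output_processor=TakeFirst())
```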
And last but not least, my pipeline:
```python
import mysql
import mysql.connector
from watches.watches.spiders import watchbot


class WatchesPipeline(object):
    def __init__(self):
        self.conn = mysql.connector.connect(
            host='',
            user='',
            passwd='',
            database=''
        )
        self.curr = self.conn.cursor()

    def process_item(self, item, spider):
        self.store_db(item)
        print("Pipleline = " + item['name'] + " " + item['reference'] + " " + item['year'])
        return item

    def store_db(self, item):
        self.curr.execute("""insert into test.watch values (%s, %s, %s)""", (
            item['name'][0],
            item['reference'][0],
            item['year'][0],
        ))
        self.conn.commit()
```
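One more thing worth checking in the pipeline, assuming the same layout as above: `from watches.watches.spiders import watchbot` has the same doubled package path and would fail the same way. It is also unnecessary, because Scrapy passes the running spider into `process_item`. The pipeline's imports can likely be reduced to:

```python
import mysql.connector  # provided by the mysql-connector-python package
```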
Edit:
```
PS E:\semester\webcrawler_watches\watchesCrawler> scrapy crawl watchbot
Traceback (most recent call last):
  File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "E:\semester\webcrawler_watches\venv\Scripts\scrapy.exe\__main__.py", line 7, in <module>
  File "E:\semester\webcrawler_watches\venv\lib\site-packages\scrapy\cmdline.py", line 114, in execute
    settings = get_project_settings()
  File "E:\semester\webcrawler_watches\venv\lib\site-packages\scrapy\utils\project.py", line 68, in get_project_settings
    settings.setmodule(settings_module_path, priority='project')
  File "E:\semester\6. semester\webcrawler_watches\venv\lib\site-packages\scrapy\settings\__init__.py", line 287, in setmodule
    module = import_module(module)
  File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\importlib\__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1004, in _find_and_load_unlocked
ModuleNotFoundError: No module named 'watches'
PS E:\semester\webcrawler_watches\watchesCrawler>
```
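If I read this second traceback right, `scrapy crawl` is being started from a directory where the `watches` package is not importable: `get_project_settings()` resolves the settings module named in `scrapy.cfg` (presumably `watches.settings`) relative to the current working directory. Assuming the default `startproject` layout, with folder names guessed from the paths in the traceback, the command should be run from the folder that contains `scrapy.cfg`:

```
webcrawler_watches/          # run `scrapy crawl watchbot` from here
├── scrapy.cfg               # names watches.settings as the settings module
└── watches/
    ├── __init__.py
    ├── items.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── watchbot.py
```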
Answer
It works for me. Please follow this.
```python
import mysql.connector
# from watches.watches.spiders import watchbot  # not needed, and a broken path


class WatchesPipeline(object):
    def __init__(self):
        self.conn = mysql.connector.connect(
            host='localhost',
            user='root',
            passwd='',    # your password
            database='',  # your database
        )
        self.curr = self.conn.cursor()

    # def create_table(self):
    #     self.curr.execute("""DROP TABLE IF EXISTS scrapy_tb""")
    #     self.curr.execute("""create table scrapy_tb (name text, reference text, year text)""")

    def process_item(self, item, spider):
        self.store_db(item)
        # print("Pipeline = " + item['name'] + " " + item['reference'] + " " + item['year'])
        return item

    def store_db(self, item):
        self.curr.execute("""insert into scrapy_tb values (%s, %s, %s)""", (
            item['name'][0],
            item['reference'][0],
            item['year'][0],
        ))
        self.conn.commit()
```
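One small addition worth considering: the original snippet tried to call `self.conn.close()` after a `return` statement, so that line could never run. Scrapy pipelines can release resources in `close_spider` instead; a minimal sketch, added to the same class:

```python
class WatchesPipeline(object):
    ...  # __init__ / process_item / store_db as above

    def close_spider(self, spider):
        # Scrapy calls this once when the spider finishes; a safe place
        # to release the database connection.
        self.conn.close()
```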