Skip to content
Advertisement

Scrapy can’t find items

I am currently still learning Scrapy and trying to work with pipelines and ItemLoader.

However, I currently have the problem that the spider shows that Item.py does not exist. What exactly am I doing wrong and why am I not getting any data from the spider into my pipeline?

Running the Spider without importing the items works fine. The Pipeline is also activated in settings.py.

My Error Log is the following:

Traceback (most recent call last):
  File "C:UsersSyrixAppDataLocalProgramsPythonPython310librunpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:UsersSyrixAppDataLocalProgramsPythonPython310librunpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "C:UsersSyrixAppDataLocalProgramsPythonPython310Scriptsscrapy.exe__main__.py", line 7, in <module> 
  File "C:UsersSyrixAppDataLocalProgramsPythonPython310libsite-packagesscrapycmdline.py", line 144, in execute
    cmd.crawler_process = CrawlerProcess(settings)
  File "C:UsersSyrixAppDataLocalProgramsPythonPython310libsite-packagesscrapycrawler.py", line 280, in __init__
    super().__init__(settings)
  File "C:UsersSyrixAppDataLocalProgramsPythonPython310libsite-packagesscrapycrawler.py", line 152, in __init__
    self.spider_loader = self._get_spider_loader(settings)
  File "C:UsersSyrixAppDataLocalProgramsPythonPython310libsite-packagesscrapycrawler.py", line 146, in _get_spider_loader
    return loader_cls.from_settings(settings.frozencopy())
  File "C:UsersSyrixAppDataLocalProgramsPythonPython310libsite-packagesscrapyspiderloader.py", line 67, in from_settings
    return cls(settings)
  File "C:UsersSyrixAppDataLocalProgramsPythonPython310libsite-packagesscrapyspiderloader.py", line 24, in __init__
    self._load_all_spiders()
  File "C:UsersSyrixAppDataLocalProgramsPythonPython310libsite-packagesscrapyspiderloader.py", line 51, in _load_all_spiders
    for module in walk_modules(name):
  File "C:UsersSyrixAppDataLocalProgramsPythonPython310libsite-packagesscrapyutilsmisc.py", line 88, in 
walk_modules
    submod = import_module(fullpath)
  File "C:UsersSyrixAppDataLocalProgramsPythonPython310libimportlib__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 688, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 883, in exec_module
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "C:UsersSyrixWebCrawlerwatcheswatchesspiderswatchbot.py", line 5, in <module>
    from watches.watches.items import WatchesItem
ModuleNotFoundError: No module named 'watches.watches'

My Spider looks like that:

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
from watches.watches.items import WatchesItem
from scrapy.exceptions import DropItem


class WatchbotSpider(scrapy.Spider):
    name = 'watchbot'
    start_urls = ['https://www.watch.de/english/rolex.html']


    def parse(self, response, **kwargs):
        for link in response.css('div.product-item-link a::attr(href)'):
            url = link.get()
            yield scrapy.Request(url, callback=self.parse_categories)


    def parse_categories(self, response):
        for product in response.xpath('//*[@id="main"]/div[2]/div[1]'):
            l = ItemLoader(item=WatchesItem(), selector=product)
            l.add_xpath('name', '//span[@itemprop="sku"]/text()')
            l.add_xpath('reference', '//span[@itemprop="sku"]/text()')
            l.add_xpath('year', '//div[@class="product-option baujahr"]/div[@class="product-option-value"]/text()')

            yield l.load_item()

items.py:

import scrapy
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst, MapCompose
from w3lib.html import remove_tags


class WatchesItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field(input_processor = MapCompose(remove_tags, output_processor = TakeFirst()))
    reference = scrapy.Field(input_processor = MapCompose(remove_tags, output_processor = TakeFirst()))
    year = scrapy.Field(input_processor = MapCompose(remove_tags, output_processor = TakeFirst()))

and last but not least my pipeline:

import mysql
import mysql.connector
from watches.watches.spiders import watchbot

class WatchesPipeline(object):

    def __init__(self):
        self.conn = mysql.connector.connect(
            host = '',
            user = '',
            passwd = '',
            database = ''
        )
        self.curr = self.conn.cursor()


    def process_item(self, item, spider):
        self.store_db(item)
        print("Pipleline = " + item['name'] + " " + item['reference'] + " " + item['year'])
        return item

    def store_db(self, item):
        self.curr.execute("""insert into test.watch values (%s, %s, %s)""", (
            item['name'][0],
            item['reference'][0],
            item['year'][0],
        ))
        self.conn.commit()

Edit:

PS E:semesterwebcrawler_watcheswatchesCrawler> scrapy crawl watchbot
Traceback (most recent call last):
  File "C:UsersSyrixAppDataLocalProgramsPythonPython310librunpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:UsersSyrixAppDataLocalProgramsPythonPython310librunpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "E:semesterwebcrawler_watchesvenvScriptsscrapy.exe__main__.py", line 7, in <module>
  File "E:semesterwebcrawler_watchesvenvlibsite-packagesscrapycmdline.py", line 114, in execute
    settings = get_project_settings()
  File "E:semesterwebcrawler_watchesvenvlibsite-packagesscrapyutilsproject.py", line 68, in get_project_settings
    settings.setmodule(settings_module_path, priority='project')
  File "E:semester6. semesterwebcrawler_watchesvenvlibsite-packagesscrapysettings__init__.py", line 287, in setmodule
    module = import_module(module)
  File "C:UsersSyrixAppDataLocalProgramsPythonPython310libimportlib__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1004, in _find_and_load_unlocked
ModuleNotFoundError: No module named 'watches'
PS E:semesterwebcrawler_watcheswatchesCrawler>

Advertisement

Answer

Its work from me. Please follow this.

import mysql
import mysql.connector
# from watches.watches.spiders import watchbot

class WatchesPipeline(object):

    def __init__(self):
        self.conn = mysql.connector.connect(
            host = 'localhost', 
            user = 'root',
            passwd = '', # your password
            database = '', # your databse
           
        )
        self.curr = self.conn.cursor()


    # def create_table(self):
    #     self.curr.execute("""DROP TABLE IF EXISTS scrapy_tb """)
    #     self.curr.execute("""create table scrapy_tb (name text, reference text, year text)""")


    def process_item(self, item, spider):
        self.store_db(item)
        # print("Pipleline = " + item['name'] + " " + item['reference'] + " " + item['year'])
        return item

    def store_db(self, item):
        self.curr.execute("""insert into scrapy_tb values (%s, %s, %s)""", 
            (
            item['name'][0],
            item['reference'][0],
            item['year'][0]
        ))
        self.conn.commit()
        return item
        self.conn.close()
User contributions licensed under: CC BY-SA
6 People found this is helpful
Advertisement