I am currently still learning Scrapy and trying to work with pipelines and ItemLoader.
However, I currently have the problem that the spider shows that Item.py does not exist. What exactly am I doing wrong and why am I not getting any data from the spider into my pipeline?
Running the Spider without importing the items works fine. The Pipeline is also activated in settings.py.
My Error Log is the following:
Traceback (most recent call last):
File "C:UsersSyrixAppDataLocalProgramsPythonPython310librunpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "C:UsersSyrixAppDataLocalProgramsPythonPython310librunpy.py", line 86, in _run_code
exec(code, run_globals)
File "C:UsersSyrixAppDataLocalProgramsPythonPython310Scriptsscrapy.exe__main__.py", line 7, in <module>
File "C:UsersSyrixAppDataLocalProgramsPythonPython310libsite-packagesscrapycmdline.py", line 144, in execute
cmd.crawler_process = CrawlerProcess(settings)
File "C:UsersSyrixAppDataLocalProgramsPythonPython310libsite-packagesscrapycrawler.py", line 280, in __init__
super().__init__(settings)
File "C:UsersSyrixAppDataLocalProgramsPythonPython310libsite-packagesscrapycrawler.py", line 152, in __init__
self.spider_loader = self._get_spider_loader(settings)
File "C:UsersSyrixAppDataLocalProgramsPythonPython310libsite-packagesscrapycrawler.py", line 146, in _get_spider_loader
return loader_cls.from_settings(settings.frozencopy())
File "C:UsersSyrixAppDataLocalProgramsPythonPython310libsite-packagesscrapyspiderloader.py", line 67, in from_settings
return cls(settings)
File "C:UsersSyrixAppDataLocalProgramsPythonPython310libsite-packagesscrapyspiderloader.py", line 24, in __init__
self._load_all_spiders()
File "C:UsersSyrixAppDataLocalProgramsPythonPython310libsite-packagesscrapyspiderloader.py", line 51, in _load_all_spiders
for module in walk_modules(name):
File "C:UsersSyrixAppDataLocalProgramsPythonPython310libsite-packagesscrapyutilsmisc.py", line 88, in
walk_modules
submod = import_module(fullpath)
File "C:UsersSyrixAppDataLocalProgramsPythonPython310libimportlib__init__.py", line 126, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 688, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 883, in exec_module
File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
File "C:UsersSyrixWebCrawlerwatcheswatchesspiderswatchbot.py", line 5, in <module>
from watches.watches.items import WatchesItem
ModuleNotFoundError: No module named 'watches.watches'
My Spider looks like that:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
from watches.watches.items import WatchesItem
from scrapy.exceptions import DropItem
class WatchbotSpider(scrapy.Spider):
name = 'watchbot'
start_urls = ['https://www.watch.de/english/rolex.html']
def parse(self, response, **kwargs):
for link in response.css('div.product-item-link a::attr(href)'):
url = link.get()
yield scrapy.Request(url, callback=self.parse_categories)
def parse_categories(self, response):
for product in response.xpath('//*[@id="main"]/div[2]/div[1]'):
l = ItemLoader(item=WatchesItem(), selector=product)
l.add_xpath('name', '//span[@itemprop="sku"]/text()')
l.add_xpath('reference', '//span[@itemprop="sku"]/text()')
l.add_xpath('year', '//div[@class="product-option baujahr"]/div[@class="product-option-value"]/text()')
yield l.load_item()
items.py:
import scrapy
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst, MapCompose
from w3lib.html import remove_tags
class WatchesItem(scrapy.Item):
# define the fields for your item here like:
name = scrapy.Field(input_processor = MapCompose(remove_tags, output_processor = TakeFirst()))
reference = scrapy.Field(input_processor = MapCompose(remove_tags, output_processor = TakeFirst()))
year = scrapy.Field(input_processor = MapCompose(remove_tags, output_processor = TakeFirst()))
and last but not least my pipeline:
import mysql
import mysql.connector
from watches.watches.spiders import watchbot
class WatchesPipeline(object):
def __init__(self):
self.conn = mysql.connector.connect(
host = '',
user = '',
passwd = '',
database = ''
)
self.curr = self.conn.cursor()
def process_item(self, item, spider):
self.store_db(item)
print("Pipleline = " + item['name'] + " " + item['reference'] + " " + item['year'])
return item
def store_db(self, item):
self.curr.execute("""insert into test.watch values (%s, %s, %s)""", (
item['name'][0],
item['reference'][0],
item['year'][0],
))
self.conn.commit()
Edit:
PS E:semesterwebcrawler_watcheswatchesCrawler> scrapy crawl watchbot
Traceback (most recent call last):
File "C:UsersSyrixAppDataLocalProgramsPythonPython310librunpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "C:UsersSyrixAppDataLocalProgramsPythonPython310librunpy.py", line 86, in _run_code
exec(code, run_globals)
File "E:semesterwebcrawler_watchesvenvScriptsscrapy.exe__main__.py", line 7, in <module>
File "E:semesterwebcrawler_watchesvenvlibsite-packagesscrapycmdline.py", line 114, in execute
settings = get_project_settings()
File "E:semesterwebcrawler_watchesvenvlibsite-packagesscrapyutilsproject.py", line 68, in get_project_settings
settings.setmodule(settings_module_path, priority='project')
File "E:semester6. semesterwebcrawler_watchesvenvlibsite-packagesscrapysettings__init__.py", line 287, in setmodule
module = import_module(module)
File "C:UsersSyrixAppDataLocalProgramsPythonPython310libimportlib__init__.py", line 126, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
File "<frozen importlib._bootstrap>", line 1004, in _find_and_load_unlocked
ModuleNotFoundError: No module named 'watches'
PS E:semesterwebcrawler_watcheswatchesCrawler>
Advertisement
Answer
Its work from me. Please follow this.
import mysql
import mysql.connector
# from watches.watches.spiders import watchbot
class WatchesPipeline(object):
def __init__(self):
self.conn = mysql.connector.connect(
host = 'localhost',
user = 'root',
passwd = '', # your password
database = '', # your databse
)
self.curr = self.conn.cursor()
# def create_table(self):
# self.curr.execute("""DROP TABLE IF EXISTS scrapy_tb """)
# self.curr.execute("""create table scrapy_tb (name text, reference text, year text)""")
def process_item(self, item, spider):
self.store_db(item)
# print("Pipleline = " + item['name'] + " " + item['reference'] + " " + item['year'])
return item
def store_db(self, item):
self.curr.execute("""insert into scrapy_tb values (%s, %s, %s)""",
(
item['name'][0],
item['reference'][0],
item['year'][0]
))
self.conn.commit()
return item
self.conn.close()