I am still learning Scrapy and trying to work with pipelines and ItemLoader. However, I currently have the problem that the spider reports that items.py does not exist. What exactly am I doing wrong, and why am I not getting any data from the spider into my pipeline? Running the spider without importing the items works fine. The pipeline is also activated in settings.py.
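For reference, the activation is the usual ITEM_PIPELINES entry (the priority value 300 is just the conventional example; the module path assumes the project package is named watches):

ITEM_PIPELINES = {
    'watches.pipelines.WatchesPipeline': 300,
}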
My error log is the following:
Traceback (most recent call last):
  File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\Scripts\scrapy.exe\__main__.py", line 7, in <module>
  File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\cmdline.py", line 144, in execute
    cmd.crawler_process = CrawlerProcess(settings)
  File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\crawler.py", line 280, in __init__
    super().__init__(settings)
  File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\crawler.py", line 152, in __init__
    self.spider_loader = self._get_spider_loader(settings)
  File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\crawler.py", line 146, in _get_spider_loader
    return loader_cls.from_settings(settings.frozencopy())
  File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\spiderloader.py", line 67, in from_settings
    return cls(settings)
  File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\spiderloader.py", line 24, in __init__
    self._load_all_spiders()
  File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\spiderloader.py", line 51, in _load_all_spiders
    for module in walk_modules(name):
  File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\utils\misc.py", line 88, in walk_modules
    submod = import_module(fullpath)
  File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\importlib\__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 688, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 883, in exec_module
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "C:\Users\Syrix\WebCrawler\watches\watches\spiders\watchbot.py", line 5, in <module>
    from watches.watches.items import WatchesItem
ModuleNotFoundError: No module named 'watches.watches'
My spider looks like this:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
from watches.watches.items import WatchesItem
from scrapy.exceptions import DropItem


class WatchbotSpider(scrapy.Spider):
    name = 'watchbot'
    start_urls = ['https://www.watch.de/english/rolex.html']

    def parse(self, response, **kwargs):
        for link in response.css('div.product-item-link a::attr(href)'):
            url = link.get()
            yield scrapy.Request(url, callback=self.parse_categories)

    def parse_categories(self, response):
        for product in response.xpath('//*[@id="main"]/div[2]/div[1]'):
            l = ItemLoader(item=WatchesItem(), selector=product)
            l.add_xpath('name', '//span[@itemprop="sku"]/text()')
            l.add_xpath('reference', '//span[@itemprop="sku"]/text()')
            l.add_xpath('year', '//div[@class="product-option baujahr"]/div[@class="product-option-value"]/text()')

            yield l.load_item()
items.py:
import scrapy
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst, MapCompose
from w3lib.html import remove_tags


class WatchesItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field(input_processor=MapCompose(remove_tags, output_processor=TakeFirst()))
    reference = scrapy.Field(input_processor=MapCompose(remove_tags, output_processor=TakeFirst()))
    year = scrapy.Field(input_processor=MapCompose(remove_tags, output_processor=TakeFirst()))
And last but not least, my pipeline:
import mysql
import mysql.connector
from watches.watches.spiders import watchbot


class WatchesPipeline(object):

    def __init__(self):
        self.conn = mysql.connector.connect(
            host='',
            user='',
            passwd='',
            database=''
        )
        self.curr = self.conn.cursor()

    def process_item(self, item, spider):
        self.store_db(item)
        print("Pipleline = " + item['name'] + " " + item['reference'] + " " + item['year'])
        return item

    def store_db(self, item):
        self.curr.execute("""insert into test.watch values (%s, %s, %s)""", (
            item['name'][0],
            item['reference'][0],
            item['year'][0],
        ))
        self.conn.commit()
Edit:
PS E:\semester\webcrawler_watches\watchesCrawler> scrapy crawl watchbot
Traceback (most recent call last):
  File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "E:\semester\webcrawler_watches\venv\Scripts\scrapy.exe\__main__.py", line 7, in <module>
  File "E:\semester\webcrawler_watches\venv\lib\site-packages\scrapy\cmdline.py", line 114, in execute
    settings = get_project_settings()
  File "E:\semester\webcrawler_watches\venv\lib\site-packages\scrapy\utils\project.py", line 68, in get_project_settings
    settings.setmodule(settings_module_path, priority='project')
  File "E:\semester\6. semester\webcrawler_watches\venv\lib\site-packages\scrapy\settings\__init__.py", line 287, in setmodule
    module = import_module(module)
  File "C:\Users\Syrix\AppData\Local\Programs\Python\Python310\lib\importlib\__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1004, in _find_and_load_unlocked
ModuleNotFoundError: No module named 'watches'
PS E:\semester\webcrawler_watches\watchesCrawler>
Answer
It works for me. Please follow this. The import of the spider module is commented out below: the pipeline never needs to import its spider, and the watches.watches prefix would fail there just as it does in the spider.
import mysql.connector
# from watches.watches.spiders import watchbot


class WatchesPipeline(object):

    def __init__(self):
        self.conn = mysql.connector.connect(
            host='localhost',
            user='root',
            passwd='',     # your password
            database=''    # your database
        )
        self.curr = self.conn.cursor()

    # def create_table(self):
    #     self.curr.execute("""DROP TABLE IF EXISTS scrapy_tb""")
    #     self.curr.execute("""create table scrapy_tb (name text, reference text, year text)""")

    def process_item(self, item, spider):
        self.store_db(item)
        # print("Pipeline = " + item['name'] + " " + item['reference'] + " " + item['year'])
        return item

    def store_db(self, item):
        self.curr.execute("""insert into scrapy_tb values (%s, %s, %s)""",
                          (
                              item['name'][0],
                              item['reference'][0],
                              item['year'][0]
                          ))
        self.conn.commit()

    def close_spider(self, spider):
        # Scrapy calls this once when the spider finishes; close the connection here
        self.conn.close()
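That alone does not cure the traceback, though: the ModuleNotFoundError comes from the watches.watches import prefix itself. When scrapy crawl runs from the directory containing scrapy.cfg, Scrapy puts the project root on the import path, so project modules are addressed with a single watches, as in watches.items and watches.pipelines. The default layout from scrapy startproject watches looks like this (the directory names above scrapy.cfg are your own):

webcrawler_watches/
├── scrapy.cfg
└── watches/
    ├── __init__.py
    ├── items.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        └── watchbot.py

A minimal sketch of the corrected spider imports, assuming that layout:

# watchbot.py -- import relative to the project root, not its parent
import scrapy
from scrapy.loader import ItemLoader
from watches.items import WatchesItem   # not watches.watches.items

The second traceback ("No module named 'watches'") points the same way: get_project_settings() found a scrapy.cfg, but the settings module it names could not be imported, which usually means the command was run in a folder that does not match the layout above. Run scrapy crawl watchbot from the folder that contains scrapy.cfg. One last detail, unrelated to the import error: in items.py the closing parenthesis of MapCompose(...) is misplaced, so output_processor=TakeFirst() is passed to MapCompose instead of to scrapy.Field. Each field should read:

# items.py -- output_processor belongs to the Field, not to MapCompose
name = scrapy.Field(input_processor=MapCompose(remove_tags), output_processor=TakeFirst())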