I'm having an issue with XMLFeedSpider. The parsing works in the Scrapy shell, so the problem seems to be with either the request or how the spider is engaged. Whether I add a start_requests() method or not, I get the same error, and no output_file.csv is produced after running the spider. I can get a scrapy.Spider and a CrawlSpider to work, but I can't figure out what I'm doing wrong with the XMLFeedSpider.
This is the spider:
Python

from ..items import TheItem
from scrapy.loader import ItemLoader
import scrapy
from scrapy.crawler import CrawlerProcess


class TheSpider(scrapy.spiders.XMLFeedSpider):
    name = 'stuff_spider'
    allowed_domains = ['www.website.net']
    start_urls = ['https://www.website.net/10016/stuff/otherstuff.xml']
    namespaces = [('xsi', 'https://schemas.website.net/xml/uslm'), ]
    itertag = 'xsi:item'
    iterator = 'xml'

    def start_requests(self):
        yield scrapy.Request('https://www.website.net/10016/stuff/otherstuff.xml', callback=self.parse_node)

    def parse_node(self, response, node):
        l = ItemLoader(item=TheItem(), selector=node, response=response)

        just_want_something = 'just want the csv to show some output'

        l.add_xpath('title', response.xpath('//xsi:title/text()').extract())
        l.add_xpath('date', response.xpath('//xsi:date/text()').extract())
        l.add_xpath('category', node.xpath('//xsi:cat1/text()').extract())
        l.add_value('content', node.xpath('//xsi:content/text()'))
        l.add_value('manditory', just_want_something)

        yield l.load_item()


process = CrawlerProcess(settings={
    'FEEDS': 'output_file.csv',
    'FEED_FORMAT': 'csv',
    'DOWNLOAD_DELAY': 1.25,
    'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0'
})

process.crawl(TheSpider)
process.start()
This is the item:
Python

from scrapy import Item, Field
from itemloaders.processors import Identity, Compose


def all_lower(value):
    return value.lower()


class TheItem(Item):
    title = Field(
        input_processor=Compose(all_lower),
        output_processor=Identity()
    )
    link = Field(
        input_processor=Compose(all_lower),
        output_processor=Identity()
    )
    date = Field(
        input_processor=Compose(all_lower),
        output_processor=Identity()
    )
    category = Field(
        input_processor=Compose(all_lower),
        output_processor=Identity()
    )
    manditory = Field(
        input_processor=Compose(all_lower),
        output_processor=Identity()
    )
This is the output:
D:\GitFolder\scrapyProjects\TheProject\venv\Scripts\python.exe D:\GitFolder\scrapyProjects\TheProject\TheSpider\TheSpider\spiders\TheSpider.py
Traceback (most recent call last):
  File "D:\GitFolder\scrapyProjects\TheProject\TheSpider\TheSpider\spiders\TheSpider.py", line 43, in <module>
    process = CrawlerProcess(settings={
  File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\crawler.py", line 289, in __init__
    super().__init__(settings)
  File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\crawler.py", line 164, in __init__
    settings = Settings(settings)
  File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\settings\__init__.py", line 454, in __init__
    self.update(values, priority)
  File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\settings\__init__.py", line 323, in update
    self.set(name, value, priority)
  File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\settings\__init__.py", line 265, in set
    self.attributes[name].set(value, priority)
  File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\settings\__init__.py", line 50, in set
    value = BaseSettings(value, priority=priority)
  File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\settings\__init__.py", line 86, in __init__
    self.update(values, priority)
  File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\settings\__init__.py", line 316, in update
    values = json.loads(values)
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.2032.0_x64__qbz5n2kfra8p0\lib\json\__init__.py", line 346, in loads
    return _default_decoder.decode(s)
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.2032.0_x64__qbz5n2kfra8p0\lib\json\decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.2032.0_x64__qbz5n2kfra8p0\lib\json\decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)

Process finished with exit code 1
And if I remove the start_requests() method, I get exactly the same traceback. Both versions ultimately end up with the same JSONDecodeError.
Answer
According to https://docs.scrapy.org/en/latest/topics/feed-exports.html#feeds, the FEEDS setting must be a dict, not a string. Like:
Python

process = CrawlerProcess(settings={
    "FEEDS": {
        "items.json": {"format": "json"},
    },
})
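Applied to the spider above, a minimal sketch of the corrected bottom of the script might look like this. It keeps the question's output_file.csv target, download delay, and user agent; the per-feed "format" option replaces the separate FEED_FORMAT setting, which newer Scrapy versions deprecate in favour of FEEDS:

Python

# Sketch: same settings as in the question, but with FEEDS as a dict.
# The feed URI is the dict key and its options (such as the format)
# go in the nested dict, so FEED_FORMAT is no longer needed.
process = CrawlerProcess(settings={
    'FEEDS': {
        'output_file.csv': {'format': 'csv'},
    },
    'DOWNLOAD_DELAY': 1.25,
    'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0',
})

process.crawl(TheSpider)
process.start()

The traceback also shows why the string version fails: Scrapy hands a FEEDS value to json.loads() when it is a plain string, and 'output_file.csv' is not valid JSON, which is exactly the "Expecting value: line 1 column 1" error above.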