I'm having an issue with XMLFeedSpider. I can get the parsing to work in the scrapy shell, so it seems there is something going on with either the request or the spider's setup. Whether I add a start_requests()
method or not, I seem to get the same error.
No output_file.csv
is produced after running the spider.
I am able to get a scrapy.Spider and CrawlSpider to work, but can’t seem to figure out what I am doing wrong with the XMLFeedSpider.
This is the spider:
from ..items import TheItem from scrapy.loader import ItemLoader import scrapy from scrapy.crawler import CrawlerProcess class TheSpider(scrapy.spiders.XMLFeedSpider): name = 'stuff_spider' allowed_domains = ['www.website.net'] start_urls = ['https://www.website.net/10016/stuff/otherstuff.xml'] namespaces = [('xsi', 'https://schemas.website.net/xml/uslm'), ] itertag = 'xsi:item' iterator = 'xml' def start_requests(self): yield scrapy.Request('https://www.website.net/10016/stuff/otherstuff.xml', callback=self.parse_node) def parse_node(self, response, node): l = ItemLoader(item=TheItem(), selector=node, response=response) just_want_something = 'just want the csv to show some output' l.add_xpath('title', response.xpath('//xsi:title/text()').extract()) l.add_xpath('date', response.xpath('//xsi:date/text()').extract()) l.add_xpath('category', node.xpath('//xsi:cat1/text()').extract()) l.add_value('content', node.xpath('//xsi:content/text()')) l.add_value('manditory', just_want_something) yield l.load_item() process = CrawlerProcess(settings={ 'FEEDS': 'output_file.csv', 'FEED_FORMAT': 'csv', 'DOWNLOAD_DELAY': 1.25, 'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0' }) process.crawl(TheSpider) process.start()
This is the item:
from scrapy import Item, Field from itemloaders.processors import Identity, Compose def all_lower(value): return value.lower() class TheItem(Item): title = Field( input_processor=Compose(all_lower), output_processor=Identity() ) link = Field( input_processor=Compose(all_lower), output_processor=Identity() ) date = Field( input_processor=Compose(all_lower), output_processor=Identity() ) category = Field( input_processor=Compose(all_lower), output_processor=Identity() ) manditory = Field( input_processor=Compose(all_lower), output_processor=Identity() )
This is the output:
D:\GitFolder\scrapyProjects\TheProject\venv\Scripts\python.exe D:\GitFolder\scrapyProjects\TheProject\TheSpider\TheSpider\spiders\TheSpider.py
Traceback (most recent call last):
  File "D:\GitFolder\scrapyProjects\TheProject\TheSpider\TheSpider\spiders\TheSpider.py", line 43, in <module>
    process = CrawlerProcess(settings={
  File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\crawler.py", line 289, in __init__
    super().__init__(settings)
  File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\crawler.py", line 164, in __init__
    settings = Settings(settings)
  File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\settings\__init__.py", line 454, in __init__
    self.update(values, priority)
  File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\settings\__init__.py", line 323, in update
    self.set(name, value, priority)
  File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\settings\__init__.py", line 265, in set
    self.attributes[name].set(value, priority)
  File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\settings\__init__.py", line 50, in set
    value = BaseSettings(value, priority=priority)
  File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\settings\__init__.py", line 86, in __init__
    self.update(values, priority)
  File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\settings\__init__.py", line 316, in update
    values = json.loads(values)
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.2032.0_x64__qbz5n2kfra8p0\lib\json\__init__.py", line 346, in loads
    return _default_decoder.decode(s)
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.2032.0_x64__qbz5n2kfra8p0\lib\json\decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.2032.0_x64__qbz5n2kfra8p0\lib\json\decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)

Process finished with exit code 1
And if I remove the start_requests()
method, I get this output:
D:\GitFolder\scrapyProjects\TheProject\venv\Scripts\python.exe D:\GitFolder\scrapyProjects\TheProject\TheSpider\TheSpider\spiders\TheSpider.py
Traceback (most recent call last):
  File "D:\GitFolder\scrapyProjects\TheProject\TheSpider\TheSpider\spiders\TheSpider.py", line 43, in <module>
    process = CrawlerProcess(settings={
  File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\crawler.py", line 289, in __init__
    super().__init__(settings)
  File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\crawler.py", line 164, in __init__
    settings = Settings(settings)
  File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\settings\__init__.py", line 454, in __init__
    self.update(values, priority)
  File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\settings\__init__.py", line 323, in update
    self.set(name, value, priority)
  File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\settings\__init__.py", line 265, in set
    self.attributes[name].set(value, priority)
  File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\settings\__init__.py", line 50, in set
    value = BaseSettings(value, priority=priority)
  File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\settings\__init__.py", line 86, in __init__
    self.update(values, priority)
  File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\settings\__init__.py", line 316, in update
    values = json.loads(values)
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.2032.0_x64__qbz5n2kfra8p0\lib\json\__init__.py", line 346, in loads
    return _default_decoder.decode(s)
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.2032.0_x64__qbz5n2kfra8p0\lib\json\decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.2032.0_x64__qbz5n2kfra8p0\lib\json\decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)

Process finished with exit code 1
Both ultimately end up with the same error.
Advertisement
Answer
According to https://docs.scrapy.org/en/latest/topics/feed-exports.html#feeds, the `FEEDS` setting must be a dict, not a string. Like:
process = CrawlerProcess(settings={ "FEEDS": { "items.json": {"format": "json"}, }, })