I have two spiders inheriting from a parent spider class as follows:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.crawler import CrawlerProcess


class SpiderOpTest(CrawlSpider):
    custom_settings = {
        "USER_AGENT": "*",
        "LOG_LEVEL": "WARNING",
        "DOWNLOADER_MIDDLEWARES": {'scraper_scrapy.odds.middlewares.SeleniumMiddleware': 543},
    }

    httperror_allowed_codes = [301]

    def parse_tournament(self, response):
        print(f"Parsing tournament - {response.url}")

    def parse_tournament_page(self, response):
        print(f"Parsing tournament page - {response.url}")


class SpiderOpTest1(SpiderOpTest):
    name = "test_1"
    start_urls = ["https://www.oddsportal.com/tennis/argentina/atp-buenos-aires/results/"]

    rules = (Rule(LinkExtractor(allow="/page/"), callback="parse_tournament_page"),)


class SpiderOpTest2(SpiderOpTest):
    name = "test_2"
    start_urls = ["https://www.oddsportal.com/tennis/results/"]

    rules = (
        Rule(LinkExtractor(allow="/atp-buenos-aires/results/"), callback="parse_tournament", follow=True),
        Rule(LinkExtractor(allow="/page/"), callback="parse_tournament_page"),
    )


process = CrawlerProcess()
process.crawl(<spider_class>)
process.start()
The parse_tournament_page callback for the Rule in the first spider works fine. However, the second spider only runs the parse_tournament callback from the first Rule, despite the fact that the second Rule is the same as in the first spider and operates on the same page.
I’m clearly missing something really simple but for the life of me I can’t figure out what it is…
As key bits of the pages load via JavaScript, it might be useful for me to include the Selenium middleware I'm using:
from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver


class SeleniumMiddleware:
    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signals.spider_opened)
        crawler.signals.connect(middleware.spider_closed, signals.spider_closed)
        return middleware

    def process_request(self, request, spider):
        self.driver.get(request.url)
        return HtmlResponse(
            self.driver.current_url,
            body=self.driver.page_source,
            encoding='utf-8',
            request=request,
        )

    def spider_opened(self, spider):
        options = webdriver.FirefoxOptions()
        options.add_argument("--headless")
        self.driver = webdriver.Firefox(options=options)

    def spider_closed(self, spider):
        self.driver.close()
Edit:
So I've managed to create a third spider which is able to execute the parse_tournament_page callback from inside parse_tournament:
class SpiderOpTest3(SpiderOpTest):
    name = "test_3"
    start_urls = ["https://www.oddsportal.com/tennis/results/"]
    httperror_allowed_codes = [301]

    rules = (
        Rule(
            LinkExtractor(allow="/atp-buenos-aires/results/"),
            callback="parse_tournament",
            follow=True,
        ),
    )

    def parse_tournament(self, response):
        print(f"Parsing tournament - {response.url}")

        xtr = LinkExtractor(allow="/page/")
        links = xtr.extract_links(response)

        for p in links:
            yield response.follow(p.url, dont_filter=True, callback=self.parse_tournament_page)

    def parse_tournament_page(self, response):
        print(f"Parsing tournament PAGE - {response.url}")
The key here seems to be dont_filter=True: if this is left as the default False, then the parse_tournament_page callback isn't executed. This suggests Scrapy is somehow interpreting the second page as a duplicate, which as far as I can tell it isn't. That aside, from what I've read, if I want to get around this then I need to add unique=False to the LinkExtractor. However, doing this doesn't result in the parse_tournament_page callback executing :(
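(For reference, that attempt amounts to the one-line change below inside parse_tournament of the third spider; I'm only including it to show what I mean by adding unique=False.)

xtr = LinkExtractor(allow="/page/", unique=False)  # callback still isn't executed without dont_filter=True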
Update:
So I think I've found the source of the issue. From what I can tell, the request_fingerprint method of RFPDupeFilter creates the same hash for https://www.oddsportal.com/tennis/argentina/atp-buenos-aires/results/ as for https://www.oddsportal.com/tennis/argentina/atp-buenos-aires/results/#/page/2/. From reading around, it seems I need to subclass RFPDupeFilter to reconfigure the way request_fingerprint works. Any advice on why the same hashes are being generated and/or tips on how to do the subclassing correctly would be greatly appreciated!
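A quick way to reproduce the collision outside the spider (a sketch, assuming the request_fingerprint in your Scrapy version accepts the keep_fragments argument):

from scrapy import Request
from scrapy.utils.request import request_fingerprint

r1 = Request("https://www.oddsportal.com/tennis/argentina/atp-buenos-aires/results/")
r2 = Request("https://www.oddsportal.com/tennis/argentina/atp-buenos-aires/results/#/page/2/")

# Fragments are stripped by default, so both URLs hash to the same fingerprint
print(request_fingerprint(r1) == request_fingerprint(r2))  # True

# Keeping the fragment makes the fingerprints differ
print(request_fingerprint(r1, keep_fragments=True) ==
      request_fingerprint(r2, keep_fragments=True))  # False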
Answer
The difference between the two URLs mentioned in the update is in the fragment #/page/2/. Scrapy ignores fragments by default (from scrapy/utils/request.py):

    "Also, servers usually ignore fragments in urls when handling requests, so they are
    also ignored by default when calculating the fingerprint. If you want to include
    them, set the keep_fragments argument to True (for instance when handling requests
    with a headless browser)."
Check the DUPEFILTER_CLASS setting for more information.
The request_fingerprint from scrapy.utils.request can already handle fragments; when subclassing RFPDupeFilter, pass keep_fragments=True to it.
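A minimal sketch of such a subclass (the class name is a placeholder, and this assumes a Scrapy version where RFPDupeFilter computes fingerprints via its request_fingerprint method):

from scrapy.dupefilters import RFPDupeFilter
from scrapy.utils.request import request_fingerprint


class FragmentAwareDupeFilter(RFPDupeFilter):
    def request_fingerprint(self, request):
        # Include the URL fragment (e.g. #/page/2/) when hashing, so the
        # paginated URLs are no longer treated as duplicates.
        return request_fingerprint(request, keep_fragments=True)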
Add your class to the custom_settings of SpiderOpTest.
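For example, something along these lines (the dotted path is a placeholder; point it at the module where you put the subclass):

custom_settings = {
    "USER_AGENT": "*",
    "LOG_LEVEL": "WARNING",
    "DOWNLOADER_MIDDLEWARES": {'scraper_scrapy.odds.middlewares.SeleniumMiddleware': 543},
    # hypothetical path, based on the project layout implied by the middleware setting
    "DUPEFILTER_CLASS": "scraper_scrapy.odds.dupefilters.FragmentAwareDupeFilter",
}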