I’m trying to scrape some data from a flight search page.
This page works this way:
You fill in a form and then click the search button – that part is fine. When you click the button you are redirected to the page with results, and here is the problem. This page keeps adding results continuously, for about a minute, which is not a big deal in itself – the problem is getting all of these results. In a real browser you have to scroll down the page for the results to appear. So I’ve tried to scroll down using Selenium. It jumps to the bottom of the page so fast (or it is a jump instead of a scroll) that the page doesn’t load any new results.
When you scroll down slowly, the page keeps loading results, but if you do it very quickly, it stops loading.
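To illustrate what I mean by scrolling slowly, here is a rough sketch of the kind of stepped scrolling I have in mind (the 500 px step and 0.5 s pause are just placeholder values I made up, not anything taken from the page):

import time

def scroll_slowly(driver, step=500, pause=0.5):
    # scroll down in small increments instead of one big jump,
    # pausing between steps so the lazy-loader has time to append results
    position = 0
    height = driver.execute_script("return document.body.scrollHeight")
    while position < height:
        position += step
        driver.execute_script("window.scrollTo(0, arguments[0]);", position)
        time.sleep(pause)
        # the page may have grown in the meantime
        height = driver.execute_script("return document.body.scrollHeight")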
I’m not sure if my code helps to understand it, so I’m attaching it.
# -*- coding: utf-8 -*-
import re

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import mLib  # my own helper module, provides getSoup_html()

SEARCH_STRING = """URL"""


class spider():
    def __init__(self):
        self.driver = webdriver.Firefox()

    @staticmethod
    def prepare_get(dep_airport, arr_airport, dep_date, arr_date):
        string = SEARCH_STRING % (dep_airport, arr_airport, arr_airport, dep_airport, dep_date, arr_date)
        return string

    def find_flights_html(self, dep_airport, arr_airport, dep_date, arr_date):
        if isinstance(dep_airport, list):
            airports_string = str(r'%20').join(dep_airport)
            dep_airport = airports_string
        wait = WebDriverWait(self.driver, 60)  # wait for results
        self.driver.get(spider.prepare_get(dep_airport, arr_airport, dep_date, arr_date))
        wait.until(EC.invisibility_of_element_located((By.XPATH, '//img[contains(@src, "loading")]')))
        # the Slovak text reads roughly: "Please be patient, we are looking for even more flights for you"
        wait.until(EC.invisibility_of_element_located((By.XPATH, u'//div[. = "Poprosíme o trpezlivosť, hľadáme pre Vás ešte viac letov"]/preceding-sibling::img')))
        self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
        self.driver.find_element_by_xpath('//body').send_keys(Keys.CONTROL + Keys.END)
        return self.driver.page_source

    @staticmethod
    def get_info_from_borderbox(div):
        arrival = div.find('div', class_='departure').text
        price = div.find('div', class_='pricebox').find('div', class_=re.compile('price'))
        departure = div.find_all('div', class_='departure')[1].contents
        date_departure = departure[1].text
        airport_departure = departure[5].text
        arrival = div.find_all('div', class_='arrival')[0].contents
        date_arrival = arrival[1].text
        airport_arrival = arrival[3].text[1:]
        print 'DEPARTURE: '
        print date_departure, airport_departure
        print 'ARRIVAL: '
        print date_arrival, airport_arrival

    @staticmethod
    def get_flights_from_result_page(html):
        def match_tag(tag, classes):
            return (tag.name == 'div' and
                    'class' in tag.attrs and
                    all([c in tag['class'] for c in classes]))

        soup = mLib.getSoup_html(html)
        divs = soup.find_all(lambda t: match_tag(t, ['borderbox', 'flightbox', 'p2']))
        for div in divs:
            spider.get_info_from_borderbox(div)
        print len(divs)


spider_inst = spider()
print spider.get_flights_from_result_page(spider_inst.find_flights_html(['BTS', 'BRU', 'PAR'], 'MAD', '2015-07-15', '2015-08-15'))
So the main problem, in my opinion, is that it scrolls down too fast to trigger the loading of new results.
Do you have any idea how to make it work?
Answer
Here is a different approach that worked for me: scroll the last search result into view, wait for additional elements to load, then scroll again:
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.support import expected_conditions as EC


class wait_for_more_than_n_elements(object):
    def __init__(self, locator, count):
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        try:
            count = len(EC._find_elements(driver, self.locator))
            return count >= self.count
        except StaleElementReferenceException:
            return False


driver = webdriver.Firefox()

dep_airport = ['BTS', 'BRU', 'PAR']
arr_airport = 'MAD'
dep_date = '2015-07-15'
arr_date = '2015-08-15'

airports_string = str(r'%20').join(dep_airport)
dep_airport = airports_string

url = "https://www.pelikan.sk/sk/flights/list?dfc=C%s&dtc=C%s&rfc=C%s&rtc=C%s&dd=%s&rd=%s&px=1000&ns=0&prc=&rng=1&rbd=0&ct=0" % (dep_airport, arr_airport, arr_airport, dep_airport, dep_date, arr_date)

driver.maximize_window()
driver.get(url)

wait = WebDriverWait(driver, 60)
wait.until(EC.invisibility_of_element_located((By.XPATH, '//img[contains(@src, "loading")]')))
wait.until(EC.invisibility_of_element_located((By.XPATH, u'//div[. = "Poprosíme o trpezlivosť, hľadáme pre Vás ešte viac letov"]/preceding-sibling::img')))

while True:  # TODO: make the endless loop end
    results = driver.find_elements_by_css_selector("div.flightbox")
    print "Results count: %d" % len(results)

    # scroll to the last element
    driver.execute_script("arguments[0].scrollIntoView();", results[-1])

    # wait for more results to load
    wait.until(wait_for_more_than_n_elements((By.CSS_SELECTOR, 'div.flightbox'), len(results)))
Notes:
- you would need to figure out when to stop the loop – for example, at a particular len(results) value (one way to do it is sketched after these notes)
- wait_for_more_than_n_elements is a custom Expected Condition which helps to identify when the next portion is loaded and we can scroll again
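Speaking of stopping the loop: a minimal sketch of one way to do it (assuming you are happy to stop once no new results show up within the wait's 60-second timeout) is to catch the TimeoutException that the wait raises when the count stops growing:

from selenium.common.exceptions import TimeoutException

while True:
    results = driver.find_elements_by_css_selector("div.flightbox")
    print "Results count: %d" % len(results)

    # scroll to the last element
    driver.execute_script("arguments[0].scrollIntoView();", results[-1])

    try:
        # if the result count stops growing before the timeout, assume we are done
        wait.until(wait_for_more_than_n_elements((By.CSS_SELECTOR, 'div.flightbox'), len(results)))
    except TimeoutException:
        break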