I am trying to scrape all the pages of a site with Selenium in Python, but I can only get the values from the first page. The code does move on to the next page, but the same scraping code then fails there with an error: "Element … is not clickable at point (208, 17). Other element would receive the click: …". Here is the code:
"""Print address and service details for the restaurants on kfc.de/find-a-kfc.

The script accepts the cookie banner, prints the details of every restaurant
on the current result page via ``page_scrape()``, then clicks the "Next page"
link until that click fails.
"""
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as E
from selenium.webdriver.support.ui import WebDriverWait as W

CHROMEDRIVER_PATH = "C:/Users/doyel/Downloads/chromedriver_win32/chromedriver.exe"
START_URL = 'https://www.kfc.de/find-a-kfc'

# XPath locators used by the script.
COOKIE_PATH = '//button[contains(@id,"onetrust-accept-btn-handler")]'
NEXT_PAGE = '//a[@aria-label="Next page"]'
ADDRESS_LOCATIONS_TEASER = '//div[contains(@class,"all-stores accordian ng-star-inserted")]'
ADDRESS_LOCATIONS = './/div[contains(@class,"accordian-header")]'
ADDRESS_BLOCK = ".//address[contains(@class, 'address-block')]"
SERVICES_TYPES_TEASER = '//div[contains(@class, "store-accordian store-accordian-flex ng-star-inserted")]'
SERVICE_TYPES = './/div[contains(@class, "store-dine-flx ng-star-inserted")]'
TYPES_OF_SERVICES = '//div[contains(@id,"divrestaurant2")]'
TYPES_OF_SERVICES_TEASER = './/div[contains(@class,"services ng-star-inserted")]'


def page_scrape():
    """Print address and service information for each restaurant on the page.

    Operates on the module-level ``driver``. Every accordion header found in
    the store list is clicked, after which the address block, the service
    sections and the service-type entries are looked up and printed.

    Raises:
        selenium.common.exceptions.WebDriverException: if an element cannot
            be located or clicked.
    """
    driver.maximize_window()

    teaser = driver.find_element(By.XPATH, ADDRESS_LOCATIONS_TEASER)
    locations = teaser.find_elements(By.XPATH, ADDRESS_LOCATIONS)

    for loc in locations:
        # NOTE(review): this is presumably the click that raises the
        # "Element ... is not clickable at point (208, 17)" error described
        # in the question once the second page is shown. It is the only
        # unguarded click executed after pagination. It is kept as posted so
        # the script still reproduces the reported problem.
        loc.find_element(By.XPATH, './/a[@href]').click()

        address = driver.find_element(By.XPATH, ADDRESS_BLOCK).text
        print(address)

        services_teaser = driver.find_element(By.XPATH, SERVICES_TYPES_TEASER)
        services_list = services_teaser.find_elements(By.XPATH, SERVICE_TYPES)

        service_root = driver.find_element(By.XPATH, TYPES_OF_SERVICES)
        service_sections = service_root.find_elements(By.XPATH, TYPES_OF_SERVICES_TEASER)

        # Start from an empty string: the original started from a list, whose
        # repr ("[]") leaked into the output when no heading came first.
        text = ''
        for section in service_sections:
            if section.text == 'Services in diesem Restaurant':
                text = "'{}':".format(section.text)
            elif section.text == 'Betreiber':
                # The original had a bare ``exit`` here, which is a no-op
                # expression; ``continue`` states the actual effect.
                continue
            else:
                text = "{0},'{1}'".format(text, section.text)
        print(text)

        for services_types in services_list:
            print(services_types.text)


# Selenium 4 takes the chromedriver location through a Service object.
driver = webdriver.Chrome(service=Service(CHROMEDRIVER_PATH))
try:
    driver.get(START_URL)
    # A comma was missing between 'Telephone' and 'Restaurant Services', which
    # silently merged them into a single column name.
    results = pd.DataFrame(columns=['address', 'PLZ', 'Telephone', 'Restaurant Services'])

    driver.find_element(By.XPATH, COOKIE_PATH).click()

    while True:
        page_scrape()
        try:
            driver.find_element(By.XPATH, NEXT_PAGE).click()
        except WebDriverException:
            # Any Selenium failure on the "Next page" click is treated as the
            # end of the result list, as in the original best-effort handler.
            print("last page reached")
            break
        print("next page")
        time.sleep(2)
finally:
    # Close the browser even when scraping stops with an exception.
    driver.quit()
Advertisement
Answer
While not trivial, it’s doable, and here is one way to do it:
"""Collect name, address and services of the restaurants on kfc.de/find-a-kfc.

The script dismisses the cookie banner, removes the page header element,
walks through up to ``MAX_PAGES`` result pages, and prints the collected rows
as a pandas DataFrame.
"""
import time as t

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm  ## if using Jupyter notebook, import as from tqdm.notebook import tqdm

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Upper bound on the number of result pages visited (the original looped over
# range(1, 21), i.e. 20 iterations).
MAX_PAGES = 20

RESTAURANT_CARDS = (
    '//div[@class="all-stores accordian ng-star-inserted"]'
    '//app-accordian[@class="card carryout-address ng-star-inserted"]'
)
ADDRESS_PARAGRAPH = '//address[@class="address-block"]/p'
SERVICE_ITEMS = '//div[@class="services ng-star-inserted"]//li'
NEXT_PAGE_LINK = '//li[@class="pagination-next ng-star-inserted"]//a[@aria-label="Next page"]'

chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument("window-size=1920,1080")

webdriver_service = Service("chromedriver/chromedriver")  ## path to where you saved chromedriver binary
driver = webdriver.Chrome(service=webdriver_service, options=chrome_options)
wait = WebDriverWait(driver, 5)

restaurant_list = []

try:
    driver.get('https://www.kfc.de/find-a-kfc')

    try:
        wait.until(EC.element_to_be_clickable((By.ID, "onetrust-reject-all-handler"))).click()
        print('dismissed cookies')
    except WebDriverException:
        print('no cookie button!')

    # Remove the <app-common-header> element from the DOM, presumably so that
    # it cannot sit on top of the elements clicked below.
    header = wait.until(EC.element_to_be_clickable((By.TAG_NAME, "app-common-header")))
    driver.execute_script("""
        var element = arguments[0];
        element.parentNode.removeChild(element);
        """, header)

    for _ in tqdm(range(MAX_PAGES)):
        kfc_rests = wait.until(EC.presence_of_all_elements_located((By.XPATH, RESTAURANT_CARDS)))
        for k in kfc_rests:
            # Reading this property scrolls the element into view.
            k.location_once_scrolled_into_view
            k.click()
            name = k.find_element(By.TAG_NAME, 'strong').text
            # NOTE(review): this lookup is page-wide, not scoped to ``k``. The
            # sample output shows the last row repeating the previous row's
            # address, which may mean a stale address block was read. Verify
            # against the live page.
            address_element = wait.until(EC.element_to_be_clickable((By.XPATH, ADDRESS_PARAGRAPH)))
            # Flatten the multi-line address onto one line. The posted code
            # had lost the backslash and replaced the letter "n" instead.
            address = address_element.text.replace('\n', ' ').strip()
            try:
                service_items = wait.until(EC.presence_of_all_elements_located((By.XPATH, SERVICE_ITEMS)))
                services = ', '.join([item.text.strip() for item in service_items])
            except WebDriverException:
                services = 'Not specified'
            restaurant_list.append((name, address, services))
        try:
            next_page = wait.until(EC.element_to_be_clickable((By.XPATH, NEXT_PAGE_LINK)))
            next_page.location_once_scrolled_into_view
            next_page.click()
        except WebDriverException:
            # No clickable "Next page" link within the wait timeout (or the
            # click failed): treat it as the end of the list.
            print('end of list')
            break
finally:
    # Close the browser whether or not scraping succeeded.
    driver.quit()

df = pd.DataFrame(restaurant_list, columns=['Name', 'Address', 'Services'])
print(df)
Result in terminal:
dismissed cookies
95% 19/20 [01:41<00:05, 5.91s/it]
end of list
     Name  Address  Services
0    KFC BERLIN  Grenzallee 37 12057 Berlin  Lieferung, Drive Thru, Free Refill, EC-Zahlung, Click & Collect
1    KFC BERLIN  Gatower Straße 56 13595 Berlin  Lieferung, Drive Thru, Free Refill, EC-Zahlung, Click & Collect
2    KFC BERLIN Mall of Berlin Leipziger Platz 12 10117 Berlin  Lieferung, Free Refill, EC-Zahlung, Click & Collect
3    KFC BERLIN  Klosterstraße 3 13581 Berlin  Lieferung, EC-Zahlung, Click & Collect
4    KFC BERLIN  Schnellerstr. 18a 12439 Berlin  Drive Thru, Free Refill, EC-Zahlung, Click & Collect
...  ...  ...  ...
191  KFC SAARBRÜCKEN  Wolfseck 6 66130 Saarbrücken  Drive Thru, Free Refill, EC-Zahlung
192  KFC SAARLOUIS  Provinzialstr. 246 66740 Saarlouis  Drive Thru, Free Refill, EC-Zahlung
193  KFC OFFENBURG  Heinrich-Hertz-Str. 3 77656 Offenburg  Drive Thru, Free Refill, EC-Zahlung
194  KFC FREIBURG  Tullastraße 68 79108 Freiburg  Lieferung, Drive Thru, Free Refill, EC-Zahlung
195  KFC FRANKFURT FLUGHAFEN Tullastraße 68 79108 Freiburg  Lieferung, Drive Thru, Free Refill, EC-Zahlung
196 rows × 3 columns
Selenium documentation can be found at: https://www.selenium.dev/documentation/
Pandas documentation: https://pandas.pydata.org/docs/
And for TQDM, go to https://pypi.org/project/tqdm/