Skip to content
Advertisement

scrape data using selenium

The program runs fine, but it scrapes only one title. I want it to scrape all the titles on the page. This is the page link: https://www.eurobike.com/en/index-exhibitors/exhibitors/?

import time
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Chrome command-line switches for the scraper's browser session, applied in
# the order listed.
options = webdriver.ChromeOptions()
for _flag in (
    "--headless",
    "--no-sandbox",
    "--disable-gpu",
    "--window-size=1920x1080",
    "--disable-extensions",
):
    options.add_argument(_flag)

# webdriver_manager's install() supplies the chromedriver path handed to Service.
_driver_path = ChromeDriverManager().install()
chrome_driver = webdriver.Chrome(service=Service(_driver_path), options=options)

def supplyvan_scraper():
    """Print the title of every exhibitor linked from the Eurobike index page.

    Uses the module-level ``chrome_driver``. The listing page is opened once,
    the detail-page URL of every exhibitor card is collected, and each detail
    page is then visited in turn so its ``h1.underlined`` heading can be
    printed.

    The ``with`` block closes the browser on exit, so this function can be
    called only once per ``chrome_driver`` instance.

    Raises:
        selenium.common.exceptions.TimeoutException: if no exhibitor links
            appear on the listing page within 20 seconds.
        selenium.common.exceptions.NoSuchElementException: if a detail page
            has no ``h1.underlined`` element within the implicit wait.
    """
    URL = 'https://www.eurobike.com/en/index-exhibitors/exhibitors/?'

    # Leaving the ``with`` block quits the driver; no explicit quit() needed.
    with chrome_driver as driver:
        driver.implicitly_wait(15)
        driver.get(URL)
        # Give the page's JavaScript time to inject the consent banner.
        time.sleep(3)

        # The cookie-consent button lives inside an open shadow root, which
        # regular Selenium locators do not search. Click it through JavaScript
        # when it is present; do nothing when the banner is not shown.
        # NOTE(review): the listing reportedly renders only after consent is
        # given - confirm against the live site.
        driver.execute_script('''
            const host = document.querySelector('div#usercentrics-root');
            const root = host ? host.shadowRoot : null;
            const button = root
                ? root.querySelector('button[data-testid="uc-accept-all-button"]')
                : null;
            if (button) { button.click(); }
        ''')

        cards = WebDriverWait(driver, 20).until(
            EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, '.list__results > div > div > a')
            )
        )

        # Read every href *before* navigating: once the driver leaves the
        # listing page the card elements become stale.
        hrefs = (card.get_attribute('href') for card in cards)
        links = [href for href in hrefs if href]

        for link in links:
            driver.get(link)
            # The implicit wait covers the time the heading needs to render.
            title = driver.find_element(By.CSS_SELECTOR, 'h1.underlined').text
            print(title)


# Run the scraper when this script is executed.
supplyvan_scraper()

Advertisement

Answer

The website is populated entirely by complex JavaScript. First of all, to display the listing from this URL, accepting the cookies is a must. However, clicking the cookie button isn't an easy task, because the button sits under an open shadow root. Selenium's locators and WebDriverWait cannot reach into a shadow root, so to get at the button you need to run a JavaScript querySelector through the shadow root.

Full Working code:

import time
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup


options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
# Keep Chrome open after the script ends so the result can be inspected;
# comment this line out to let the browser close.
options.add_experimental_option("detach", True)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
URL = 'https://www.eurobike.com/en/index-exhibitors/exhibitors/?'
driver.get(URL)
time.sleep(5)

# The cookie-consent button sits inside an open shadow root, so it is located
# with a JavaScript querySelector and then clicked.
driver.execute_script('''return document.querySelector('div#usercentrics-root').shadowRoot.querySelector('button[data-testid="uc-accept-all-button"]')''').click()

# Collect every listing URL first; the detail pages are requested afterwards,
# once the listing page's elements are no longer needed.
cards = WebDriverWait(driver, 20).until(
    EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.list__results > div > div > a'))
)
hrefs = (card.get_attribute('href') for card in cards)
# Drop anchors without an href so driver.get() is never handed None.
links = [href for href in hrefs if href]

for u in links:
    driver.get(u)
    time.sleep(5)
    # Parse the rendered page with BeautifulSoup instead of issuing further
    # Selenium lookups.
    soup = BeautifulSoup(driver.page_source, 'lxml')
    heading = soup.select_one('h1.underlined')
    if heading is None:
        # select_one() returns None when nothing matches; report the page and
        # move on instead of crashing the whole run with an AttributeError.
        print(f"[no title found] {u}")
        continue
    title = heading.get_text(strip=True)
    print(title)

Output:

 ANGLE is
A&C Solutions
A&J International Co.,Ltd
                        (Taiwan Branch)
A-Pro Tech Co., LTD
A-Rim Ent. Co., Ltd.
Abbey Bike Tools
ABIMOTA
                        Associacao Nacional das Industrias 
                        de Duas Rodas, Ferragens, Mobiliári
ABIMOTA
                        Associacao Nacional das Industrias 
                        de Duas Rodas, Ferragens, Mobiliári
ABUS |August Bremicker Söhne KG
ABUS |August Bremicker Söhne KG
Accelerated Systems Inc. (ASI)
ACCORD ENTERPRISE CORP.
Acer Gadget Inc.
Acetrikes Industrial Co., Ltd.
ACT LAB LLC
ACTIA
Action Sports SRL
Activent 365 s.r.o.
ADAC e.V.
ADD-ONE
AddBike
AddRE-Mo
                        (Electric Bike Solutions GmbH)
ADFC e. V.
Adhestick Innovations Ltd. (Joe's No Flats)
ADViTEX GMBH
Äike
AER Electric Company Ltd
                        King Edward House
Aero Sensor Ltd
Aeroe Limited
Aforge Enterprise Co., Ltd
Agentura REPRO spol. s r.o.
Advertisement