I’m trying to scrape the reviews from this link:
To do that, I’m using the following code to load the page:
import argparse
import datetime
import os
import time

from selenium import webdriver

# Page to load. The URL is hard-coded; no command-line arguments are parsed.
url = "https://www.google.com/search?q=google+reviews+2nd+chance+treatment+40th+street&rlz=1C1JZAP_enUS697US697&oq=google+reviews+2nd+chance+treatment+40th+street&aqs=chrome..69i57j69i64.6183j0j7&sourceid=chrome&ie=UTF-8#lrd=0x872b7179b68e33d5:0x24b5517d86a95f89,1"

# Firefox profile with an overridden user agent. It is only consumed by the
# commented-out Firefox driver below; the Chrome driver does not use it.
# driver = webdriver.Chromium()
profile = webdriver.FirefoxProfile()
profile.set_preference(
    "general.useragent.override",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko",
)
# driver = webdriver.Firefox(profile)

# Initialize the Chrome webdriver and open the URL.
# https://stackoverflow.com/questions/22476112/using-chromedriver-with-selenium-python-ubuntu
driver = webdriver.Chrome("/usr/lib/chromium-browser/chromedriver")
driver.get(url)
driver.implicitly_wait(2)

# Seconds to pause after each scroll so the page can load more content.
SCROLL_PAUSE_TIME = 0.5

# Get the initial scroll height of the document body.
last_height = driver.execute_script("return document.body.scrollHeight")

while True:
    # Scroll down to the bottom of the document body.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Wait for the page to load.
    time.sleep(SCROLL_PAUSE_TIME)

    # Stop once scrolling no longer increases the body height.
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
The page loads fine, but it does not scroll down. I have used the same code on other sites, such as LinkedIn, and it works there.
Advertisement
Answer
Here is logic that you can use without the JavaScript scroll. It is simple and effective: it uses the location_once_scrolled_into_view
property, which scrolls the element into view.
In the logic below, we scroll to the last review and then check whether the desired number of reviews has been loaded.
Imports Needed:
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
Change the value of the desiredReviewsCount
variable in the code below to suit your requirements.
# XPath matching every review element inside the general reviews block.
REVIEW_XPATH = (
    "//div[@class='gws-localreviews__general-reviews-block']"
    "//div[@class='WMbnJf gws-localreviews__google-review']"
)

wait = WebDriverWait(driver, 10)
url = "https://www.google.com/search?q=google+reviews+2nd+chance+treatment+40th+street&rlz=1C1JZAP_enUS697US697&oq=google+reviews+2nd+chance+treatment+40th+street&aqs=chrome..69i57j69i64.6183j0j7&sourceid=chrome&ie=UTF-8#lrd=0x872b7179b68e33d5:0x24b5517d86a95f89,1"
driver.get(url)

# Number of reviews to load before stopping.
desiredReviewsCount = 30

# Block until the first batch of reviews is present in the DOM.
wait.until(EC.presence_of_all_elements_located((By.XPATH, REVIEW_XPATH)))

x = 0
while x < desiredReviewsCount:
    reviews = driver.find_elements(By.XPATH, REVIEW_XPATH)
    if not reviews:
        break
    # Reading this property scrolls the last loaded review into view.
    reviews[-1].location_once_scrolled_into_view

    # Wait until at least one more review has appeared, or the desired count
    # is already reached. If nothing new shows up within the wait timeout,
    # stop instead of looping forever on a page with too few reviews.
    target = min(len(reviews) + 1, desiredReviewsCount)
    try:
        wait.until(
            lambda d: len(d.find_elements(By.XPATH, REVIEW_XPATH)) >= target
        )
    except TimeoutException:
        break
    x = len(driver.find_elements(By.XPATH, REVIEW_XPATH))

print(len(driver.find_elements(By.XPATH, REVIEW_XPATH)))