I am trying to scrape shopee.co.id using BeautifulSoup and Selenium. There are 60 products on a single search results page. At the end of the code, I checked the extracted data using len(), and it shows that I extracted only 42 of them. How should I fix the code to obtain all the search results?
Here is the code that I’ve been trying:
# Script from the question: load a Shopee search results page in headless
# Chrome, scroll it in steps, then parse the rendered HTML with BeautifulSoup
# and count the product-name elements that were found.
import imp
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options # to customize chrome display
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from time import sleep
from collections import Counter
import threading
import time
import pandas as pd
import numpy as np
from numpy import nan
import re
import concurrent.futures
import csv
# Link product search result
from turtle import delay
url = 'https://shopee.co.id/search?keyword=obat%20kanker'
path = '/Applications/chromedriver'
# create object for chrome options
chrome_options = Options()
# Customize chrome display
chrome_options.add_argument('start-maximized')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--headless')
chrome_options.add_argument('disable-notifications')
# To disable the message, "Chrome is being controlled by automated test software"
chrome_options.add_argument('--disable-infobars')
# create webdriver object
driver = webdriver.Chrome(executable_path=path, options=chrome_options)
driver.get(url)
# get url
# The same search is loaded a second time, now with an explicit page=0 parameter.
main_link = 'https://shopee.co.id/search?keyword=obat%20kanker&page=0'
driver.get(main_link)
# Wait up to 5 seconds for elements with the result-item class to be present.
WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "shopee-search-item-result__item")))
# Scroll down by one tenth of the page height, ten times in total. Every step
# after the first is scheduled with setTimeout 500 ms after the previous one,
# so the scrolling continues in the browser after this call has been issued.
driver.execute_script(""" var scroll = document.body.scrollHeight / 10; var i = 0; function scrollit(i) { window.scrollBy({top: scroll, left: 0, behavior: 'smooth'}); i++; if (i < 10) { setTimeout(scrollit, 500, i); } } scrollit(i); """)
# Fixed pause that gives the scheduled scroll steps time to run.
sleep(5)
# Take a snapshot of the DOM as it is right now and parse it offline.
html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
soup = BeautifulSoup(html, "html.parser")
# Scrape product name
# NOTE(review): the class names below are matched literally; presumably they
# are generated by the site and may change - confirm before relying on them.
product_name = soup.find_all('div', class_="ie3A+n bM+7UW Cve6sh")
product_name[0].get_text()
product_price = soup.find_all('span', {'class': 'ZEgDH9'})
product_price[0].get_text()
product_sold = soup.find_all('div', {'class':"r6HknA uEPGHT"})
product_sold[0].get_text()
# Number of product-name elements found in the snapshot.
# NOTE(review): the question reports 42 here instead of 60; presumably some
# items had not been rendered when the HTML was captured - TODO confirm.
len(product_name)
Advertisement
Answer
This is one way you can get those product details (the Selenium setup here is Chrome on Linux; you can adapt the code to your own setup, just look at the imports and at the code after the browser is defined):
# Print the name and offer of every product that a Shopee search results page
# embeds as JSON inside its <script data-rh="true"> tags.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time as t
import json

chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument("window-size=1280,720")

webdriver_service = Service("chromedriver/chromedriver")  ## path to where you saved chromedriver binary
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)

url = 'https://shopee.co.id/search?keyword=obat%20kanker&page=0'

try:
    browser.get(url)
    # Wait up to 20 seconds for the embedded script tags to be present.
    items = WebDriverWait(browser, 20).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'script[data-rh="true"]'))
    )
    print(len(items))
    for i in items:
        try:
            json_obj = json.loads(i.get_attribute('innerHTML'))
        except (TypeError, json.JSONDecodeError):
            # Empty or non-JSON script content: skip it instead of aborting the run.
            continue
        # Not every matched tag has to be a JSON object carrying '@type', so
        # check the shape before indexing into it.
        if isinstance(json_obj, dict) and json_obj.get('@type') == 'Product':
            print(json_obj.get('name'), json_obj.get('offers'))
            print('_____________')
finally:
    # Always shut the browser down so no Chrome/chromedriver process is left behind.
    browser.quit()
This will print out in terminal:
61 OBAT KANKER TUMOR MIOM KISTA KELENJAR POLIP LIPOM BENJOLAN SEMBUH TOTAL TANPA OPERASI {'@type': 'Offer', 'price': '184000.00', 'priceCurrency': 'IDR', 'availability': 'http://schema.org/InStock'} _____________ GRAVIDA BHARATA OBAT KANKER PAYUDARA AMPUH |KANKER GANAS HERBAL TERDAFTAR DBPOM MUI WARYANTO076 {'@type': 'Offer', 'price': '275000.00', 'priceCurrency': 'IDR', 'availability': 'http://schema.org/InStock'} _____________ Walatra Zedoril 7 Asli Obat Herbal Kanker Tumor Dan Segala Jenis Benjolan Aman Tanpa Efek Samping {'@type': 'Offer', 'price': '255000.00', 'priceCurrency': 'IDR', 'availability': 'http://schema.org/InStock'} _____________ PROMO PAKET SEMBUH OBAT TUMOR KANKER KISTA MIOM & KELENJAR TERLARIS, TERPERCAYA TERBUKTI &GARANSI {'@type': 'Offer', 'price': '349600.00', 'priceCurrency': 'IDR', 'availability': 'http://schema.org/InStock'} _____________ Obat Herbal Kanker Payudara, Serviks, Hati, Usus, Prostat, Leukimia dan Paru Paru ORIGINAL 100% ASLI {'@type': 'Offer', 'price': '525000.00', 'priceCurrency': 'IDR', 'availability': 'http://schema.org/InStock'} [...]
You can dissect those JSON objects further to extract the data you need.