Skip to content
Advertisement

Incomplete scraped data from shopee.co.id using BeautifulSoup and Selenium

I am trying to scrape shopee.co.id using BeautifulSoup and Selenium. There are 60 products on a single search-results page. At the end of the code, I checked the extracted data using len() and found that I had only extracted 42 of them. How should I fix the code to obtain all of the search results?

Here is the code that I’ve been trying:

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service       # Selenium 4 driver setup
from selenium.webdriver.chrome.options import Options       # to customize chrome display
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from time import sleep
from collections import Counter

import threading
import time
import pandas as pd
import numpy as np
from numpy import nan
import re
import concurrent.futures


import csv

# Shopee lazy-loads its product grid: only cards that have been scrolled
# into view exist in the DOM, which is why a single fast scroll pass
# captured just 42 of the 60 results.  The scrolling below is therefore
# done in small steps from Python, pausing after each step so the newly
# revealed cards have time to render before the HTML is snapshotted.

# Link product search result
url = 'https://shopee.co.id/search?keyword=obat%20kanker&page=0'
path = '/Applications/chromedriver'

# create object for chrome options
chrome_options = Options()

# Customize chrome display
chrome_options.add_argument('start-maximized')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--headless')
chrome_options.add_argument('disable-notifications')

# To disable the message, "Chrome is being controlled by automated test software"
chrome_options.add_argument('--disable-infobars')

# create webdriver object.
# Selenium 4 removed the `executable_path` keyword; the driver binary
# location is passed through a Service object instead.
driver = webdriver.Chrome(service=Service(path), options=chrome_options)

# Load the search page once (the original code loaded it twice) and wait
# until at least one result card is present before scrolling.
driver.get(url)
WebDriverWait(driver, 5).until(
    EC.presence_of_all_elements_located(
        (By.CLASS_NAME, "shopee-search-item-result__item")))

# Scroll the page one tenth of its height at a time.  Driving the loop
# from Python (instead of a fire-and-forget JS setTimeout chain) makes
# the pauses deterministic, so every batch of lazily loaded items gets a
# chance to be fetched and inserted into the DOM.
page_height = driver.execute_script("return document.body.scrollHeight")
scroll_step = page_height // 10
for _ in range(10):
    driver.execute_script("window.scrollBy(0, arguments[0]);", scroll_step)
    sleep(1)        # let the newly visible product cards render
sleep(2)            # final settle time for the last batch

html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
soup = BeautifulSoup(html, "html.parser")

# Scrape product name, price, and units sold.
# NOTE(review): these class names are auto-generated by Shopee's build
# and are likely to change between deployments — verify before relying
# on them long-term.
product_name = soup.find_all('div', class_="ie3A+n bM+7UW Cve6sh")
product_price = soup.find_all('span', {'class': 'ZEgDH9'})
product_sold = soup.find_all('div', {'class': "r6HknA uEPGHT"})

# Guard the [0] peeks so an empty result list does not raise IndexError.
if product_name:
    print(product_name[0].get_text())
if product_price:
    print(product_price[0].get_text())
if product_sold:
    print(product_sold[0].get_text())

# A bare `len(...)` expression is a no-op in a script (it only echoes in
# a REPL/notebook); print it so the count is actually visible.
print(len(product_name))

Advertisement

Answer

This is one way you can get those product details. (The Selenium setup here is Chrome on Linux; adapt the code to your own setup — only the imports and the code after the browser is defined matter.)

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time as t
import json

chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument("window-size=1280,720")

webdriver_service = Service("chromedriver/chromedriver") ## path to where you saved chromedriver binary
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)

url = 'https://shopee.co.id/search?keyword=obat%20kanker&page=0' 
browser.get(url)

# Shopee embeds each product of the page as JSON-LD structured data in
# <script data-rh="true"> tags, so the full result set can be read
# without scrolling the lazily rendered product grid at all.
items = WebDriverWait(browser, 20).until(
    EC.presence_of_all_elements_located(
        (By.CSS_SELECTOR, 'script[data-rh="true"]')))
print(len(items))
for i in items:
    # A malformed script tag must not abort the whole loop.
    try:
        json_obj = json.loads(i.get_attribute('innerHTML'))
    except json.JSONDecodeError:
        continue
    # Not every data-rh script is a Product (the page also carries other
    # structured data); .get() avoids a KeyError when '@type' is absent.
    if json_obj.get('@type') == 'Product':
        print(json_obj['name'], json_obj['offers'])
        print('_____________')

This will print out in terminal:

61
OBAT KANKER TUMOR MIOM KISTA KELENJAR  POLIP LIPOM BENJOLAN SEMBUH TOTAL TANPA OPERASI {'@type': 'Offer', 'price': '184000.00', 'priceCurrency': 'IDR', 'availability': 'http://schema.org/InStock'}
_____________
GRAVIDA BHARATA OBAT KANKER PAYUDARA AMPUH |KANKER GANAS HERBAL TERDAFTAR DBPOM MUI WARYANTO076 {'@type': 'Offer', 'price': '275000.00', 'priceCurrency': 'IDR', 'availability': 'http://schema.org/InStock'}
_____________
Walatra Zedoril 7 Asli Obat Herbal Kanker Tumor Dan Segala Jenis Benjolan Aman Tanpa Efek Samping {'@type': 'Offer', 'price': '255000.00', 'priceCurrency': 'IDR', 'availability': 'http://schema.org/InStock'}
_____________
PROMO PAKET SEMBUH OBAT TUMOR KANKER KISTA MIOM & KELENJAR TERLARIS, TERPERCAYA TERBUKTI &GARANSI {'@type': 'Offer', 'price': '349600.00', 'priceCurrency': 'IDR', 'availability': 'http://schema.org/InStock'}
_____________
Obat Herbal Kanker Payudara, Serviks, Hati, Usus, Prostat, Leukimia dan Paru Paru ORIGINAL 100% ASLI {'@type': 'Offer', 'price': '525000.00', 'priceCurrency': 'IDR', 'availability': 'http://schema.org/InStock'}
[...]

You can dissect those json objects further, to extract the data you need.

Advertisement