I am trying to scrape shopee.co.id using beautifulsoup and selenium. There are 60 product in a single search results page. At the end of the code, I checked the extracted data using len() and it shows that I only extracted 42 of them. How should I fix the code to obtain all the search results?
Here is the code that I’ve been trying:
JavaScript
x
78
78
1
import imp
2
from bs4 import BeautifulSoup
3
from selenium import webdriver
4
from selenium.webdriver.chrome.options import Options # to customize chrome display
5
from selenium.webdriver.support.ui import WebDriverWait
6
from selenium.common.exceptions import TimeoutException
7
from selenium.webdriver.common.by import By
8
from selenium.webdriver.support import expected_conditions as EC
9
from time import sleep
10
from collections import Counter
11
12
import threading
13
import time
14
import pandas as pd
15
import numpy as np
16
from numpy import nan
17
import re
18
import concurrent.futures
19
20
21
import csv
22
23
# Link product search result
24
from turtle import delay
25
url = 'https://shopee.co.id/search?keyword=obat%20kanker'
26
path = '/Applications/chromedriver'
27
28
# create object for chrome options
29
chrome_options = Options()
30
31
# Customize chrome display
32
chrome_options.add_argument('start-maximized')
33
chrome_options.add_argument('--no-sandbox')
34
chrome_options.add_argument('--headless')
35
chrome_options.add_argument('disable-notifications')
36
37
# To disable the message, "Chrome is being controlled by automated test software"
38
chrome_options.add_argument('--disable-infobars')
39
40
# create webdriver object
41
driver = webdriver.Chrome(executable_path=path, options=chrome_options)
42
driver.get(url)
43
44
# get url
45
main_link = 'https://shopee.co.id/search?keyword=obat%20kanker&page=0'
46
driver.get(main_link)
47
WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "shopee-search-item-result__item")))
48
49
driver.execute_script("""
50
var scroll = document.body.scrollHeight / 10;
51
var i = 0;
52
function scrollit(i) {
53
window.scrollBy({top: scroll, left: 0, behavior: 'smooth'});
54
i++;
55
if (i < 10) {
56
setTimeout(scrollit, 500, i);
57
}
58
}
59
scrollit(i);
60
""")
61
sleep(5)
62
63
html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
64
soup = BeautifulSoup(html, "html.parser")
65
66
# Scrape product name
67
68
product_name = soup.find_all('div', class_="ie3A+n bM+7UW Cve6sh")
69
product_name[0].get_text()
70
71
product_price = soup.find_all('span', {'class': 'ZEgDH9'})
72
product_price[0].get_text()
73
74
product_sold = soup.find_all('div', {'class':"r6HknA uEPGHT"})
75
product_sold[0].get_text()
76
77
len(product_name)
78
Advertisement
Answer
This is one way you can get those product details (selenium setup is chrome/linux, you can adapt the code to your own setup, just see the imports and the code after defining the browser):
JavaScript
1
27
27
1
from selenium import webdriver
2
from selenium.webdriver.chrome.service import Service
3
from selenium.webdriver.chrome.options import Options
4
from selenium.webdriver.common.by import By
5
from selenium.webdriver.support.ui import WebDriverWait
6
from selenium.webdriver.support import expected_conditions as EC
7
import time as t
8
import json
9
10
chrome_options = Options()
11
chrome_options.add_argument("--no-sandbox")
12
chrome_options.add_argument('disable-notifications')
13
chrome_options.add_argument("window-size=1280,720")
14
15
webdriver_service = Service("chromedriver/chromedriver") ## path to where you saved chromedriver binary
16
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
17
18
url = 'https://shopee.co.id/search?keyword=obat%20kanker&page=0'
19
browser.get(url)
20
items = WebDriverWait(browser, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'script[data-rh="true"]')))
21
print(len(items))
22
for i in items:
23
json_obj = json.loads(i.get_attribute('innerHTML'))
24
if json_obj['@type'] == 'Product':
25
print(json_obj['name'], json_obj['offers'])
26
print('_____________')
27
This will print out in terminal:
JavaScript
1
12
12
1
61
2
OBAT KANKER TUMOR MIOM KISTA KELENJAR POLIP LIPOM BENJOLAN SEMBUH TOTAL TANPA OPERASI {'@type': 'Offer', 'price': '184000.00', 'priceCurrency': 'IDR', 'availability': 'http://schema.org/InStock'}
3
_____________
4
GRAVIDA BHARATA OBAT KANKER PAYUDARA AMPUH |KANKER GANAS HERBAL TERDAFTAR DBPOM MUI WARYANTO076 {'@type': 'Offer', 'price': '275000.00', 'priceCurrency': 'IDR', 'availability': 'http://schema.org/InStock'}
5
_____________
6
Walatra Zedoril 7 Asli Obat Herbal Kanker Tumor Dan Segala Jenis Benjolan Aman Tanpa Efek Samping {'@type': 'Offer', 'price': '255000.00', 'priceCurrency': 'IDR', 'availability': 'http://schema.org/InStock'}
7
_____________
8
PROMO PAKET SEMBUH OBAT TUMOR KANKER KISTA MIOM & KELENJAR TERLARIS, TERPERCAYA TERBUKTI &GARANSI {'@type': 'Offer', 'price': '349600.00', 'priceCurrency': 'IDR', 'availability': 'http://schema.org/InStock'}
9
_____________
10
Obat Herbal Kanker Payudara, Serviks, Hati, Usus, Prostat, Leukimia dan Paru Paru ORIGINAL 100% ASLI {'@type': 'Offer', 'price': '525000.00', 'priceCurrency': 'IDR', 'availability': 'http://schema.org/InStock'}
11
[ ]
12
You can dissect those json objects further, to extract the data you need.