My script is kind of working, but the files it saves are empty. Any ideas? Forgive me for all the unused imports at the top! I tried a lot of different things to do this. Here I'm pulling the img elements using Selenium. The srcs are then iterated over in a loop and converted into bytes so that they can be written to files whose paths are built with os.path. I suspect the website may be protecting itself against such scraping?
import os
import time
import urllib
from urllib.request import Request, urlopen

import requests
import urllib3
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# A first browser is started with default options; the name `driver` is
# rebound to a second, headless browser a few lines below.
driver = webdriver.Firefox()

options = Options()
options.headless = True
driver = webdriver.Firefox(options=options)

driver.get("https://superrare.com/features/the-intersection-of-machine-and-artist")
time.sleep(2)

# the element with longest height on page
root_div = driver.find_element(By.XPATH, '//div[@id="root"]')
total_height = root_div.size["height"] + 8000
time.sleep(2)

# Resize the window to the measured page height plus an 8000 px margin.
driver.set_window_size(1920, total_height)
time.sleep(2)

# Wait up to 50 seconds for <img> elements to be present, then collect them.
all_images_present = EC.presence_of_all_elements_located((By.XPATH, "//img"))
img_elements = WebDriverWait(driver, 50).until(all_images_present)
time.sleep(5)

download_folder = "/Users/rcastong/Desktop/imgs"
folder_missing = not os.path.exists(download_folder)
if folder_missing:
    os.makedirs(download_folder)

for img in img_elements:
    src = img.get_attribute("src")
    # Encode the src string itself to bytes; these bytes are what gets written.
    src_bytes = str.encode(src)
    # The output file is named after the last path component of the src.
    target_path = os.path.join(download_folder, os.path.basename(src))
    with open(target_path, "wb") as out_file:
        out_file.write(src_bytes)
Advertisement
Answer
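You forgot to use `requests`
to get the image data from the server. `str.encode(imgsrc)` only converts the URL text into bytes, so each saved file contains the URL string rather than the image: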
# Download the image bytes from the URL in `img_src` and save them to `fullname`.
# A timeout keeps a stalled server from hanging the script (requests has no
# default timeout).
response = requests.get(img_src, timeout=30)
# Raise on 4xx/5xx so an HTTP error page is never saved as if it were an image.
response.raise_for_status()
data = response.content

with open(fullname, "wb") as f:
    f.write(data)
Minimal working example.
It works for me for the first few images. The other images may need a longer `sleep()`,
or the page may need to be scrolled to the bottom so that all the `src` attributes are loaded
by JavaScript.
import os
import time
from urllib.parse import urlparse

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

options = Options()
# The `options.headless = True` setter was deprecated and later removed in
# Selenium 4; passing the browser's own flag works on old and new versions.
options.add_argument("-headless")

driver = webdriver.Firefox(options=options)
try:
    driver.get("https://superrare.com/features/the-intersection-of-machine-and-artist")
    time.sleep(2)

    # the element with longest height on page
    root = driver.find_element("xpath", '//div[@id="root"]')
    total_height = root.size["height"] + 8000
    print('total_height:', total_height)
    time.sleep(2)

    # Resize the window to the page height plus a margin -- presumably so that
    # lazily loaded images get a chance to load; confirm against the site.
    driver.set_window_size(1920, total_height)
    time.sleep(2)

    imgs = WebDriverWait(driver, 50).until(
        EC.presence_of_all_elements_located((By.XPATH, "//img"))
    )
    time.sleep(5)
    print('len(imgs):', len(imgs))

    # Read every src while the browser is still open; the elements cannot be
    # queried once the driver has quit.
    img_srcs = [item.get_attribute("src") for item in imgs]
finally:
    # Always shut the browser down, even if a step above raised.
    driver.quit()

download_folder = "/Users/rcastong/Desktop/imgs"
# it will create only if not exists
os.makedirs(download_folder, exist_ok=True)

for number, img_src in enumerate(img_srcs, 1):
    print('---', number, '---')
    print('from:', img_src)

    # get_attribute() returns None when the attribute is missing, and inline
    # `data:` URIs cannot be fetched with requests -- skip both.
    if not img_src or not img_src.startswith(("http://", "https://")):
        print('skipped: no downloadable src')
        continue

    # Build the file name from the URL path only, so a query string
    # (e.g. "?w=400") does not end up in the name on disk.
    filename = os.path.basename(urlparse(img_src).path) or f"image_{number}"
    fullname = os.path.join(download_folder, filename)
    print(' to:', fullname)

    try:
        response = requests.get(img_src, timeout=30)
        # Raise on 4xx/5xx so an error page is never saved as an image.
        response.raise_for_status()
    except requests.RequestException as err:
        # One failed download should not abort the remaining images.
        print('failed:', err)
        continue

    with open(fullname, "wb") as f:
        f.write(response.content)