Skip to content
Advertisement

Downloading images using src in python produces empty images

My script is kind of working, but the files it saves are empty. Any ideas? Forgive me for all the unused imports at the top! I tried a lot of different things to do this. Here I’m pulling the img elements using Selenium. The src values are then iterated over in a loop and converted into bytes so that they can be written to files whose paths are built with os.path. I suspect the website is protecting itself against such scraping, maybe?

from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import os
import urllib
import urllib3
import time
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
import requests


# Question script: open the page in Firefox, collect every <img> element and
# try to save one file per image into a local folder.

# NOTE(review): this first driver is created with default options and is
# rebound three lines below without quit() being called on it.
driver = webdriver.Firefox()
options = Options()
options.headless = True
driver = webdriver.Firefox(options=options)
driver.get("https://superrare.com/features/the-intersection-of-machine-and-artist")
time.sleep(2)                                                                                                            

#the element with longest height on page
ele=driver.find_element("xpath", '//div[@id="root"]')
# Window height = height of the root element plus an extra 8000 pixels.
total_height = ele.size["height"]+8000
time.sleep(2)  
driver.set_window_size(1920, total_height) 
time.sleep(2)



# Wait up to 50 seconds until <img> elements are present, then keep them all.
imgsrc2 = WebDriverWait(driver,50).until(EC.presence_of_all_elements_located((By.XPATH, "//img")))

time.sleep(5)
download_folder = "/Users/rcastong/Desktop/imgs"
# Create the target folder on first run.
if not os.path.exists(download_folder):
    os.makedirs(download_folder)

for i in imgsrc2:
    # imgsrc is the value of the src attribute, i.e. the image's URL.
    imgsrc = i.get_attribute("src")
    # NOTE(review): this encodes the URL *string* itself to bytes; nothing is
    # downloaded, so str_img holds the URL text, not image data.
    str_img = str.encode(imgsrc)
    # The file is named after the last path component of the URL.
    with open(os.path.join(download_folder, os.path.basename(imgsrc)), "wb") as f:
        # Writes the encoded URL text, which is why the saved files are not
        # valid images.
        f.write(str_img)
     

Advertisement

Answer

You forgot to use requests to fetch the image data from the server.

    # The src attribute is only a URL; the image bytes have to be fetched
    # from the server. The timeout keeps a stalled connection from hanging
    # the script forever.
    response = requests.get(img_src, timeout=30)
    # Fail loudly instead of saving an HTTP error page as an image file.
    response.raise_for_status()
    data = response.content

    # Binary mode: the payload is raw image bytes, not text.
    with open(fullname, "wb") as f:
        f.write(data)

Minimal working example.

It works for me for the first few images. The other images may need a longer sleep(), or the page may need to be scrolled to the bottom so that JavaScript loads all the src values.

import os
import time
import requests
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Minimal working example: collect the src URL of every <img> on the page
# with Selenium, then download each image with requests and save it to disk.

options = Options()
# Headless mode is requested through a browser argument because newer
# Selenium 4 releases no longer support the `options.headless` setter.
options.add_argument("-headless")
driver = webdriver.Firefox(options=options)

try:
    driver.get("https://superrare.com/features/the-intersection-of-machine-and-artist")
    time.sleep(2)

    # the element with longest height on page
    root = driver.find_element("xpath", '//div[@id="root"]')
    total_height = root.size["height"] + 8000
    print('total_height:', total_height)
    time.sleep(2)

    # Make the window taller than the content, presumably so that images
    # which are loaded lazily come into view.
    driver.set_window_size(1920, total_height)
    time.sleep(2)

    imgs = WebDriverWait(driver, 50).until(EC.presence_of_all_elements_located((By.XPATH, "//img")))
    time.sleep(5)

    print('len(imgs):', len(imgs))

    # Read every src while the browser is still open; the elements cannot be
    # queried any more once the driver has quit.
    img_srcs = [item.get_attribute("src") for item in imgs]
finally:
    # Always shut the browser down, even if something above failed, so no
    # headless Firefox process is left running.
    driver.quit()

download_folder = "/Users/rcastong/Desktop/imgs"

# it will create only if not exists
os.makedirs(download_folder, exist_ok=True)

for number, img_src in enumerate(img_srcs, 1):
    print('---', number, '---')
    print('from:', img_src)

    # <img> without src gives None, and inline "data:" URIs cannot be fetched
    # with requests - skip both instead of crashing.
    if not img_src or not img_src.startswith(("http://", "https://")):
        print('  skipped: no downloadable src')
        continue

    # A URL ending in "/" has an empty basename, which would make open()
    # target the folder itself; fall back to a numbered name.
    filename = os.path.basename(img_src) or f"image_{number}"
    fullname = os.path.join(download_folder, filename)
    print('  to:', fullname)

    try:
        # The timeout keeps one stalled download from hanging the whole run.
        response = requests.get(img_src, timeout=30)
        # Do not save an HTTP error page as if it were an image.
        response.raise_for_status()
    except requests.RequestException as ex:
        print('  failed:', ex)
        continue

    with open(fullname, "wb") as f:
        f.write(response.content)
User contributions licensed under: CC BY-SA
1 People found this is helpful
Advertisement