I have this URL here, and I’m trying to get the video’s source link, but it’s located within an iframe. The video URL is https://ndisk.cizgifilmlerizle.com... and it sits inside an iframe called vjs_iframe. My code is below:
import requests
from bs4 import BeautifulSoup

url = "https://m.wcostream.com/my-hero-academia-season-4-episode-5-english-dubbed"

# The site responds better when a browser-like User-Agent is sent.
headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:75.0) Gecko/20100101 Firefox/75.0"}

session = requests.Session()
response = session.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

# Look the iframe up by tag name ...
iframes = soup.find_all("iframe")  # Comes back as an empty list

# ... and by CSS class.
vjs_iframe = soup.find_all(class_="vjs_iframe")  # Comes back as an empty list too
I don’t know how to get the URL within the iframe, since not even the iframe’s source is loaded upon the first request. Is getting the https://ndisk.cizgifilmlerizle.com... URL even possible using BeautifulSoup, or would I need to use another library like Selenium or something else? Thanks in advance!
Advertisement
Answer
My approach to scraping their site is as follows. I don’t know if you still need this, but I was searching for problems with the https://ndisk.cizgifilmlerizle.com website and saw this question. I figured it might help someone else. It’s crude, but it gets the job done.
"""Download a range of Naruto Shippuden episodes from wcostream.com.

For each episode the script opens the episode page in headless Chrome, reads
the episode title, switches into the player iframe, extracts the ``<video>``
element's ``src``, opens that URL directly and sends a fixed key sequence to
the player.  It then polls the download directory until the file shows up
under its default name and renames it to ``episode_<n>_<title>.mp4``.
"""
import os
import string
import time
from time import sleep

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

FIRST_EPISODE = 274  # Stuck on 274 - 500. 273 also failed.
LAST_EPISODE = 500

# Number of one-second polls allowed before a download counts as timed out.
MAX_POLLS = 50

# File name the script expects a finished download to appear under.
DEFAULT_DOWNLOAD_NAME = "getvid.mp4"

# Single-quoted so the double quotes inside the XPath need no escaping.
TITLE_XPATH = (
    '//*[@id="content"]/table/tbody/tr/td[1]/table/tbody/tr/td'
    '/table[1]/tbody/tr[2]/td/div[2]/b/b[1]'
)
TITLE_XPATH_FALLBACK = (
    '//*[@id="content"]/table/tbody/tr/td[1]/table/tbody/tr/td'
    '/table[1]/tbody/tr[2]/td/div[2]/b[2]'
)
PLAYER_FRAME_XPATH = '//*[@id="frameNewcizgifilmuploads0"]'


def do_keys(key, num_times, action_chain):
    """Queue ``key`` on ``action_chain`` ``num_times`` times.

    Nothing is sent to the browser here; the caller must call
    ``action_chain.perform()`` afterwards.
    """
    for _ in range(num_times):
        action_chain.send_keys(key)


def cls():
    """Clear the terminal by printing the ANSI "erase display" sequence."""
    print("\033[2J")


def _build_options(video_dir):
    """Return headless, muted Chrome options that download into ``video_dir``."""
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--mute-audio')
    options.add_experimental_option("prefs", {
        "download.default_directory": video_dir,
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "safebrowsing.enabled": True,
    })
    return options


def _episode_title(browser):
    """Return the episode title from the loaded page as a file-name-safe slug.

    The title is looked up at ``TITLE_XPATH`` first and at
    ``TITLE_XPATH_FALLBACK`` if that element is missing.  The text is
    lower-cased, stripped of punctuation, and spaces become underscores.

    Raises:
        NoSuchElementException: if neither XPath matches.
    """
    try:
        title_element = browser.find_element(By.XPATH, TITLE_XPATH)
    except NoSuchElementException:
        title_element = browser.find_element(By.XPATH, TITLE_XPATH_FALLBACK)
    without_punctuation = title_element.text.lower().translate(
        str.maketrans('', '', string.punctuation)
    )
    return without_punctuation.replace(' ', '_')


def _start_download(browser):
    """Send the key sequence that starts the download in the video player.

    Sequence: space, pause, then tab 5, space, up arrow 2, space.
    NOTE(review): presumably this walks the player's controls to its download
    entry; it depends on the player's focus order, so re-check it if the
    player changes.
    """
    actions = ActionChains(browser)
    actions.send_keys(Keys.SPACE)
    actions.perform()
    sleep(1)
    do_keys(Keys.TAB, 5, actions)
    do_keys(Keys.SPACE, 1, actions)
    do_keys(Keys.UP, 2, actions)
    do_keys(Keys.SPACE, 1, actions)
    actions.perform()


def _wait_for_download(browser, default_video_name, new_video_name, start):
    """Poll once per second until the downloaded file appears, then rename it.

    If ``new_video_name`` already exists the fresh download is deleted
    instead of overwriting it.

    Returns:
        True once the file has been renamed (or discarded as a duplicate),
        False if the browser window disappeared before the file arrived.

    Raises:
        TimeoutError: if the file has not appeared after ``MAX_POLLS`` polls.
    """
    polls = 0
    while True:
        if os.path.isfile(default_video_name):
            if os.path.exists(new_video_name):
                os.remove(default_video_name)
                print(f"Already Exists! [{time.time() - start}s]")
            else:
                os.rename(default_video_name, new_video_name)
                print(f"Download complete! [{time.time() - start}s]")
            return True
        try:
            # Touching window_handles raises once the window/session is gone.
            _ = browser.window_handles
        except WebDriverException:
            return False
        if polls > MAX_POLLS:
            raise TimeoutError(f"no file at {default_video_name}")
        polls += 1
        sleep(1)


def download_episode(count, video_dir):
    """Download episode number ``count`` into ``video_dir``.

    Returns:
        True if the episode was downloaded (or already existed), False if the
        browser window vanished while waiting.

    Raises:
        TimeoutError: if the download did not finish in time.
        RuntimeError: if the player iframe contains no ``<video src=...>``.
        NoSuchElementException: if an expected page element is missing.
    """
    url = f"https://www.wcostream.com/naruto-shippuden-episode-{count}"
    default_video_name = os.path.join(video_dir, DEFAULT_DOWNLOAD_NAME)

    browser = webdriver.Chrome(options=_build_options(video_dir))
    try:
        browser.get(url)
        sleep(1)

        title = _episode_title(browser)
        new_video_name = os.path.join(video_dir, f"episode_{count}_{title}.mp4")
        cls()
        print(f"Title: {title}")

        # The player lives in an iframe; its markup is only visible after
        # switching into that frame.
        browser.switch_to.frame(browser.find_element(By.XPATH, PLAYER_FRAME_XPATH))
        soup = BeautifulSoup(browser.page_source, "html.parser")
        video_tag = soup.find("video")
        video_url = video_tag.get("src") if video_tag is not None else None
        if not video_url:
            raise RuntimeError(f"no <video src> found in player frame for {url}")
        print(f"URL:\t{video_url}")

        browser.get(video_url)
        # Result unused: this only confirms the player loaded (raises if not).
        browser.find_element(By.TAG_NAME, "video")
        sleep(1)
        _start_download(browser)

        start = time.time()
        print(f"Downloading: {new_video_name}")
        return _wait_for_download(browser, default_video_name, new_video_name, start)
    finally:
        # quit() (not close()) also shuts down the chromedriver process, so
        # nothing leaks across episodes or when an exception propagates.
        browser.quit()


def main():
    """Download episodes FIRST_EPISODE..LAST_EPISODE, retrying on timeout."""
    video_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "videos")
    os.makedirs(video_dir, exist_ok=True)

    count = FIRST_EPISODE
    attempts = 0  # consecutive timeouts for the current episode
    while count <= LAST_EPISODE:
        try:
            downloaded = download_episode(count, video_dir)
        except TimeoutError:
            attempts += 1
            print(f"Download Timed Out. Trying again. [{attempts}]")
            continue
        if downloaded:
            count += 1
            attempts = 0


if __name__ == '__main__':
    main()