“Questions seeking debugging help (“Why isn’t this code working?”) must include the desired behavior, a specific problem or error and the shortest code necessary to reproduce it in the question itself.”
The desired behavior is to create an output file of scraped pages, as this code already does when run in non-headless mode. Here is the shortest code necessary to reproduce the problem in the question itself.
# script_concurrent.py
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from concurrent.futures import ThreadPoolExecutor, wait
from time import sleep, time
from selenium import webdriver
from termcolor import colored
from random import randint
import threading
import datetime
import os

from scrapers.scraper import connect_to_base, parse_html, write_to_file


def counted(f):
    def wrapped(*args, **kwargs):
        wrapped.calls += 1
        return f(*args, **kwargs)

    wrapped.calls = 0
    return wrapped


def sleepy(f):
    def wrapped(*args, **kwargs):
        with lock:
            wrapped.calls += 1
            print(f"{f.__name__} called {wrapped.calls} times")
            if wrapped.calls % 20 == 0:
                print(colored("Sleeping...", "blue"))
                sleep(randint(60, 65))
        return f(*args, **kwargs)

    lock = threading.Lock()
    wrapped.calls = 0
    return wrapped


@counted
@sleepy
def run_process(filename="Hitachi.csv"):
    # init browser
    os.environ["WDM_LOG_LEVEL"] = "0"
    browser = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    if connect_to_base(browser):
        sleep(2)
        html = browser.page_source
        output_list = parse_html(html)
        write_to_file(output_list, filename)
    else:
        print("Error connecting to AVS")
    # exit
    browser.quit()


if __name__ == "__main__":
    start_time = time()
    output_timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    output_filename = f"output_{output_timestamp}.csv"
    futures = []

    with ThreadPoolExecutor() as executor:
        futures.extend(executor.submit(run_process) for _ in range(2, 12))
    wait(futures)
    end_time = time()
    elapsed_time = end_time - start_time
    print(f"Elapsed run time: {elapsed_time / 60:.2f} minutes.")
    print(f"Calls to run_process: {run_process.calls}")
# scraper.py
import requests
import csv
from pathlib import Path
import itertools
import pandas as pd
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

BASE_DIR = Path(__file__).resolve(strict=True).parent.parent


def csv_to_iter(filename, idx=0):
    pd.set_option("display.max_rows", None)
    df = pd.read_csv(filename)
    df = df.iloc[:, [idx]]
    df = df.values.tolist()
    df = list(itertools.chain(*df))
    df = sorted(list(set(df)))
    return iter(df)


my_iter = csv_to_iter(
    filename="/Users/myusername/Downloads/Code/AVS-concurrent-web-scraping/Sorted_MAH_Hitachi_urls.csv"
)


def connect_to_base(browser):
    my_next_iter = next(my_iter)
    connection_attempts = 0
    while connection_attempts < 3:
        try:
            browser.get(my_next_iter)
            # wait for an element matching the '.container' selector to load
            # before returning True
            WebDriverWait(browser, 5).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, ".container"))
            )
            return True
        except Exception as e:
            print(e)
            connection_attempts += 1
            print(f"Error connecting to {my_next_iter}.")
            print(f"Attempt #{connection_attempts}.")
    return False


def parse_html(html):
    # create soup object
    soup = BeautifulSoup(html, "html.parser")
    # parse soup object for part numbers, quantities, names, comments, machines, etc.
    # part_position = [
    #     item.text.strip() for item in soup.findAll("td", {"data-title": "Pos."})
    # ]
    part_number_1 = [
        item.text.strip() for item in soup.findAll("td", {"data-title": "Part â„–"})
    ]
    part_number_2 = [
        item.text.strip() for item in soup.findAll("td", {"data-title": "Part №"})
    ]
    if not part_number_1:
        pass
    else:
        part_number = part_number_1
    if not part_number_2:
        pass
    else:
        part_number = part_number_2
    part_qty = [item.text.strip() for item in soup.findAll("td", {"data-title": "Qty"})]
    part_name = [
        item.text.strip() for item in soup.findAll("td", {"data-title": "Part name"})
    ]
    part_comments = [
        item.text.strip() for item in soup.findAll("td", {"data-title": "Comments"})
    ]
    machine = [
        item.text.split()[0] for item in soup.findAll("article", {"id": "node-content"})
    ]
    alternative_machines = [
        item.text.split()[2] for item in soup.findAll("article", {"id": "node-content"})
    ]
    title = [item.text for item in soup.findAll("span", {"class": "trans"})]
    parts_group = [item.h3 for item in soup.findAll("div", {"class": "card-header"})]
    article_info = {
        # "Pos.": part_position,
        "Part No": part_number,
        "Qty": part_qty,
        "Parts name": part_name,
        "Comments": part_comments,
        "Machine": machine,
        "Alternative_machines": alternative_machines,
        "Title": title,
        "Parts_group": parts_group,
    }
    return [article_info]


def get_load_time(article_url):
    try:
        # set headers
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"
        }
        # make get request to article_url
        response = requests.get(
            article_url, headers=headers, stream=True, timeout=3.000
        )
        # get page load time
        load_time = response.elapsed.total_seconds()
    except Exception as e:
        print(e)
        load_time = "Loading Error"
    return load_time


def write_to_file(output_list, filename="Hitachi.csv"):
    for row in output_list:
        with open(Path(BASE_DIR).joinpath(filename), "a") as csvfile:
            fieldnames = [
                "Pos.",
                "Part No",
                "Qty",
                "Parts name",
                "Comments",
                "Machine",
                "Alternative_machines",
                "Title",
                "Parts_group",
            ]
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writerow(row)
Output
run_process called 1 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:35:26,409 INFO ====== WebDriver manager ======
run_process called 2 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:35:26,410 INFO ====== WebDriver manager ======
run_process called 3 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:35:26,410 INFO ====== WebDriver manager ======
run_process called 4 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:35:26,415 INFO ====== WebDriver manager ======
run_process called 5 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:35:26,417 INFO ====== WebDriver manager ======
run_process called 6 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:35:26,418 INFO ====== WebDriver manager ======
run_process called 7 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:35:26,420 INFO ====== WebDriver manager ======
run_process called 8 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:35:26,426 INFO ====== WebDriver manager ======
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:35:26,616 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:35:26,617 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:35:26,695 INFO Current google-chrome version is 103.0.5060
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:35:26,697 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:35:26,700 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:35:26,699 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:35:26,701 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:35:26,699 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:35:26,710 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:35:26,710 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:35:26,713 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:35:26,713 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:35:26,717 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:35:26,717 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
(.venv) martinhewing@Martins-MacBook-Pro AVS-concurrent-web-scraping % python3 script_concurrent.py
run_process called 1 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:36:45,472 INFO ====== WebDriver manager ======
run_process called 2 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:36:45,476 INFO ====== WebDriver manager ======
run_process called 3 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:36:45,479 INFO ====== WebDriver manager ======
run_process called 4 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:36:45,480 INFO ====== WebDriver manager ======
run_process called 5 times
Sleeping...
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:36:45,616 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:36:45,617 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:36:45,650 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:36:45,650 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:36:45,660 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:36:45,660 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
(.venv) martinhewing@Martins-MacBook-Pro AVS-concurrent-web-scraping % python3 script_concurrent.py
run_process called 1 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:37:46,546 INFO ====== WebDriver manager ======
run_process called 2 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:37:46,550 INFO ====== WebDriver manager ======
run_process called 3 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:37:46,555 INFO ====== WebDriver manager ======
run_process called 4 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:37:46,695 INFO ====== WebDriver manager ======
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:37:46,708 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:37:46,708 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:37:46,724 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:37:46,725 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:37:46,733 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:37:46,734 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:37:46,752 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:37:46,753 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
run_process called 5 times
Sleeping...
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:37:46,843 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:37:46,843 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:37:46,844 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:37:46,942 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
https://spare.avspart.com/catalog/hitachi/101:uh02/0d79c019-4621-4a47-8127-bd7baa5f0c0b/
https://spare.avspart.com/catalog/hitachi/101:uh02/1a7f894f-c1b8-456b-8ed3-bf78c60e4a71/
https://spare.avspart.com/catalog/hitachi/101:uh02/06e2437d-a240-49d0-ac8d-fc553bff6c53/
https://spare.avspart.com/catalog/hitachi/101:uh02/1c6fe013-e139-4112-81a5-c01fc4591803/
[WDM] - ====== WebDriver manager ======
2022-07-13 15:38:48,773 INFO ====== WebDriver manager ======
run_process called 6 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:38:48,778 INFO ====== WebDriver manager ======
run_process called 7 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:38:48,783 INFO ====== WebDriver manager ======
run_process called 8 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:38:48,793 INFO ====== WebDriver manager ======
run_process called 9 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:38:48,802 INFO ====== WebDriver manager ======
run_process called 10 times
Sleeping...
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:38:48,947 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:38:48,948 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:38:48,964 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:38:48,964 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:38:48,967 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:38:48,967 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:38:48,971 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:38:48,973 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:38:48,989 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:38:48,994 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:38:49,065 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:38:49,108 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:38:49,129 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:38:49,181 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:38:49,189 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
https://spare.avspart.com/catalog/hitachi/101:uh02/1d07c2a9-d4f8-4b50-a6bc-e64951cd7e8e/
https://spare.avspart.com/catalog/hitachi/101:uh02/3aa2c54f-154e-4aae-8f2a-efb05b471bfa/
https://spare.avspart.com/catalog/hitachi/101:uh02/3c0b42bb-c6c9-4f60-8c2e-d5258a703d76/
https://spare.avspart.com/catalog/hitachi/101:uh02/2780b803-2f37-4777-a5c6-97ea9e54137d/
https://spare.avspart.com/catalog/hitachi/101:uh02/47a76d4e-70b0-4b6d-9308-67b91a4619ad/
[WDM] - ====== WebDriver manager ======
2022-07-13 15:39:49,816 INFO ====== WebDriver manager ======
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:39:50,147 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:39:50,148 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:39:50,368 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
https://spare.avspart.com/catalog/hitachi/101:uh02/540f4b09-795a-41de-9715-8825e296018b/
Elapsed run time: 2.27 minutes.
Calls to run_process: 10
Data
0
https://spare.avspart.com/catalog/hitachi/101:uh02/06e2437d-a240-49d0-ac8d-fc553bff6c53/
https://spare.avspart.com/catalog/hitachi/101:uh02/0d79c019-4621-4a47-8127-bd7baa5f0c0b/
https://spare.avspart.com/catalog/hitachi/101:uh02/1a7f894f-c1b8-456b-8ed3-bf78c60e4a71/
https://spare.avspart.com/catalog/hitachi/101:uh02/1c6fe013-e139-4112-81a5-c01fc4591803/
https://spare.avspart.com/catalog/hitachi/101:uh02/1d07c2a9-d4f8-4b50-a6bc-e64951cd7e8e/
https://spare.avspart.com/catalog/hitachi/101:uh02/2780b803-2f37-4777-a5c6-97ea9e54137d/
https://spare.avspart.com/catalog/hitachi/101:uh02/3aa2c54f-154e-4aae-8f2a-efb05b471bfa/
https://spare.avspart.com/catalog/hitachi/101:uh02/3c0b42bb-c6c9-4f60-8c2e-d5258a703d76/
https://spare.avspart.com/catalog/hitachi/101:uh02/47a76d4e-70b0-4b6d-9308-67b91a4619ad/
https://spare.avspart.com/catalog/hitachi/101:uh02/540f4b09-795a-41de-9715-8825e296018b/
https://spare.avspart.com/catalog/hitachi/101:uh02/57cefeb3-9dd2-4f99-a552-50dc452b6565/
https://spare.avspart.com/catalog/hitachi/101:uh02/58c4d3b6-9a15-4be0-8082-19980c2119fe/
https://spare.avspart.com/catalog/hitachi/101:uh02/5b2f40e4-a61f-4a3d-a15f-a41659595b28/
Here’s my attempt to implement Headless Mode.
def get_driver(headless):
    options = webdriver.Options()
    if headless:
        options.add_argument("--headless")
    # initialize driver
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()), options=options
    )
    return driver
# script_concurrent.py
from concurrent.futures import ThreadPoolExecutor, wait
from time import sleep, time
from termcolor import colored
from random import randint
import threading
import datetime
import sys

from scrapers.scraper import get_driver, connect_to_base, parse_html, write_to_file


def counted(f):
    def wrapped(*args, **kwargs):
        wrapped.calls += 1
        return f(*args, **kwargs)

    wrapped.calls = 0
    return wrapped


def sleepy(f):
    def wrapped(*args, **kwargs):
        with lock:
            wrapped.calls += 1
            print(f"{f.__name__} called {wrapped.calls} times")
            if wrapped.calls % 20 == 0:
                print(colored("Sleeping...", "blue"))
                sleep(randint(60, 65))
        return f(*args, **kwargs)

    lock = threading.Lock()
    wrapped.calls = 0
    return wrapped


@counted
@sleepy
def run_process(filename, headless):
    # init browser
    browser = get_driver(headless)
    if connect_to_base(browser):
        sleep(2)
        html = browser.page_source
        output_list = parse_html(html)
        write_to_file(output_list, filename)
        # exit
        browser.quit()
    else:
        print("Error connecting to AVS")
        browser.quit()


if __name__ == "__main__":
    headless = False
    if len(sys.argv) > 1:
        if sys.argv[1] == "headless":
            print("Running in headless mode")
            headless = True
    start_time = time()
    output_timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    output_filename = f"Hitachi_{output_timestamp}.csv"
    futures = []

    with ThreadPoolExecutor() as executor:
        futures.extend(
            executor.submit(run_process, output_filename, headless)
            for _ in range(2, 202)
        )
    wait(futures)
    end_time = time()
    elapsed_time = end_time - start_time
    print(f"Elapsed run time: {elapsed_time / 60:.2f} minutes.")
    print(f"Calls to run_process: {run_process.calls}")
# script.py
import csv
import requests
import itertools
import pandas as pd
from pathlib import Path
from selenium import webdriver
from termcolor import colored
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

BASE_DIR = Path(__file__).resolve(strict=True).parent.parent


def csv_to_iter(filename, idx=0):
    pd.set_option("display.max_rows", None)
    df = pd.read_csv(filename)
    df = df.iloc[:, [idx]]
    df = df.values.tolist()
    df = list(itertools.chain(*df))
    df = sorted(list(set(df)))
    return iter(df)


my_iter = csv_to_iter(
    filename="/Users/martinhewing/Downloads/Code/AVS-concurrent-web-scraping/Sorted_MAH_Hitachi_urls.csv"
)


def get_driver(headless):
    options = webdriver.Options()
    if headless:
        options.add_argument("--headless")
    # initialize driver
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()), options=options
    )
    return driver


def connect_to_base(browser):
    my_next_iter = next(my_iter)
    connection_attempts = 0
    while connection_attempts < 3:
        try:
            browser.get(my_next_iter)
            print(colored(browser.current_url, "green"))
            # wait for an element matching the '.container' selector to load
            # before returning True
            WebDriverWait(browser, 5).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, ".container"))
            )
            return True
        except Exception as e:
            print(e)
            connection_attempts += 1
            print(f"Error connecting to {my_next_iter}.")
            print(f"Attempt #{connection_attempts}.")
    return False


def parse_html(html):
    # create soup object
    soup = BeautifulSoup(html, "html.parser")
    # parse soup object for part numbers, quantities, names, comments, machines, etc.
    # part_position = [
    #     item.text.strip() for item in soup.findAll("td", {"data-title": "Pos."})
    # ]
    part_number_1 = [
        item.text.strip() for item in soup.findAll("td", {"data-title": "Part â„–"})
    ]
    part_number_2 = [
        item.text.strip() for item in soup.findAll("td", {"data-title": "Part №"})
    ]
    if not part_number_1:
        pass
    else:
        part_number = part_number_1
    if not part_number_2:
        pass
    else:
        part_number = part_number_2
    part_qty = [item.text.strip() for item in soup.findAll("td", {"data-title": "Qty"})]
    part_name = [
        item.text.strip() for item in soup.findAll("td", {"data-title": "Part name"})
    ]
    part_comments = [
        item.text.strip() for item in soup.findAll("td", {"data-title": "Comments"})
    ]
    machine = [
        item.text.split()[0] for item in soup.findAll("article", {"id": "node-content"})
    ]
    alternative_machines = [
        item.text.split()[2] for item in soup.findAll("article", {"id": "node-content"})
    ]
    title = [item.text for item in soup.findAll("span", {"class": "trans"})]
    parts_group = [item.h3 for item in soup.findAll("div", {"class": "card-header"})]
    article_info = {
        # "Pos.": part_position,
        "Part No": part_number,
        "Qty": part_qty,
        "Parts name": part_name,
        "Comments": part_comments,
        "Machine": machine,
        "Alternative_machines": alternative_machines,
        "Title": title,
        "Parts_group": parts_group,
    }
    return [article_info]


def get_load_time(article_url):
    try:
        # set headers
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"
        }
        # make get request to article_url
        response = requests.get(
            article_url, headers=headers, stream=True, timeout=3.000
        )
        # get page load time
        load_time = response.elapsed.total_seconds()
    except Exception as e:
        print(e)
        load_time = "Loading Error"
    return load_time


def write_to_file(output_list, filename):
    for row in output_list:
        with open(Path(BASE_DIR).joinpath(filename), "a") as csvfile:
            fieldnames = [
                "Pos.",
                "Part No",
                "Qty",
                "Parts name",
                "Comments",
                "Machine",
                "Alternative_machines",
                "Title",
                "Parts_group",
            ]
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writerow(row)
Output
Running in headless mode
run_process called 1 times
run_process called 2 times
run_process called 3 times
run_process called 4 times
run_process called 5 times
run_process called 6 times
run_process called 7 times
run_process called 8 times
run_process called 9 times
run_process called 10 times
run_process called 11 times
run_process called 12 times
run_process called 13 times
run_process called 14 times
run_process called 15 times
run_process called 16 times
run_process called 17 times
run_process called 18 times
run_process called 19 times
run_process called 20 times
Sleeping...
Data
0
https://spare.avspart.com/catalog/hitachi/101:uh02/06e2437d-a240-49d0-ac8d-fc553bff6c53/
https://spare.avspart.com/catalog/hitachi/101:uh02/0d79c019-4621-4a47-8127-bd7baa5f0c0b/
https://spare.avspart.com/catalog/hitachi/101:uh02/1a7f894f-c1b8-456b-8ed3-bf78c60e4a71/
https://spare.avspart.com/catalog/hitachi/101:uh02/1c6fe013-e139-4112-81a5-c01fc4591803/
https://spare.avspart.com/catalog/hitachi/101:uh02/1d07c2a9-d4f8-4b50-a6bc-e64951cd7e8e/
https://spare.avspart.com/catalog/hitachi/101:uh02/2780b803-2f37-4777-a5c6-97ea9e54137d/
https://spare.avspart.com/catalog/hitachi/101:uh02/3aa2c54f-154e-4aae-8f2a-efb05b471bfa/
https://spare.avspart.com/catalog/hitachi/101:uh02/3c0b42bb-c6c9-4f60-8c2e-d5258a703d76/
https://spare.avspart.com/catalog/hitachi/101:uh02/47a76d4e-70b0-4b6d-9308-67b91a4619ad/
https://spare.avspart.com/catalog/hitachi/101:uh02/540f4b09-795a-41de-9715-8825e296018b/
https://spare.avspart.com/catalog/hitachi/101:uh02/57cefeb3-9dd2-4f99-a552-50dc452b6565/
https://spare.avspart.com/catalog/hitachi/101:uh02/58c4d3b6-9a15-4be0-8082-19980c2119fe/
https://spare.avspart.com/catalog/hitachi/101:uh02/5b2f40e4-a61f-4a3d-a15f-a41659595b28/
When I run in headless mode there is no error, but there is no output either. I have reviewed similar questions, but I am at a loss to understand what might be causing this. Please help :)
Answer
Website Blocking:
The website could be detecting that you are scraping it. There are a couple of different solutions you could try.
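A quick way to check whether that is what is happening (an extra suggestion, not part of the scripts above) is to save what headless Chrome is actually served for one of the URLs and compare it with the page a normal browser shows. A minimal sketch, assuming the same webdriver-manager setup as in the question; dump_headless_page is a hypothetical helper name:

# Hypothetical diagnostic helper: fetch one URL with headless Chrome and save
# a screenshot plus the raw HTML so it can be compared with a non-headless run.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


def dump_headless_page(url, prefix="headless_check"):
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()), options=options
    )
    try:
        driver.get(url)
        driver.save_screenshot(f"{prefix}.png")  # what the headless browser rendered
        with open(f"{prefix}.html", "w") as f:   # the HTML the site actually returned
            f.write(driver.page_source)
        print(driver.title, len(driver.page_source))
    finally:
        driver.quit()


dump_headless_page(
    "https://spare.avspart.com/catalog/hitachi/101:uh02/06e2437d-a240-49d0-ac8d-fc553bff6c53/"
)

If the saved HTML is a block or captcha page, or much shorter than what the headed run receives, the site is treating the headless browser differently.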
Change your user agent:
chrome_options.add_argument("user-agent=USER AGENT")
Replace the USER AGENT placeholder with the string shown at this link: What is my user agent
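For example, a minimal sketch of how the user agent could be passed to headless Chrome via ChromeOptions (the get_driver name mirrors the helper in the question, and the user-agent string is only a placeholder to replace with your own):

# Sketch: override the default headless user agent, which contains
# "HeadlessChrome" and is an easy signal for bot-detection scripts.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


def get_driver(headless):
    options = webdriver.ChromeOptions()
    if headless:
        options.add_argument("--headless")
        options.add_argument(
            "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
        )
    return webdriver.Chrome(
        service=Service(ChromeDriverManager().install()), options=options
    )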
Selenium Stealth:
stealth(
    driver,
    user_agent="USER AGENT",
    languages=["en-US", "en"],
    vendor="Google Inc.",
    platform="Win32",
    webgl_vendor="Intel Inc.",
    renderer="Intel Iris OpenGL Engine",
    fix_hairline=True,
)
Selenium Stealth is a Python package used alongside Selenium to prevent detection. It adjusts key properties of the Selenium-driven browser in order to bypass bot-detection software.
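Putting the two ideas together, here is a minimal sketch based on the selenium-stealth README (install it with pip install selenium-stealth; the user-agent string is again only a placeholder):

# Sketch: headless Chrome with selenium-stealth applied before scraping.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium_stealth import stealth
from webdriver_manager.chrome import ChromeDriverManager

options = webdriver.ChromeOptions()
options.add_argument("--headless")
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()), options=options
)

# Patch the browser properties that bot-detection scripts commonly inspect.
stealth(
    driver,
    user_agent=(
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
    ),
    languages=["en-US", "en"],
    vendor="Google Inc.",
    platform="Win32",
    webgl_vendor="Intel Inc.",
    renderer="Intel Iris OpenGL Engine",
    fix_hairline=True,
)

driver.get(
    "https://spare.avspart.com/catalog/hitachi/101:uh02/06e2437d-a240-49d0-ac8d-fc553bff6c53/"
)
print(driver.title)
driver.quit()

If this returns the same title and page content as a non-headless run, the stealth-patched driver can be used inside get_driver in place of the plain webdriver.Chrome call.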