“Questions seeking debugging help (“Why isn’t this code working?”) must include the desired behavior, a specific problem or error and the shortest code necessary to reproduce it in the question itself.”
The desired behavior is to create an output file of the scraped pages, as the working code below does (in non-headless mode); here is the shortest code necessary to reproduce it.
# script_concurrent.py
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from concurrent.futures import ThreadPoolExecutor, wait
from time import sleep, time
from selenium import webdriver
from termcolor import colored
from random import randint
import threading
import datetime
import os

from scrapers.scraper import connect_to_base, parse_html, write_to_file


def counted(f):
    def wrapped(*args, **kwargs):
        wrapped.calls += 1
        return f(*args, **kwargs)

    wrapped.calls = 0
    return wrapped


def sleepy(f):
    def wrapped(*args, **kwargs):
        with lock:
            wrapped.calls += 1
            print(f"{f.__name__} called {wrapped.calls} times")
            if wrapped.calls % 20 == 0:
                print(colored("Sleeping...", "blue"))
                sleep(randint(60, 65))
        return f(*args, **kwargs)

    lock = threading.Lock()
    wrapped.calls = 0
    return wrapped


@counted
@sleepy
def run_process(filename="Hitachi.csv"):
    # init browser
    os.environ["WDM_LOG_LEVEL"] = "0"
    browser = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    if connect_to_base(browser):
        sleep(2)
        html = browser.page_source
        output_list = parse_html(html)
        write_to_file(output_list, filename)
    else:
        print("Error connecting to AVS")
    # exit
    browser.quit()


if __name__ == "__main__":
    start_time = time()
    output_timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    output_filename = f"output_{output_timestamp}.csv"
    futures = []
    with ThreadPoolExecutor() as executor:
        futures.extend(executor.submit(run_process) for _ in range(2, 12))
    wait(futures)
    end_time = time()
    elapsed_time = end_time - start_time
    print(f"Elapsed run time: {elapsed_time / 60:.2f} minutes.")
    print(f"Calls to run_process: {run_process.calls}")
# scraper.py
import requests
import csv
from pathlib import Path
import itertools
import pandas as pd
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

BASE_DIR = Path(__file__).resolve(strict=True).parent.parent


def csv_to_iter(filename, idx=0):
    # read one column of URLs from the CSV, de-duplicate and sort it,
    # then return an iterator over the URLs
    pd.set_option("display.max_rows", None)
    df = pd.read_csv(filename)
    df = df.iloc[:, [idx]]
    df = df.values.tolist()
    df = list(itertools.chain(*df))
    df = sorted(list(set(df)))
    return iter(df)


my_iter = csv_to_iter(
    filename="/Users/myusername/Downloads/Code/AVS-concurrent-web-scraping/Sorted_MAH_Hitachi_urls.csv"
)


def connect_to_base(browser):
    my_next_iter = next(my_iter)
    connection_attempts = 0
    while connection_attempts < 3:
        try:
            browser.get(my_next_iter)
            # wait for the '.container' element to load
            # before returning True
            WebDriverWait(browser, 5).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, ".container"))
            )
            return True
        except Exception as e:
            print(e)
            connection_attempts += 1
            print(f"Error connecting to {my_next_iter}.")
            print(f"Attempt #{connection_attempts}.")
    return False


def parse_html(html):
    # create soup object
    soup = BeautifulSoup(html, "html.parser")
    # parse soup object to get part numbers, quantities, names, comments,
    # machine info, title and parts group
    # part_position = [
    #     item.text.strip() for item in soup.findAll("td", {"data-title": "Pos."})
    # ]
    # the "Part №" column header appears with two different encodings,
    # so look for both and keep whichever list is non-empty
    part_number_1 = [
        item.text.strip() for item in soup.findAll("td", {"data-title": "Part â„–"})
    ]
    part_number_2 = [
        item.text.strip() for item in soup.findAll("td", {"data-title": "Part №"})
    ]
    if not part_number_1:
        pass
    else:
        part_number = part_number_1
    if not part_number_2:
        pass
    else:
        part_number = part_number_2
    part_qty = [item.text.strip() for item in soup.findAll("td", {"data-title": "Qty"})]
    part_name = [
        item.text.strip() for item in soup.findAll("td", {"data-title": "Part name"})
    ]
    part_comments = [
        item.text.strip() for item in soup.findAll("td", {"data-title": "Comments"})
    ]
    machine = [
        item.text.split()[0] for item in soup.findAll("article", {"id": "node-content"})
    ]
    alternative_machines = [
        item.text.split()[2] for item in soup.findAll("article", {"id": "node-content"})
    ]
    title = [item.text for item in soup.findAll("span", {"class": "trans"})]
    parts_group = [item.h3 for item in soup.findAll("div", {"class": "card-header"})]
    article_info = {
        # "Pos.": part_position,
        "Part No": part_number,
        "Qty": part_qty,
        "Parts name": part_name,
        "Comments": part_comments,
        "Machine": machine,
        "Alternative_machines": alternative_machines,
        "Title": title,
        "Parts_group": parts_group,
    }
    return [article_info]


def get_load_time(article_url):
    try:
        # set headers
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"
        }
        # make get request to article_url
        response = requests.get(
            article_url, headers=headers, stream=True, timeout=3.000
        )
        # get page load time
        load_time = response.elapsed.total_seconds()
    except Exception as e:
        print(e)
        load_time = "Loading Error"
    return load_time


def write_to_file(output_list, filename="Hitachi.csv"):
    for row in output_list:
        with open(Path(BASE_DIR).joinpath(filename), "a") as csvfile:
            fieldnames = [
                "Pos.",
                "Part No",
                "Qty",
                "Parts name",
                "Comments",
                "Machine",
                "Alternative_machines",
                "Title",
                "Parts_group",
            ]
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writerow(row)
Output
run_process called 1 times
2022-07-13 15:35:26,409 INFO ====== WebDriver manager ======
run_process called 2 times
2022-07-13 15:35:26,410 INFO ====== WebDriver manager ======
run_process called 3 times
2022-07-13 15:35:26,410 INFO ====== WebDriver manager ======
run_process called 4 times
2022-07-13 15:35:26,415 INFO ====== WebDriver manager ======
run_process called 5 times
2022-07-13 15:35:26,417 INFO ====== WebDriver manager ======
run_process called 6 times
2022-07-13 15:35:26,418 INFO ====== WebDriver manager ======
run_process called 7 times
2022-07-13 15:35:26,420 INFO ====== WebDriver manager ======
run_process called 8 times
2022-07-13 15:35:26,426 INFO ====== WebDriver manager ======
2022-07-13 15:35:26,616 INFO Current google-chrome version is 103.0.5060
2022-07-13 15:35:26,617 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:35:26,695 INFO Current google-chrome version is 103.0.5060
2022-07-13 15:35:26,697 INFO Current google-chrome version is 103.0.5060
2022-07-13 15:35:26,700 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:35:26,699 INFO Current google-chrome version is 103.0.5060
2022-07-13 15:35:26,701 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:35:26,699 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:35:26,710 INFO Current google-chrome version is 103.0.5060
2022-07-13 15:35:26,710 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:35:26,713 INFO Current google-chrome version is 103.0.5060
2022-07-13 15:35:26,713 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:35:26,717 INFO Current google-chrome version is 103.0.5060
2022-07-13 15:35:26,717 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060

(.venv) martinhewing@Martins-MacBook-Pro AVS-concurrent-web-scraping % python3 script_concurrent.py
run_process called 1 times
2022-07-13 15:36:45,472 INFO ====== WebDriver manager ======
run_process called 2 times
2022-07-13 15:36:45,476 INFO ====== WebDriver manager ======
run_process called 3 times
2022-07-13 15:36:45,479 INFO ====== WebDriver manager ======
run_process called 4 times
2022-07-13 15:36:45,480 INFO ====== WebDriver manager ======
run_process called 5 times
Sleeping...
2022-07-13 15:36:45,616 INFO Current google-chrome version is 103.0.5060
2022-07-13 15:36:45,617 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:36:45,650 INFO Current google-chrome version is 103.0.5060
2022-07-13 15:36:45,650 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:36:45,660 INFO Current google-chrome version is 103.0.5060
2022-07-13 15:36:45,660 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060

(.venv) martinhewing@Martins-MacBook-Pro AVS-concurrent-web-scraping % python3 script_concurrent.py
run_process called 1 times
2022-07-13 15:37:46,546 INFO ====== WebDriver manager ======
run_process called 2 times
2022-07-13 15:37:46,550 INFO ====== WebDriver manager ======
run_process called 3 times
2022-07-13 15:37:46,555 INFO ====== WebDriver manager ======
run_process called 4 times
2022-07-13 15:37:46,695 INFO ====== WebDriver manager ======
2022-07-13 15:37:46,708 INFO Current google-chrome version is 103.0.5060
2022-07-13 15:37:46,708 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:37:46,724 INFO Current google-chrome version is 103.0.5060
2022-07-13 15:37:46,725 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:37:46,733 INFO Current google-chrome version is 103.0.5060
2022-07-13 15:37:46,734 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:37:46,752 INFO Current google-chrome version is 103.0.5060
2022-07-13 15:37:46,753 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
run_process called 5 times
Sleeping...
2022-07-13 15:37:46,843 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:37:46,843 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:37:46,844 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:37:46,942 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
https://spare.avspart.com/catalog/hitachi/101:uh02/0d79c019-4621-4a47-8127-bd7baa5f0c0b/
https://spare.avspart.com/catalog/hitachi/101:uh02/1a7f894f-c1b8-456b-8ed3-bf78c60e4a71/
https://spare.avspart.com/catalog/hitachi/101:uh02/06e2437d-a240-49d0-ac8d-fc553bff6c53/
https://spare.avspart.com/catalog/hitachi/101:uh02/1c6fe013-e139-4112-81a5-c01fc4591803/
2022-07-13 15:38:48,773 INFO ====== WebDriver manager ======
run_process called 6 times
2022-07-13 15:38:48,778 INFO ====== WebDriver manager ======
run_process called 7 times
2022-07-13 15:38:48,783 INFO ====== WebDriver manager ======
run_process called 8 times
2022-07-13 15:38:48,793 INFO ====== WebDriver manager ======
run_process called 9 times
2022-07-13 15:38:48,802 INFO ====== WebDriver manager ======
run_process called 10 times
Sleeping...
2022-07-13 15:38:48,947 INFO Current google-chrome version is 103.0.5060
2022-07-13 15:38:48,948 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:38:48,964 INFO Current google-chrome version is 103.0.5060
2022-07-13 15:38:48,964 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:38:48,967 INFO Current google-chrome version is 103.0.5060
2022-07-13 15:38:48,967 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:38:48,971 INFO Current google-chrome version is 103.0.5060
2022-07-13 15:38:48,973 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:38:48,989 INFO Current google-chrome version is 103.0.5060
2022-07-13 15:38:48,994 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:38:49,065 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:38:49,108 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:38:49,129 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:38:49,181 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:38:49,189 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
https://spare.avspart.com/catalog/hitachi/101:uh02/1d07c2a9-d4f8-4b50-a6bc-e64951cd7e8e/
https://spare.avspart.com/catalog/hitachi/101:uh02/3aa2c54f-154e-4aae-8f2a-efb05b471bfa/
https://spare.avspart.com/catalog/hitachi/101:uh02/3c0b42bb-c6c9-4f60-8c2e-d5258a703d76/
https://spare.avspart.com/catalog/hitachi/101:uh02/2780b803-2f37-4777-a5c6-97ea9e54137d/
https://spare.avspart.com/catalog/hitachi/101:uh02/47a76d4e-70b0-4b6d-9308-67b91a4619ad/
2022-07-13 15:39:49,816 INFO ====== WebDriver manager ======
2022-07-13 15:39:50,147 INFO Current google-chrome version is 103.0.5060
2022-07-13 15:39:50,148 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:39:50,368 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
https://spare.avspart.com/catalog/hitachi/101:uh02/540f4b09-795a-41de-9715-8825e296018b/
Elapsed run time: 2.27 minutes.
Calls to run_process: 10
Data
0
https://spare.avspart.com/catalog/hitachi/101:uh02/06e2437d-a240-49d0-ac8d-fc553bff6c53/
https://spare.avspart.com/catalog/hitachi/101:uh02/0d79c019-4621-4a47-8127-bd7baa5f0c0b/
https://spare.avspart.com/catalog/hitachi/101:uh02/1a7f894f-c1b8-456b-8ed3-bf78c60e4a71/
https://spare.avspart.com/catalog/hitachi/101:uh02/1c6fe013-e139-4112-81a5-c01fc4591803/
https://spare.avspart.com/catalog/hitachi/101:uh02/1d07c2a9-d4f8-4b50-a6bc-e64951cd7e8e/
https://spare.avspart.com/catalog/hitachi/101:uh02/2780b803-2f37-4777-a5c6-97ea9e54137d/
https://spare.avspart.com/catalog/hitachi/101:uh02/3aa2c54f-154e-4aae-8f2a-efb05b471bfa/
https://spare.avspart.com/catalog/hitachi/101:uh02/3c0b42bb-c6c9-4f60-8c2e-d5258a703d76/
https://spare.avspart.com/catalog/hitachi/101:uh02/47a76d4e-70b0-4b6d-9308-67b91a4619ad/
https://spare.avspart.com/catalog/hitachi/101:uh02/540f4b09-795a-41de-9715-8825e296018b/
https://spare.avspart.com/catalog/hitachi/101:uh02/57cefeb3-9dd2-4f99-a552-50dc452b6565/
https://spare.avspart.com/catalog/hitachi/101:uh02/58c4d3b6-9a15-4be0-8082-19980c2119fe/
https://spare.avspart.com/catalog/hitachi/101:uh02/5b2f40e4-a61f-4a3d-a15f-a41659595b28/
Here’s my attempt to implement Headless Mode.
def get_driver(headless):
    options = webdriver.ChromeOptions()
    if headless:
        options.add_argument("--headless")
    # initialize driver
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()), options=options
    )
    return driver
# script_concurrent.py
from concurrent.futures import ThreadPoolExecutor, wait
from time import sleep, time
from termcolor import colored
from random import randint
import threading
import datetime
import sys

from scrapers.scraper import get_driver, connect_to_base, parse_html, write_to_file


def counted(f):
    def wrapped(*args, **kwargs):
        wrapped.calls += 1
        return f(*args, **kwargs)

    wrapped.calls = 0
    return wrapped


def sleepy(f):
    def wrapped(*args, **kwargs):
        with lock:
            wrapped.calls += 1
            print(f"{f.__name__} called {wrapped.calls} times")
            if wrapped.calls % 20 == 0:
                print(colored("Sleeping...", "blue"))
                sleep(randint(60, 65))
        return f(*args, **kwargs)

    lock = threading.Lock()
    wrapped.calls = 0
    return wrapped


@counted
@sleepy
def run_process(filename, headless):
    # init browser
    browser = get_driver(headless)
    if connect_to_base(browser):
        sleep(2)
        html = browser.page_source
        output_list = parse_html(html)
        write_to_file(output_list, filename)
        # exit
        browser.quit()
    else:
        print("Error connecting to AVS")
        browser.quit()


if __name__ == "__main__":
    headless = False
    if len(sys.argv) > 1:
        if sys.argv[1] == "headless":
            print("Running in headless mode")
            headless = True
    start_time = time()
    output_timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    output_filename = f"Hitachi_{output_timestamp}.csv"
    futures = []
    with ThreadPoolExecutor() as executor:
        futures.extend(
            executor.submit(run_process, output_filename, headless)
            for _ in range(2, 202)
        )
    wait(futures)
    end_time = time()
    elapsed_time = end_time - start_time
    print(f"Elapsed run time: {elapsed_time / 60:.2f} minutes.")
    print(f"Calls to run_process: {run_process.calls}")
# scraper.py
import csv
import requests
import itertools
import pandas as pd
from pathlib import Path
from selenium import webdriver
from termcolor import colored
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

BASE_DIR = Path(__file__).resolve(strict=True).parent.parent


def csv_to_iter(filename, idx=0):
    # read one column of URLs from the CSV, de-duplicate and sort it,
    # then return an iterator over the URLs
    pd.set_option("display.max_rows", None)
    df = pd.read_csv(filename)
    df = df.iloc[:, [idx]]
    df = df.values.tolist()
    df = list(itertools.chain(*df))
    df = sorted(list(set(df)))
    return iter(df)


my_iter = csv_to_iter(
    filename="/Users/martinhewing/Downloads/Code/AVS-concurrent-web-scraping/Sorted_MAH_Hitachi_urls.csv"
)


def get_driver(headless):
    options = webdriver.ChromeOptions()
    if headless:
        options.add_argument("--headless")
    # initialize driver
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()), options=options
    )
    return driver


def connect_to_base(browser):
    my_next_iter = next(my_iter)
    connection_attempts = 0
    while connection_attempts < 3:
        try:
            browser.get(my_next_iter)
            print(colored(browser.current_url, "green"))
            # wait for the '.container' element to load
            # before returning True
            WebDriverWait(browser, 5).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, ".container"))
            )
            return True
        except Exception as e:
            print(e)
            connection_attempts += 1
            print(f"Error connecting to {my_next_iter}.")
            print(f"Attempt #{connection_attempts}.")
    return False


def parse_html(html):
    # create soup object
    soup = BeautifulSoup(html, "html.parser")
    # parse soup object to get part numbers, quantities, names, comments,
    # machine info, title and parts group
    # part_position = [
    #     item.text.strip() for item in soup.findAll("td", {"data-title": "Pos."})
    # ]
    # the "Part №" column header appears with two different encodings,
    # so look for both and keep whichever list is non-empty
    part_number_1 = [
        item.text.strip() for item in soup.findAll("td", {"data-title": "Part â„–"})
    ]
    part_number_2 = [
        item.text.strip() for item in soup.findAll("td", {"data-title": "Part №"})
    ]
    if not part_number_1:
        pass
    else:
        part_number = part_number_1
    if not part_number_2:
        pass
    else:
        part_number = part_number_2
    part_qty = [item.text.strip() for item in soup.findAll("td", {"data-title": "Qty"})]
    part_name = [
        item.text.strip() for item in soup.findAll("td", {"data-title": "Part name"})
    ]
    part_comments = [
        item.text.strip() for item in soup.findAll("td", {"data-title": "Comments"})
    ]
    machine = [
        item.text.split()[0] for item in soup.findAll("article", {"id": "node-content"})
    ]
    alternative_machines = [
        item.text.split()[2] for item in soup.findAll("article", {"id": "node-content"})
    ]
    title = [item.text for item in soup.findAll("span", {"class": "trans"})]
    parts_group = [item.h3 for item in soup.findAll("div", {"class": "card-header"})]
    article_info = {
        # "Pos.": part_position,
        "Part No": part_number,
        "Qty": part_qty,
        "Parts name": part_name,
        "Comments": part_comments,
        "Machine": machine,
        "Alternative_machines": alternative_machines,
        "Title": title,
        "Parts_group": parts_group,
    }
    return [article_info]


def get_load_time(article_url):
    try:
        # set headers
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"
        }
        # make get request to article_url
        response = requests.get(
            article_url, headers=headers, stream=True, timeout=3.000
        )
        # get page load time
        load_time = response.elapsed.total_seconds()
    except Exception as e:
        print(e)
        load_time = "Loading Error"
    return load_time


def write_to_file(output_list, filename):
    for row in output_list:
        with open(Path(BASE_DIR).joinpath(filename), "a") as csvfile:
            fieldnames = [
                "Pos.",
                "Part No",
                "Qty",
                "Parts name",
                "Comments",
                "Machine",
                "Alternative_machines",
                "Title",
                "Parts_group",
            ]
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writerow(row)
Output
Running in headless mode
run_process called 1 times
run_process called 2 times
run_process called 3 times
run_process called 4 times
run_process called 5 times
run_process called 6 times
run_process called 7 times
run_process called 8 times
run_process called 9 times
run_process called 10 times
run_process called 11 times
run_process called 12 times
run_process called 13 times
run_process called 14 times
run_process called 15 times
run_process called 16 times
run_process called 17 times
run_process called 18 times
run_process called 19 times
run_process called 20 times
Sleeping...
Data
0
https://spare.avspart.com/catalog/hitachi/101:uh02/06e2437d-a240-49d0-ac8d-fc553bff6c53/
https://spare.avspart.com/catalog/hitachi/101:uh02/0d79c019-4621-4a47-8127-bd7baa5f0c0b/
https://spare.avspart.com/catalog/hitachi/101:uh02/1a7f894f-c1b8-456b-8ed3-bf78c60e4a71/
https://spare.avspart.com/catalog/hitachi/101:uh02/1c6fe013-e139-4112-81a5-c01fc4591803/
https://spare.avspart.com/catalog/hitachi/101:uh02/1d07c2a9-d4f8-4b50-a6bc-e64951cd7e8e/
https://spare.avspart.com/catalog/hitachi/101:uh02/2780b803-2f37-4777-a5c6-97ea9e54137d/
https://spare.avspart.com/catalog/hitachi/101:uh02/3aa2c54f-154e-4aae-8f2a-efb05b471bfa/
https://spare.avspart.com/catalog/hitachi/101:uh02/3c0b42bb-c6c9-4f60-8c2e-d5258a703d76/
https://spare.avspart.com/catalog/hitachi/101:uh02/47a76d4e-70b0-4b6d-9308-67b91a4619ad/
https://spare.avspart.com/catalog/hitachi/101:uh02/540f4b09-795a-41de-9715-8825e296018b/
https://spare.avspart.com/catalog/hitachi/101:uh02/57cefeb3-9dd2-4f99-a552-50dc452b6565/
https://spare.avspart.com/catalog/hitachi/101:uh02/58c4d3b6-9a15-4be0-8082-19980c2119fe/
https://spare.avspart.com/catalog/hitachi/101:uh02/5b2f40e4-a61f-4a3d-a15f-a41659595b28/
When I run in headless mode there is no error, but there is no output either. I have reviewed similar questions; however, I am at a loss to understand what might be causing this. Please help :)
Answer
Website Blocking:
The website could be detecting that you are scraping it. There are a couple of different solutions you could try.
Change your user agent:
chrome_options.add_argument("user-agent=USER AGENT")
Replace "USER AGENT" with your own user-agent string, which you can find at this link: What is my user agent
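This matters in headless mode in particular because headless Chrome identifies itself with "HeadlessChrome" in its default user agent, which is an easy signal for a site to block on, and the headless window defaults to 800x600, which can hide the elements your scraper waits for. A minimal sketch of how you could fold both fixes into your get_driver helper (the user-agent string below is only a placeholder; substitute your own):

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


def get_driver(headless):
    options = webdriver.ChromeOptions()
    if headless:
        options.add_argument("--headless")
        # headless Chrome reports "HeadlessChrome/..." in its UA by default,
        # so spoof a normal desktop UA (placeholder string, paste your own)
        options.add_argument(
            "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
        )
        # give the headless window a realistic size instead of the 800x600 default
        options.add_argument("--window-size=1920,1080")
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()), options=options
    )
    return driver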
Selenium Stealth:
from selenium_stealth import stealth

stealth(driver,
        user_agent="USER AGENT",
        languages=["en-US", "en"],
        vendor="Google Inc.",
        platform="Win32",
        webgl_vendor="Intel Inc.",
        renderer="Intel Iris OpenGL Engine",
        fix_hairline=True,
        )
Selenium Stealth is a Python package that works alongside Selenium to help avoid detection. It patches key properties of the Selenium-driven browser (for example navigator.webdriver and the WebGL vendor/renderer strings) in order to bypass bot-detection software.
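As a rough sketch of how this could slot into the get_driver helper from your question (install the package with pip install selenium-stealth; the user-agent value is again a placeholder):

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium_stealth import stealth
from webdriver_manager.chrome import ChromeDriverManager


def get_driver(headless):
    options = webdriver.ChromeOptions()
    if headless:
        options.add_argument("--headless")
        options.add_argument("--window-size=1920,1080")
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()), options=options
    )
    # apply the stealth patches before the first page load so the site
    # never sees the default headless fingerprint
    stealth(
        driver,
        user_agent="USER AGENT",  # placeholder: substitute a real UA string
        languages=["en-US", "en"],
        vendor="Google Inc.",
        platform="Win32",
        webgl_vendor="Intel Inc.",
        renderer="Intel Iris OpenGL Engine",
        fix_hairline=True,
    )
    return driver

If the pages still come back empty, comparing browser.page_source (or a screenshot via browser.save_screenshot) between a headless and a headed run is a quick way to confirm whether the site is serving you a block page.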