I’m following a Selenium tutorial for an Amazon price tracker (Clever Programming on YouTube) and I got stuck getting the links from Amazon using their technique.
tutorial link: https://www.youtube.com/watch?v=WbJeL_Av2-Q&t=4315s
I realized the problem lies in the fact that I’m only getting one link out of the 17 available after doing the product search. I need to get all the links for every product after doing a search and then use them to visit each product and get its title, seller and price.
The function get_products_links() should get all the links and store them in a list to be used by the function get_products_info().
def get_products_links(self):
    self.driver.get(self.base_url)  # Go to amazon.com using BASE_URL
    element = self.driver.find_element_by_id('twotabsearchtextbox')
    element.send_keys(self.search_term)
    element.send_keys(Keys.ENTER)
    time.sleep(2)  # Wait to load page
    self.driver.get(f'{self.driver.current_url}{self.price_filter}')
    time.sleep(2)  # Wait to load page
    result_list = self.driver.find_elements_by_class_name('s-result-list')
    links = []
    try:
        ### Trying to get a list of XPath link attributes ###
        ### Only numbers from 3 to 17 work after doing the product search where 'i' is placed in the XPATH ###
        i = 3
        results = result_list[0].find_elements_by_xpath(
            f'//*[@id="search"]/div[1]/div[1]/div/span[3]/div[2]/div[{i}]/div/div/div/div/div/div[1]/div/div[2]/div/span/a')
        links = [link.get_attribute('href') for link in results]
        return links
    except Exception as e:
        print("Didn't get any products...")
        print(e)
        return links
At this point get_products_links() only returns one link since I just made ‘i’ a fixed value of 3 to make it work for now.
I was thinking of iterating ‘i’ somehow so I can save each of the different XPaths, but I don’t know how to implement this.
I’ve tried performing a for loop and appending the results to a new list, but then the app stops working.
Here is the complete code:
from amazon_config import (
    get_web_driver_options,
    get_chrome_web_driver,
    set_browser_as_incognito,
    set_ignore_certificate_error,
    NAME,
    CURRENCY,
    FILTERS,
    BASE_URL,
    DIRECTORY
)
import time
from selenium.webdriver.common.keys import Keys


class GenerateReport:
    def __init__(self):
        pass


class AmazonAPI:
    def __init__(self, search_term, filters, base_url, currency):
        self.base_url = base_url
        self.search_term = search_term
        options = get_web_driver_options()
        set_ignore_certificate_error(options)
        set_browser_as_incognito(options)
        self.driver = get_chrome_web_driver(options)
        self.currency = currency
        self.price_filter = f"&rh=p_36%3A{filters['min']}00-{filters['max']}00"

    def run(self):
        print("Starting script...")
        print(f"Looking for {self.search_term} products...")
        links = self.get_products_links()
        time.sleep(1)
        if not links:
            print("Stopped script.")
            return
        print(f"Got {len(links)} links to products...")
        print("Getting info about products...")
        products = self.get_products_info(links)
        # self.driver.quit()

    def get_products_info(self, links):
        asins = self.get_asins(links)
        product = []
        for asin in asins:
            product = self.get_single_product_info(asin)

    def get_single_product_info(self, asin):
        print(f"Product ID: {asin} - getting data...")
        product_short_url = self.shorten_url(asin)
        self.driver.get(f'{product_short_url}?language=en_GB')
        time.sleep(2)
        title = self.get_title()
        seller = self.get_seller()
        price = self.get_price()

    def get_title(self):
        try:
            return self.driver.find_element_by_id('productTitle')
        except Exception as e:
            print(e)
            print(f"Can't get title of a product - {self.driver.current_url}")
            return None

    def get_seller(self):
        try:
            return self.driver.find_element_by_id('bylineInfo')
        except Exception as e:
            print(e)
            print(f"Can't get title of a product - {self.driver.current_url}")
            return None

    def get_price(self):
        return '$99'

    def shorten_url(self, asin):
        return self.base_url + 'dp/' + asin

    def get_asins(self, links):
        return [self.get_asin(link) for link in links]

    def get_asin(self, product_link):
        return product_link[product_link.find('/dp/') + 4:product_link.find('/ref')]

    def get_products_links(self):
        self.driver.get(self.base_url)  # Go to amazon.com using BASE_URL
        element = self.driver.find_element_by_id('twotabsearchtextbox')
        element.send_keys(self.search_term)
        element.send_keys(Keys.ENTER)
        time.sleep(2)  # Wait to load page
        self.driver.get(f'{self.driver.current_url}{self.price_filter}')
        time.sleep(2)  # Wait to load page
        result_list = self.driver.find_elements_by_class_name('s-result-list')
        links = []
        try:
            ### Trying to get a list of XPath link attributes ###
            ### Only numbers from 3 to 17 work after doing the product search where 'i' is placed ###
            i = 3
            results = result_list[0].find_elements_by_xpath(
                f'//*[@id="search"]/div[1]/div[1]/div/span[3]/div[2]/div[{i}]/div/div/div/div/div/div[1]/div/div[2]/div/span/a')
            links = [link.get_attribute('href') for link in results]
            return links
        except Exception as e:
            print("Didn't get any products...")
            print(e)
            return links


if __name__ == '__main__':
    print("HEY!!!🚀🔥")
    amazon = AmazonAPI(NAME, FILTERS, BASE_URL, CURRENCY)
    amazon.run()
Steps to Run the script:
Step 1: install Selenium==3.141.0 into your virtual environment
Step 2: Search for Chrome drivers on Google and download the driver that matches your Chrome version. After downloading, extract the driver and paste it into your working folder
Step 3: create a file called amazon_config.py and insert the following code:
from selenium import webdriver

DIRECTORY = 'reports'
NAME = 'PS4'
CURRENCY = '$'
MIN_PRICE = '275'
MAX_PRICE = '650'
FILTERS = {
    'min': MIN_PRICE,
    'max': MAX_PRICE
}
BASE_URL = "https://www.amazon.com/"


def get_chrome_web_driver(options):
    return webdriver.Chrome('./chromedriver', chrome_options=options)


def get_web_driver_options():
    return webdriver.ChromeOptions()


def set_ignore_certificate_error(options):
    options.add_argument('--ignore-certificate-errors')


def set_browser_as_incognito(options):
    options.add_argument('--incognito')
If you performed the steps correctly you should be able to run the script and it will perform the following:
- Go to www.amazon.com
- Search for a product (In this case “PS4”)
- Get a link for the first product
- Visit that product link
Terminal should print:
HEY!!!🚀🔥
Starting script...
Looking for PS4 products...
Got 1 links to products...
Getting info about products...
Product ID: B012CZ41ZA - getting data...
What I’m not able to do is get all the links and iterate over them so the script visits every link on the first page.
If you are able to get all links, the terminal should print:
HEY!!!🚀🔥
Starting script...
Looking for PS4 products...
Got 17 links to products...
Getting info about products...
Product ID: B012CZ41ZA - getting data...
Product ID: XXXXXXXXXX - getting data...
Product ID: XXXXXXXXXX - getting data...
Product ID: XXXXXXXXXX - getting data...
# and so on until all links are visited
Answer
I can’t run it, so I can only guess how I would do it.
I would put the whole try/except inside a for-loop, use links.append() instead of links = [...], and use return only after exiting the loop.
# --- before loop ---
links = []

# --- loop ---
for i in range(3, 18):
    try:
        results = result_list[0].find_elements_by_xpath(
            f'//*[@id="search"]/div[1]/div[1]/div/span[3]/div[2]/div[{i}]/div/div/div/div/div/div[1]/div/div[2]/div/span/a')
        for link in results:
            links.append(link.get_attribute('href'))
    except Exception as e:
        print(f"Didn't get any products... (i = {i})")
        print(e)

# --- after loop ---
return links
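That way a failure for one value of i only skips that single result instead of aborting the whole function, and whatever links were collected so far still get returned.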
But I would also try to use an XPath with // to skip most of the divs – and maybe, if I could skip div[{i}] entirely, I could get all products without the for-loop.
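For example, something like this (untested – the attribute data-component-type="s-search-result" is my assumption about Amazon's current search-result markup, so the selector may need adjusting) could collect all the links in one query:

# Untested sketch: each result card on the search page is assumed to carry
# data-component-type="s-search-result", with the product link inside its <h2>.
results = self.driver.find_elements_by_xpath(
    '//div[@data-component-type="s-search-result"]//h2/a'
)
links = [link.get_attribute('href') for link in results]
return links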
BTW: in get_products_info() I see a similar problem – you create an empty list product = [], but later in the loop you assign a new value with product = ..., so you overwrite the previous value of product on every iteration. It would need product.append() to keep all values.
Something like this:
def get_products_info(self, links):
    # --- before loop ---
    asins = self.get_asins(links)
    product = []

    # --- loop ---
    for asin in asins:
        product.append(self.get_single_product_info(asin))

    # --- after loop ---
    return product
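One more thing: product.append(...) only keeps useful values if get_single_product_info() actually returns something – in the posted code it implicitly returns None. An untested sketch, reusing your existing helpers, could return a dict:

def get_single_product_info(self, asin):
    print(f"Product ID: {asin} - getting data...")
    product_short_url = self.shorten_url(asin)
    self.driver.get(f'{product_short_url}?language=en_GB')
    time.sleep(2)  # Wait to load page
    title = self.get_title()
    seller = self.get_seller()
    price = self.get_price()
    # Return the collected data so get_products_info() can append it to the list
    return {'asin': asin, 'title': title, 'seller': seller, 'price': price}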