Skip to content
Advertisement

How to iterate a variable in XPATH, extract a link and store it into a list for further iteration

I’m following a Selenium tutorial for an Amazon price tracker (Clever Programming on Youtube) and I got stuck at getting the links from amazon using their techniques.

tutorial link: https://www.youtube.com/watch?v=WbJeL_Av2-Q&t=4315s

I realized the problem laid on the fact that I’m only getting one link out of the 17 available after doing the product search. I need to get all the links for every product after doing a search and them use then to get into each product and get their title, seller and price.

funtion get_products_links() should get all links and stores them into a list to be used by the function get_product_info()

    def get_products_links(self):
    self.driver.get(self.base_url) # Go to amazon.com using BASE_URL
    element = self.driver.find_element_by_id('twotabsearchtextbox')
    element.send_keys(self.search_term)
    element.send_keys(Keys.ENTER)
    time.sleep(2) # Wait to load page
    self.driver.get(f'{self.driver.current_url}{self.price_filter}')
    time.sleep(2) # Wait to load page
    result_list = self.driver.find_elements_by_class_name('s-result-list')

    links = []
    try:
        ### Tying to get a list for Xpath links attributes ###
        ### Only numbers from 3 to 17 work after doing product search where 'i' is placed in the XPATH ###
        i = 3
        results = result_list[0].find_elements_by_xpath(
            f'//*[@id="search"]/div[1]/div[1]/div/span[3]/div[2]/div[{i}]/div/div/div/div/div/div[1]/div/div[2]/div/span/a')
        links = [link.get_attribute('href') for link in results]
        return links
    except Exception as e:
        print("Didn't get any products...")
        print(e)
        return links

At this point get_products_links() only returns one link since I just made ‘i’ a fixed value of 3 to make it work for now.

I was thinking to iterate ‘i’ in some sort so I can save every different PATHs but I don’t know how to implement this.

I’ve tried performing a for loop and append the result into a new list but them the app stops working

Here is the complete code:

from amazon_config import(
get_web_driver_options,
get_chrome_web_driver,
set_browser_as_incognito,
set_ignore_certificate_error,
NAME,
CURRENCY,
FILTERS,
BASE_URL,
DIRECTORY
)
import time
from selenium.webdriver.common.keys import Keys

class GenerateReport:
    def __init__(self):
    pass
class AmazonAPI:
def __init__(self, search_term, filters, base_url, currency):
    self.base_url = base_url
    self.search_term = search_term
    options = get_web_driver_options()
    set_ignore_certificate_error(options)
    set_browser_as_incognito(options)
    self.driver = get_chrome_web_driver(options)
    self.currency = currency
    self.price_filter = f"&rh=p_36%3A{filters['min']}00-{filters['max']}00"
    
def run(self):
    print("Starting script...")
    print(f"Looking for {self.search_term} products...")
    links = self.get_products_links()
    time.sleep(1)
    if not links:
        print("Stopped script.")
        return
    print(f"Got {len(links)} links to products...")
    print("Getting info about products...")
    products = self.get_products_info(links)

    # self.driver.quit()

def get_products_info(self, links):
    asins = self.get_asins(links)
    product = []
    for asin in asins:
        product = self.get_single_product_info(asin)

def get_single_product_info(self, asin):
    print(f"Product ID: {asin} - getting data...")
    product_short_url = self.shorten_url(asin)
    self.driver.get(f'{product_short_url}?language=en_GB')
    time.sleep(2)
    title = self.get_title()
    seller = self.get_seller()
    price = self.get_price()

def get_title(self):
    try:
        return self.driver.find_element_by_id('productTitle')
    except Exception as e:
        print(e)
        print(f"Can't get title of a product - {self.driver.current_url}")
        return None

def get_seller(self):
    try:
        return self.driver.find_element_by_id('bylineInfo')
    except Exception as e:
        print(e)
        print(f"Can't get title of a product - {self.driver.current_url}")
        return None

def get_price(self):
    return '$99'

def shorten_url(self, asin):
    return self.base_url + 'dp/' + asin

def get_asins(self, links):
    return [self.get_asin(link) for link in links]

def get_asin(self, product_link):
    return product_link[product_link.find('/dp/') + 4:product_link.find('/ref')]
    
def get_products_links(self):
    self.driver.get(self.base_url) # Go to amazon.com using BASE_URL
    element = self.driver.find_element_by_id('twotabsearchtextbox')
    element.send_keys(self.search_term)
    element.send_keys(Keys.ENTER)
    time.sleep(2) # Wait to load page
    self.driver.get(f'{self.driver.current_url}{self.price_filter}')
    time.sleep(2) # Wait to load page
    result_list = self.driver.find_elements_by_class_name('s-result-list')

    links = []
    try:
        ### Tying to get a list for Xpath links attributes ###
        ### Only numbers from 3 to 17 work after doing product search where 'i' is placed ###
        i = 3
        results = result_list[0].find_elements_by_xpath(
            f'//*[@id="search"]/div[1]/div[1]/div/span[3]/div[2]/div[{i}]/div/div/div/div/div/div[1]/div/div[2]/div/span/a')
            
        links = [link.get_attribute('href') for link in results]
        return links
    except Exception as e:
        print("Didn't get any products...")
        print(e)
        return links


  if __name__ == '__main__':
print("HEY!!!🚀🔥")
amazon = AmazonAPI(NAME, FILTERS, BASE_URL, CURRENCY)
amazon.run()

Steps to Run the script:

Step 1: install Selenium==3.141.0 into your virtual environment

Step 2: Search for Chrome Drivers on google and download the driver that matches you Chrome version. After download, extract the driver and paste it into your working folder

Step 3: create a file called amazon_config.py and insert the following code:

from selenium import webdriver

DIRECTORY = 'reports'
NAME = 'PS4'
CURRENCY = '$'
MIN_PRICE = '275'
MAX_PRICE = '650'
FILTERS = {
  'min': MIN_PRICE,
  'max': MAX_PRICE
}
BASE_URL = "https://www.amazon.com/"

def get_chrome_web_driver(options):
  return webdriver.Chrome('./chromedriver', chrome_options=options)

def get_web_driver_options():
  return webdriver.ChromeOptions()

def set_ignore_certificate_error(options):
  options.add_argument('--ignore-certificate-errors')

def set_browser_as_incognito(options):
  options.add_argument('--incognito')

If you performed the steps correctly you should be able to run the script and it will perform the following:

  1. Go to www.amazon.com
  2. Search for a product (In this case “PS4”)
  3. Get a link for the first product
  4. Visit that product link

Terminal should print:

HEY!!!🚀🔥
Starting script...
Looking for PS4 products...
Got 1 links to products...
Getting info about products...
Product ID: B012CZ41ZA - getting data...

What I’m not able to do is to get all links and iterate them so the script will visit all links in the first page

If you are able to get all links, the terminal should print:

HEY!!!🚀🔥
Starting script...
Looking for PS4 products...
Got 1 links to products...
Getting info about products...
Product ID: B012CZ41ZA - getting data...
Product ID: XXXXXXXXXX - getting data...
Product ID: XXXXXXXXXX - getting data...
Product ID: XXXXXXXXXX - getting data...
 # and so on until all links are visited 

Advertisement

Answer

I can’t run it so I only guess how I would do it.

I would put all try/except in for-loop, and use links.append() instead of links = [...], and I would use return after exiting loop

    # --- before loop ---
    
    links = []
    
    # --- loop ---
    
    for i in range(3, 18):
        try:
            results = result_list[0].find_elements_by_xpath(
            f'//*[@id="search"]/div[1]/div[1]/div/span[3]/div[2]/div[{i}]/div/div/div/div/div/div[1]/div/div[2]/div/span/a')
            
            for link in results:
                links.append(link.get_attribute('href'))
                
        except Exception as e:
            print(f"Didn't get any products... (i = {i})")
            print(e)
        
    # --- after loop ---
    
    return links

But I would also try to use xpath with // to skip most of divs – and maybe if I would skip div[{i}] then I could get all products without for-loop.


BTW:

In get_products_info() I see similar problem – you create empty list product = [] but later in loop you assing value to product = ... so you remove previous value from product. It would need product.append() to keep all values.

Something like

def get_products_info(self, links):

    # --- before loop ---
    asins = self.get_asins(links)
    product = []

    # --- loop ---
    for asin in asins:
        product.append( self.get_single_product_info(asin) )

    # --- after loop ---
    return product
User contributions licensed under: CC BY-SA
3 People found this is helpful
Advertisement