Issue with web scraping from website for capturing pagination links

I am trying to scrape data from all of the category URLs listed on the home page (done) and from their sub-category pages, along with the pagination links of those pages. The URL is here

I have created a Python script for this, structured in modules, because I need the output of each step written to a separate file. Right now I am stuck on extracting all of the pagination URLs from which the data will be fetched afterwards. Also, instead of data from all listed sub-category URLs, I am getting data from the first sub-category URL only.

For example, in the script below, data from

General Practice (main category page) – http://www.medicalexpo.com/cat/general-practice-K.html and then Stethoscope (sub-category page) – http://www.medicalexpo.com/medical-manufacturer/stethoscope-2.html

is the only data coming through. I want data from all of the sub-category links listed on this link.

Any help would be appreciated in getting the desired output: product URLs from all of the listed sub-category pages.

Below is the code:

import re
import time
import random
import selenium.webdriver.support.ui as ui
from selenium.common.exceptions import TimeoutException, NoSuchElementException 
from selenium import webdriver 
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from lxml import html  
from bs4 import BeautifulSoup
from datetime import datetime
import csv
import os
from fake_useragent import UserAgent
from fake_useragent.errors import FakeUserAgentError

# Function to write data to a file:
def write_to_file(file, mode, data, newline=None, with_tab=None):
    with open(file, mode, encoding='utf-8') as l:
        if with_tab == True:
            data = ''.join(data)
        if newline == True:
            data = data + '\n'
        l.write(data)

# Function for data from Module 1:
def send_link(link1):
    browser = webdriver.Chrome()
    browser.get(link1)
    current_page = browser.current_url 
    print (current_page) 
    soup = BeautifulSoup(browser.page_source,"lxml")
    tree = html.fromstring(str(soup))

# Added try and except in order to skip/pass attributes without any value.
    try:
        main_category_url = browser.find_elements_by_xpath('//li[@class="univers-group-item"]/span/a[1][@href]')
        main_category_url = [i.get_attribute("href") for i in main_category_url[4:]]
        print(len(main_category_url))

    except NoSuchElementException:
        main_category_url = ''

    for index, data in enumerate(main_category_url):
        with open('Module_1_OP.tsv', 'a', encoding='utf-8') as outfile:
            data = (main_category_url[index] + "\n")
            outfile.write(data)

# Data Extraction for Categories under HEADERS:
    try:
        sub_category_url = browser.find_elements_by_xpath('//li[@class="category-group-item"]/a[1][@href]')
        sub_category_url = [i.get_attribute("href") for i in sub_category_url[:]]
        print(len(sub_category_url))
    except NoSuchElementException:
        sub_category_url = ''

    for index, data in enumerate(sub_category_url):
        with open('Module_1_OP.tsv', 'a', encoding='utf-8') as outfile:
            data = (sub_category_url[index] + "\n")
            outfile.write(data)
            
    csvfile = open("Module_1_OP.tsv") 
    csvfilelist = csvfile.readlines()
    send_link2(csvfilelist)

# Function for data from Module 2:
def send_link2(links2): 
    browser = webdriver.Chrome()
    start = 7
    end = 10
    for link2 in (links2[start:end]):    
        print(link2) 

        ua = UserAgent() 
        try:
            ua = UserAgent()
        except FakeUserAgentError:
            pass

        ua.random == 'Chrome'

        proxies = [] 

        t0 = time.time()
        response_delay = time.time() - t0 
        time.sleep(10*response_delay) 
        time.sleep(random.randint(2,5)) 
        browser.get(link2) 
        current_page = browser.current_url 
        print (current_page) 
        soup = BeautifulSoup(browser.page_source,"lxml")
        tree = html.fromstring(str(soup))

        # Added try and except in order to skip/pass attributes without value.
        try:
            product_url = browser.find_elements_by_xpath('//ul[@class="category-grouplist"]/li/a[1][@href]')
            product_url = [i.get_attribute("href") for i in product_url]
            print(len(product_url))
        except NoSuchElementException:
            product_url = ''

        try:
            product_title = browser.find_elements_by_xpath('//ul[@class="category-grouplist"]/li/a[1][@href]') # Use find_elements (plural) to extract data from multiple elements
            product_title = [i.text for i in product_title[:]]
            print(product_title)
        except NoSuchElementException:
            product_title = ''
        
        for index, data2 in enumerate(product_title):
            with open('Module_1_2_OP.tsv', 'a', encoding='utf-8') as outfile:
                data2 = (current_page + "\t" + product_url[index] + "\t" + product_title[index] + "\n")
                outfile.write(data2)

        for index, data3 in enumerate(product_title):
            with open('Module_1_2_OP_URL.tsv', 'a', encoding='utf-8') as outfile:
                data3 = (product_url[index] + "\n")
                outfile.write(data3)

        csvfile = open("Module_1_2_OP_URL.tsv")
        csvfilelist = csvfile.readlines()
        send_link3(csvfilelist)

# Function for data from Module 3:
def send_link3(csvfilelist): 
    browser = webdriver.Chrome()
    for link3 in csvfilelist[:3]:
        print(link3) 
        browser.get(link3) 
        time.sleep(random.randint(2,5))
        current_page = browser.current_url 
        print (current_page) 
        soup = BeautifulSoup(browser.page_source,"lxml")
        tree = html.fromstring(str(soup))

        try:
            pagination = browser.find_elements_by_xpath('//div[@class="pagination-wrapper"]/a[@href]')
            pagination = [i.get_attribute("href") for i in pagination]
            print(pagination)

        except NoSuchElementException:
            pagination = ''

        for index, data2 in enumerate(pagination):
            with open('Module_1_2_3_OP.tsv', 'a', encoding='utf-8') as outfile:
                data2 = (current_page + "\n" + pagination[index] + "\n")
                outfile.write(data2)

        dataset = open("Module_1_2_3_OP.tsv") 
        dataset_dup = dataset.readlines() 
        duplicate(dataset_dup)

# Used to remove duplicate records from a List:
def duplicate(dataset):
    dup_items = set()
    uniq_items = []
    for x in dataset:
        if x not in dup_items:
            uniq_items.append(x)
            dup_items.add(x)
            write_to_file('Listing_pagination_links.tsv','w', dup_items, newline=True, with_tab=True)

    csvfile = open("Listing_pagination_links.tsv") 
    csvfilelist = csvfile.readlines()
    send_link4(csvfilelist)

# Function for data from Module 4:
def send_link4(links3):
    browser = webdriver.Chrome()
    for link3 in links3:
      print(link3)
      browser.get(link3) 
      t0 = time.time()
      response_delay = time.time() - t0 
      time.sleep(10*response_delay) 
      time.sleep(random.randint(2,5)) 
      sub_category_page = browser.current_url 
      print (sub_category_page) 
      soup = BeautifulSoup(browser.page_source,"lxml")
      tree = html.fromstring(str(soup))

      # Added try and except in order to skip/pass attributes without value.
      try:
        product_url1 = browser.find_elements_by_xpath('//div[@class="inset-caption price-container"]/a[1][@href]')
        product_url1 = [i.get_attribute("href") for i in product_url1]
        print(len(product_url1))
      except NoSuchElementException:
        product_url1 = ''

      for index, data in enumerate(product_url1):
        with open('Final_Output_' + datestring + '.tsv', 'a', encoding='utf-8') as outfile:
          data = (sub_category_page + "\t" + product_url1[index] + "\n")
          outfile.write(data)

# PROGRAM STARTS EXECUTING FROM HERE...
# Added to attach Real Date and Time field to Output filename
datestring = datetime.strftime(datetime.now(), '%Y-%m-%d-%H-%M-%S') # For filename
#datestring2 = datetime.strftime(datetime.now(), '%H-%M-%S') # For each record

send_link("http://www.medicalexpo.com/")

Answer

You actually don't need Selenium for this at all. The code below will fetch the categories, sub-categories and item links, names, and descriptions for everything on the site.

The only tricky part is the while loop that handles the pagination. The principle is: if there is a "next" button present on the page, we need to load more content. In this case the site actually gives us the URL of the following page in the "next" element, so it's easy to iterate until there are no more next links to retrieve.

Keep in mind, though, that this might take a while to run. Keep in mind, too, that you should probably insert a sleep – e.g. of 1 second – between each request in the while loop to treat the server nicely.

Doing so reduces your risk of getting banned or something similar.
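
For illustration, here is a minimal, self-contained sketch of just that pagination-following loop with a one-second pause between requests. It assumes the same markup as the full script below (a "pagination-wrapper" div whose "next" element carries the href of the following page); the starting URL is only an example.

import requests
from bs4 import BeautifulSoup
from time import sleep

# Example starting page; any listing page with a pagination bar works the same way.
page_url = "http://www.medicalexpo.com/medical-manufacturer/stethoscope-2.html"

while page_url:
    r = requests.get(page_url)
    soup = BeautifulSoup(r.text, "lxml")

    # ... extract the product links from the current page here ...

    # Look for the "next" link inside the pagination bar (assumed markup).
    pagination = soup.find(class_="pagination-wrapper")
    next_link = pagination.find(class_="next") if pagination else None

    # Follow it if present, otherwise stop; pause to treat the server nicely.
    page_url = next_link.get("href") if next_link else None
    sleep(1)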

import requests
from bs4 import BeautifulSoup
from time import sleep

items_list = [] # list of dictionaries with this content: category, sub_category, item_description, item_name, item_link 

r = requests.get("http://www.medicalexpo.com/")
soup = BeautifulSoup(r.text, "lxml")
cat_items = soup.find_all('li', class_="category-group-item")
cat_items = [[cat_item.get_text().strip(),cat_item.a.get('href')] for cat_item in cat_items]

# cat_items is now a list with elements like this:
# ['General practice','http://www.medicalexpo.com/cat/general-practice-K.html']
# to access the next level, we loop:

for category, category_link in cat_items[:1]:
    print("[*] Extracting data for category: {}".format(category))

    r = requests.get(category_link)
    soup = BeautifulSoup(r.text, "lxml")
    # data of all sub_categories are located in an element with the id 'category-group'
    cat_group = soup.find('div', attrs={'id': 'category-group'})

    # the data lie in 'li'-tags
    li_elements = cat_group.find_all('li')
    sub_links = [[li.a.get('href'), li.get_text().strip()] for li in li_elements]

    # sub_links is now a list of elements like this:
    # ['http://www.medicalexpo.com/medical-manufacturer/stethoscope-2.html', 'Stethoscopes']

    # to access the last level we need to dig further in with a loop
    for sub_category_link, sub_category in sub_links:
        print("  [-] Extracting data for sub_category: {}".format(sub_category))
        local_count = 0
        load_page = True
        item_url = sub_category_link
        while load_page:
            print("     [-] Extracting data for item_url: {}".format(item_url))
            r = requests.get(item_url)
            soup = BeautifulSoup(r.text, "lxml")
            item_links = soup.find_all('div', class_="inset-caption price-container")[2:]
            for item in item_links:
                item_name = item.a.get_text().strip().split('\n')[0]
                item_link = item.a.get('href')
                try:
                    item_description = item.a.get_text().strip().split('\n')[1]
                except:
                    item_description = None
                item_dict = {
                    "category": category,
                    "subcategory": sub_category,
                    "item_name": item_name,
                    "item_link": item_link,
                    "item_description": item_description
                }
                items_list.append(item_dict)
                local_count +=1
            # all item pages have a pagination element
            # if there are more pages to load, it will contain a "next" class
            # if we are on the last page, there is no "next" class and "next_link" will be None
            pagination = soup.find(class_="pagination-wrapper")
            try:
                next_link = pagination.find(class_="next").get('href', None)
            except:
                next_link = None
            # consider inserting a sleep(1) right about here...
            # if the next_link exists it means that there are more pages to load
            # we'll then set the item_url = next_link and the While-loop will continue
            if next_link is not None:
                item_url = next_link
            else:
                load_page = False
        print("      [-] a total of {} item_links extracted for this sub_category".format(local_count))

# this will yield a list of dicts like this one:

# {'category': 'General practice',
#  'item_description': 'Flac duo',
#  'item_link': 'http://www.medicalexpo.com/prod/boso-bosch-sohn/product-67891-821119.html',
#  'item_name': 'single-head stethoscope',
#  'subcategory': 'Stethoscopes'}

# If you need to export to something like Excel, use pandas: create a DataFrame and simply load it with the list.
# pandas can then export the data to Excel easily...
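
As a rough sketch of that export step (assuming pandas and an Excel writer such as openpyxl are installed; the filename is just an example):

import pandas as pd

# Build a DataFrame straight from the list of dicts collected above.
df = pd.DataFrame(items_list)

# Write it out; df.to_csv("medicalexpo_items.csv", index=False) works too.
df.to_excel("medicalexpo_items.xlsx", index=False)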