Skip to content
Advertisement

Making multiple requests at the same time with the `requests` module and printing the output to Excel (Python)

Hi, this is quite a complicated question. Starting from this script:

# Question's script, repaired: the post mixed 3- and 4-space indentation
# inside one loop body (a syntax error), and the site stripped the
# backslashes from the escapes (s--sDatas--s was presumably
# \s--\sData\s--\s -- confirm against the live page).
workbook = xlsxwriter.Workbook("test.xlsx")
worksheet = workbook.add_worksheet("Stocks")

stock = []  # here goes a list of 2000+ stock tickers as strings
sector = []
peg_ratio = []
foward_eps = []

# Loop invariants hoisted: the URL templates, headers and compiled
# pattern are identical on every iteration.
url_profile = 'https://finance.yahoo.com/quote/{}/profile?p={}'
url_stats = 'https://finance.yahoo.com/quote/{}/key-statistics?p={}'
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"}
# Matches the " -- Data -- " marker in the <script> tag carrying the page JSON.
pattern = re.compile(r'\s--\sData\s--\s')

for idx, ticker in enumerate(stock):
    # --- profile page: sector/industry ---
    response = requests.get(url_profile.format(ticker, ticker), headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    script_data = soup.find('script', text=pattern).contents[0]
    # Magic offsets from the original post: the JSON apparently starts 2
    # chars before "context" and the tag ends with 12 trailing chars of JS.
    start = script_data.find("context") - 2
    json_data = json.loads(script_data[start:-12])

    try:
        sector.append(json_data['context']['dispatcher']['stores']['QuoteSummaryStore']['assetProfile']['industry'])
    except (KeyError, TypeError):
        sector.append("Error")

    # --- key-statistics page: PEG ratio and forward EPS ---
    response = requests.get(url_stats.format(ticker, ticker), headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    script_data = soup.find('script', text=pattern).contents[0]
    start = script_data.find("context") - 2
    json_data = json.loads(script_data[start:-12])

    try:
        peg_ratio.append(json_data['context']['dispatcher']['stores']['QuoteSummaryStore']['defaultKeyStatistics']['pegRatio']['fmt'])
    except (KeyError, TypeError):
        peg_ratio.append("Error")

    try:
        foward_eps.append(json_data['context']['dispatcher']['stores']['QuoteSummaryStore']['defaultKeyStatistics']['forwardEps']['fmt'])
    except (KeyError, TypeError):
        foward_eps.append("Error")

    # One row per ticker: A=ticker, B=sector, C=forward EPS, D=PEG ratio.
    worksheet.write("A" + str(idx + 1), ticker)
    worksheet.write("B" + str(idx + 1), sector[idx])
    worksheet.write("C" + str(idx + 1), foward_eps[idx])
    worksheet.write("D" + str(idx + 1), peg_ratio[idx])
workbook.close()

The code by itself does what it is supposed to do (getting the data — forward EPS, PEG ratio, sector — and pasting it into an Excel file), but the issue is that it takes a lot of time, and the stock list is quite long (2531 elements). Is there a way to make this code more efficient or faster?

I have attempted to follow the instructions from this video: https://www.youtube.com/watch?v=nFn4_nA_yk8 But I still need to write the information for every single stock into the Excel file. Is there a way I can optimize all of this? Maybe by sending multiple requests at the same time and writing the data to the Excel file at a different time? The only end goal is to make the whole process as fast as possible. Thanks in advance (if you need any other information, leave a comment and I will answer as soon as possible).

Advertisement

Answer

First, you have to put the code in a function.

# --- globals ---

url_profile = 'https://finance.yahoo.com/quote/{}/profile?p={}'
url_stats = 'https://finance.yahoo.com/quote/{}/key-statistics?p={}'

headers = {
   "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"
}

# Matches the " -- Data -- " marker in the <script> tag carrying the page
# JSON (escapes restored; the post showed them stripped: s--sDatas--s).
pattern = re.compile(r'\s--\sData\s--\s')

# --- functions ---

def process(number, stock_name):
    """Scrape sector, forward EPS and PEG ratio for one ticker.

    number     -- 1-based row number, used as the key in `results`.
    stock_name -- ticker symbol, e.g. 'AAPL'.

    Stores (stock_name, sector, foward_eps, peg_ratio) in the global
    `results` dict (for the threaded caller) and also returns it (for
    the sequential caller). Fields that cannot be extracted become the
    string "Error".
    """
    # '\n' restored (the post showed a literal 'n').
    print(f'{number} {stock_name}\n', end='', flush=True)

    url = url_profile.format(stock_name, stock_name)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    script_data = soup.find('script', text=pattern).contents[0]
    # Magic offsets from the original post: the JSON apparently starts 2
    # chars before "context"; the last 12 chars are trailing javascript.
    start = script_data.find("context") - 2
    data = json.loads(script_data[start:-12])

    try:
        sector = data['context']['dispatcher']['stores']['QuoteSummaryStore']['assetProfile']['industry']
    except (KeyError, TypeError):
        sector = "Error"

    url = url_stats.format(stock_name, stock_name)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    script_data = soup.find('script', text=pattern).contents[0]
    start = script_data.find("context") - 2
    data = json.loads(script_data[start:-12])

    try:
        peg_ratio = data['context']['dispatcher']['stores']['QuoteSummaryStore']['defaultKeyStatistics']['pegRatio']['fmt']
    except (KeyError, TypeError):
        peg_ratio = "Error"

    try:
        foward_eps = data['context']['dispatcher']['stores']['QuoteSummaryStore']['defaultKeyStatistics']['forwardEps']['fmt']
    except (KeyError, TypeError):
        foward_eps = "Error"

    # BUG FIX: the original tuple was (stock_name, sector, foward_eps,
    # foward_eps, peg_ratio) - foward_eps twice - so the caller's data[3]
    # (column D, labelled peg_ratio) actually held forward EPS.
    result = (stock_name, sector, foward_eps, peg_ratio)

    # return data - for thread (a thread can't return a value directly)
    results[number] = result

    # return data - for normal (sequential) execution
    return result

And next you can run it the old (sequential) way:

stock = ['AAPL', 'GOOG', 'TESL', 'MSFT', 'AAPL', 'GOOG', 'TESL', 'MSFT']

_start = time.time()

results = {}

workbook = xlsxwriter.Workbook("test.xlsx")
worksheet = workbook.add_worksheet("Stocks")

# Sequential version: fetch one ticker at a time and write its row
# immediately. Columns: A=ticker, B=sector, C=forward EPS, D=PEG ratio.
row = 1
for stock_name in stock:
    data = process(row, stock_name)
    for column, value in zip('ABCD', data):
        worksheet.write(f"{column}{row}", value)
    row += 1

workbook.close()

_end = time.time()

print(_end - _start)

and this gives me a time of ~15s (but sometimes even ~32s).

And now you can use threading to run the same function with different values at the same time.

Because a thread can’t return its result directly, I use the global dictionary `results` for this (threads share memory).

stock = ['AAPL', 'GOOG', 'TESL', 'MSFT', 'AAPL', 'GOOG', 'TESL', 'MSFT']

_start = time.time()

threads = []
results = {}

workbook = xlsxwriter.Workbook("test.xlsx")
worksheet = workbook.add_worksheet("Stocks")

# Spawn one worker thread per ticker; each stores its tuple in `results`.
for number, stock_name in enumerate(stock, 1):
    worker = threading.Thread(target=process, args=(number, stock_name))
    worker.start()
    threads.append(worker)

# Block until every worker has finished.
for worker in threads:
    worker.join()

# Write the collected rows. The dict key pins the row number, so the
# order in which the threads finished does not matter.
for number, data in results.items():
    for column, value in zip('ABCD', data):
        worksheet.write(f"{column}{number}", value)

workbook.close()

_end = time.time()

print(_end - _start)

And this gives me a time of ~6s.

For more stocks it would be better to use a thread pool (e.g. `multiprocessing.pool.ThreadPool`) so that only a few threads run at the same time — starting 2000+ threads at once is not a good idea.


Full working code

import requests
import time
import xlsxwriter
import re
from bs4 import BeautifulSoup
import json
import threading

# --- globals ---

url_profile = 'https://finance.yahoo.com/quote/{}/profile?p={}'
url_stats = 'https://finance.yahoo.com/quote/{}/key-statistics?p={}'

headers = {
   "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"
}

# Matches the " -- Data -- " marker in the <script> tag carrying the page
# JSON (escapes restored; the post showed them stripped: s--sDatas--s).
pattern = re.compile(r'\s--\sData\s--\s')

# --- functions ---

def process(number, stock_name):
    """Scrape sector, forward EPS and PEG ratio for one ticker.

    number     -- 1-based row number, used as the key in `results`.
    stock_name -- ticker symbol, e.g. 'AAPL'.

    Stores (stock_name, sector, foward_eps, peg_ratio) in the global
    `results` dict (for the threaded caller) and also returns it (for
    the sequential caller). Fields that cannot be extracted become the
    string "Error".
    """
    # '\n' restored (the post showed a literal 'n').
    print(f'{number} {stock_name}\n', end='', flush=True)

    url = url_profile.format(stock_name, stock_name)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    script_data = soup.find('script', text=pattern).contents[0]
    # Magic offsets from the original post: the JSON apparently starts 2
    # chars before "context"; the last 12 chars are trailing javascript.
    start = script_data.find("context") - 2
    data = json.loads(script_data[start:-12])

    try:
        sector = data['context']['dispatcher']['stores']['QuoteSummaryStore']['assetProfile']['industry']
    except (KeyError, TypeError):
        sector = "Error"

    url = url_stats.format(stock_name, stock_name)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    script_data = soup.find('script', text=pattern).contents[0]
    start = script_data.find("context") - 2
    data = json.loads(script_data[start:-12])

    try:
        peg_ratio = data['context']['dispatcher']['stores']['QuoteSummaryStore']['defaultKeyStatistics']['pegRatio']['fmt']
    except (KeyError, TypeError):
        peg_ratio = "Error"

    try:
        foward_eps = data['context']['dispatcher']['stores']['QuoteSummaryStore']['defaultKeyStatistics']['forwardEps']['fmt']
    except (KeyError, TypeError):
        foward_eps = "Error"

    # BUG FIX: the original tuple was (stock_name, sector, foward_eps,
    # foward_eps, peg_ratio) - foward_eps twice - so the caller's data[3]
    # (column D, labelled peg_ratio) actually held forward EPS.
    result = (stock_name, sector, foward_eps, peg_ratio)

    # return data - for thread (a thread can't return a value directly)
    results[number] = result

    # return data - for normal (sequential) execution
    return result

# --- main ---

stock = [
    'AAPL', 'GOOG', 'TESL', 'MSFT',
    'AAPL', 'GOOG', 'TESL', 'MSFT',
    'AAPL', 'GOOG', 'TESL', 'MSFT',
    'AAPL', 'GOOG', 'TESL', 'MSFT',
]

# --- old version ---

_start = time.time()

results = {}

workbook = xlsxwriter.Workbook("test.xlsx")
worksheet = workbook.add_worksheet("Stocks")

# Sequential baseline: fetch one ticker at a time and write its row
# immediately. Columns: A=ticker, B=sector, C=forward EPS, D=PEG ratio.
row = 1
for stock_name in stock:
    data = process(row, stock_name)
    for column, value in zip('ABCD', data):
        worksheet.write(f"{column}{row}", value)
    row += 1

workbook.close()

_end = time.time()

print(_end - _start)

# --- new version ---

_start = time.time()

threads = []
results = {}

workbook = xlsxwriter.Workbook("test.xlsx")
worksheet = workbook.add_worksheet("Stocks")

# spawn one worker thread per ticker
for number, stock_name in enumerate(stock, 1):
    worker = threading.Thread(target=process, args=(number, stock_name))
    worker.start()
    threads.append(worker)

# block until every worker has finished
for worker in threads:
    worker.join()

# write the collected rows; the dict key pins the row number, so the
# order in which threads finished is irrelevant
# Columns: A=ticker, B=sector, C=forward EPS, D=PEG ratio.
for number, data in results.items():
    for column, value in zip('ABCD', data):
        worksheet.write(f"{column}{number}", value)

workbook.close()

_end = time.time()

print(_end - _start)

Version with Pool

# Standard library first, then third-party (PEP 8 grouping); the
# duplicated `import threading` from the original has been removed.
import json
import re
import threading
import time
from multiprocessing.pool import ThreadPool

import requests
import xlsxwriter
from bs4 import BeautifulSoup

# --- globals ---

url_profile = 'https://finance.yahoo.com/quote/{}/profile?p={}'
url_stats = 'https://finance.yahoo.com/quote/{}/key-statistics?p={}'

headers = {
   "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"
}

# Matches the " -- Data -- " marker in the <script> tag carrying the page
# JSON (escapes restored; the post showed them stripped: s--sDatas--s).
pattern = re.compile(r'\s--\sData\s--\s')

# --- functions ---

def process(number, stock_name):
    """Scrape sector, forward EPS and PEG ratio for one ticker.

    number     -- 1-based row number, used as the key in `results`.
    stock_name -- ticker symbol, e.g. 'AAPL'.

    Stores (stock_name, sector, foward_eps, peg_ratio) in the global
    `results` dict (for the threaded caller) and also returns it (for
    the pool / sequential callers). Fields that cannot be extracted
    become the string "Error".
    """
    # '\n' restored (the post showed a literal 'n').
    print(f'{number} {stock_name}\n', end='', flush=True)

    url = url_profile.format(stock_name, stock_name)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    script_data = soup.find('script', text=pattern).contents[0]
    # Magic offsets from the original post: the JSON apparently starts 2
    # chars before "context"; the last 12 chars are trailing javascript.
    start = script_data.find("context") - 2
    data = json.loads(script_data[start:-12])

    try:
        sector = data['context']['dispatcher']['stores']['QuoteSummaryStore']['assetProfile']['industry']
    except (KeyError, TypeError):
        sector = "Error"

    url = url_stats.format(stock_name, stock_name)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    script_data = soup.find('script', text=pattern).contents[0]
    start = script_data.find("context") - 2
    data = json.loads(script_data[start:-12])

    try:
        peg_ratio = data['context']['dispatcher']['stores']['QuoteSummaryStore']['defaultKeyStatistics']['pegRatio']['fmt']
    except (KeyError, TypeError):
        peg_ratio = "Error"

    try:
        foward_eps = data['context']['dispatcher']['stores']['QuoteSummaryStore']['defaultKeyStatistics']['forwardEps']['fmt']
    except (KeyError, TypeError):
        foward_eps = "Error"

    # BUG FIX: the original tuple was (stock_name, sector, foward_eps,
    # foward_eps, peg_ratio) - foward_eps twice - so the caller's data[3]
    # (column D, labelled peg_ratio) actually held forward EPS.
    result = (stock_name, sector, foward_eps, peg_ratio)

    # return data - for thread (a thread can't return a value directly)
    results[number] = result

    # return data - for pool / normal execution
    return result

# --- main ---

stock = [
    'AAPL', 'GOOG', 'TESL', 'MSFT',
    'AAPL', 'GOOG', 'TESL', 'MSFT',
    'AAPL', 'GOOG', 'TESL', 'MSFT',
    'AAPL', 'GOOG', 'TESL', 'MSFT',
]

_start = time.time()

# `process` also stores into this global dict, even when pooled.
results = {}

workbook = xlsxwriter.Workbook("test.xlsx")
worksheet = workbook.add_worksheet("Stocks")

# At most 10 tickers are fetched concurrently; starmap preserves the
# input order, so row numbers line up with the stock list.
with ThreadPool(processes=10) as pool:
    rows = pool.starmap_async(process, enumerate(stock, 1)).get()
    for number, data in enumerate(rows, 1):
        # Columns: A=ticker, B=sector, C=forward EPS, D=PEG ratio.
        for column, value in zip('ABCD', data):
            worksheet.write(f"{column}{number}", value)

workbook.close()

_end = time.time()

print(_end - _start)

User contributions licensed under: CC BY-SA
3 People found this is helpful
Advertisement