Skip to content
Advertisement

Calculating a for loop with different indexes simultaneosuly

I have the following for function:

def calculateEMAs(df,startIndex,endIndex):
    for index,row in df.iterrows():
        for i in range (1,51):
            if(index-i > 0):
              df.loc[index,"EMA%d"%i] = abs(df.iloc[index-i]["Trade Close"] - df.iloc[index]["Trade Close"])/2 #replace this with EMA formula
    print(df)
    

This for loop takes a long time to calculate the values for the data frame as it has to loop 50 times for each row (it takes approximately 62 seconds)

I tried to use multiprocessor pool from this question. My code looks like this now:

def calculateEMAs(df,startIndex,endIndex):
    for index,row in df.iterrows():
        for i in range (startIndex,endIndex):
            if(index-i > 0):
              df.loc[index,"EMA%d"%i] = abs(df.iloc[index-i]["Trade Close"] - df.iloc[index]["Trade Close"])/2 #replace this with EMA formula
    print(df)
  
def main():
    dfClosePrice= getFileDataframe().to_frame()
    pool = Pool()
    time0 = time.time()
    result1 = pool.apply_async(calculateEMAs,[dfClosePrice,1,10])
    result2 = pool.apply_async(calculateEMAs,[dfClosePrice,10,20])
    result3 = pool.apply_async(calculateEMAs,[dfClosePrice,20,30])
    result4 = pool.apply_async(calculateEMAs,[dfClosePrice,30,40])
    result5 = pool.apply_async(calculateEMAs,[dfClosePrice,40,51])
    answer1 = result1.get()
    answer2 = result2.get()
    answer3 = result3.get()
    answer4 = result4.get()
    answer5 = result5.get()
    print(time.time() - time0)
    print(dfClosePrice)

I run the function asynchronously with different values for the for loop. this takes 19 seconds to complete and I can see the result of each function printed correctly but the final value of dfClosePirce is a dataframe with only 1 column (Trade Close) and the new columns from each async function will not be added to the dataframe. How can I do it the right way?

Advertisement

Answer

Solution Using Numpy vectorization

Issue

  1. Line if(index-i > 0): should be if(index-i >= 0): otherwise we miss the difference of 1
  2. Use ‘Close’ rather than ‘Trade Close’ (doesn’t matter for performance but avoid renaming column after pulling data from web)

Code

import numpy as np
import pandas as pd

def compute_using_np(df, start_index, end_index):
    '''
        Using numpy to vectorize computation
    '''
    nrows = len(df)                         
    ncols = end_index - start_index

    # container for pairwise differences
    pair_wise_diff = np.empty((nrows, ncols))  #np.zeros((nrows, ncols), dtype = float)
    pair_wise_diff.fill(np.nan)
    # Get values of Trading close column as numpy 1D array
    values = df['Close'].values

    # Compute differences for different offsets
    for offset in range(startIndex, endIndex):
        # Using numpy to compute vectorized difference (i.e. faster computation)
        diff = np.abs(values[offset:] - values[:-offset])/2.0
                              
        # Update result
        pair_wise_diff[offset:, offset-startIndex] = diff
                              
    # Place into DataFrame
    columns = ["EMA%d"%i for i in range(start_index, end_index)]
                              
    df_result = pd.DataFrame(data = pair_wise_diff, index = np.arange(nrows), columns = columns)
            
    # Add result to df merging on index
    return df.join(df_result)

Usage

df_result = compute_using_np(df, 1, 51)

Performance

Summary

  • Posted Code: 37.9 s ± 143 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
  • Numpy Code: 1.56 ms ± 27.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
  • Result: 20K times speed up

Test Code

import pandas_datareader as dr
import pandas as pd
import numpy as np

def calculateEMAs(df, start_index, end_index):
    '''
       Posted code changed 1) use Python PEP 8 naming convention, 
                           2) corrected conditional
    '''
    for index,row in df.iterrows():
        for i in range (start_index, end_index):
            if(index-i >= 0):
              df.loc[index,"EMA%d"%i] = abs(df.iloc[index-i]["Close"] - df.iloc[index]["Close"])/2 #replace this with EMA formula
    return df

def compute_using_np(df, start_index, end_index):
    '''
        Using numpy to vectorie computation
    '''
    nrows = len(df)                         
    
    ncols = end_index - start_index

    # container for pairwise differences
    pair_wise_diff = np.empty((nrows, ncols))  #np.zeros((nrows, ncols), dtype = float)
    pair_wise_diff.fill(np.nan)
    # Get values of Trading close column as numpy 1D array
    values = df['Close'].values

    # Compute differences for different offsets
    for offset in range(start_index, end_index):
        # Using numpy to compute vectorized difference (i.e. faster computation)
        diff = np.abs(values[offset:] - values[:-offset])/2.0
                              
        # Update result
        pair_wise_diff[offset:, offset-start_index] = diff
                              
    # Place into DataFrame
    columns = ["EMA%d"%i for i in range(start_index, end_index)]
                              
    df_result = pd.DataFrame(data = pair_wise_diff, index = np.arange(nrows), columns = columns)
            
    # Add result to df merging on index
    return df.join(df_result)

# Get ibm closing stock pricing (777 DataFrame rows)
df = dr.data.get_data_yahoo('ibm', start = '2017-09-01', end = '2020-10-02')
df.reset_index(level=0, inplace = True)   # create index which is 0, 1, 2, ...

# Time Original post
df1 = df.copy()                    # Copy data since operation is inplace
%timeit calculateEMAs(df1, 1, 51)  # Jupyter Notebook Magic method

# Time Numpy Version
%timeit compute_using_np(df, 1, 51)  # Jupyter Notebook Magic method 
                                     # No need to copy since operation is not inplace
User contributions licensed under: CC BY-SA
10 People found this is helpful
Advertisement