I have the following for function:
def calculateEMAs(df,startIndex,endIndex): for index,row in df.iterrows(): for i in range (1,51): if(index-i > 0): df.loc[index,"EMA%d"%i] = abs(df.iloc[index-i]["Trade Close"] - df.iloc[index]["Trade Close"])/2 #replace this with EMA formula print(df)
This for loop takes a long time to calculate the values for the data frame as it has to loop 50 times for each row (it takes approximately 62 seconds)
I tried to use multiprocessor pool from this question. My code looks like this now:
def calculateEMAs(df,startIndex,endIndex): for index,row in df.iterrows(): for i in range (startIndex,endIndex): if(index-i > 0): df.loc[index,"EMA%d"%i] = abs(df.iloc[index-i]["Trade Close"] - df.iloc[index]["Trade Close"])/2 #replace this with EMA formula print(df) def main(): dfClosePrice= getFileDataframe().to_frame() pool = Pool() time0 = time.time() result1 = pool.apply_async(calculateEMAs,[dfClosePrice,1,10]) result2 = pool.apply_async(calculateEMAs,[dfClosePrice,10,20]) result3 = pool.apply_async(calculateEMAs,[dfClosePrice,20,30]) result4 = pool.apply_async(calculateEMAs,[dfClosePrice,30,40]) result5 = pool.apply_async(calculateEMAs,[dfClosePrice,40,51]) answer1 = result1.get() answer2 = result2.get() answer3 = result3.get() answer4 = result4.get() answer5 = result5.get() print(time.time() - time0) print(dfClosePrice)
I run the function asynchronously with different values for the for loop. this takes 19 seconds to complete and I can see the result of each function printed correctly but the final value of dfClosePirce
is a dataframe with only 1 column (Trade Close) and the new columns from each async function will not be added to the dataframe. How can I do it the right way?
Advertisement
Answer
Solution Using Numpy vectorization
Issue
- Line
if(index-i > 0):
should beif(index-i >= 0):
otherwise we miss the difference of 1 - Use ‘Close’ rather than ‘Trade Close’ (doesn’t matter for performance but avoid renaming column after pulling data from web)
Code
import numpy as np import pandas as pd def compute_using_np(df, start_index, end_index): ''' Using numpy to vectorize computation ''' nrows = len(df) ncols = end_index - start_index # container for pairwise differences pair_wise_diff = np.empty((nrows, ncols)) #np.zeros((nrows, ncols), dtype = float) pair_wise_diff.fill(np.nan) # Get values of Trading close column as numpy 1D array values = df['Close'].values # Compute differences for different offsets for offset in range(startIndex, endIndex): # Using numpy to compute vectorized difference (i.e. faster computation) diff = np.abs(values[offset:] - values[:-offset])/2.0 # Update result pair_wise_diff[offset:, offset-startIndex] = diff # Place into DataFrame columns = ["EMA%d"%i for i in range(start_index, end_index)] df_result = pd.DataFrame(data = pair_wise_diff, index = np.arange(nrows), columns = columns) # Add result to df merging on index return df.join(df_result)
Usage
df_result = compute_using_np(df, 1, 51)
Performance
Summary
- Posted Code: 37.9 s ± 143 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
- Numpy Code: 1.56 ms ± 27.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
- Result: 20K times speed up
Test Code
import pandas_datareader as dr import pandas as pd import numpy as np def calculateEMAs(df, start_index, end_index): ''' Posted code changed 1) use Python PEP 8 naming convention, 2) corrected conditional ''' for index,row in df.iterrows(): for i in range (start_index, end_index): if(index-i >= 0): df.loc[index,"EMA%d"%i] = abs(df.iloc[index-i]["Close"] - df.iloc[index]["Close"])/2 #replace this with EMA formula return df def compute_using_np(df, start_index, end_index): ''' Using numpy to vectorie computation ''' nrows = len(df) ncols = end_index - start_index # container for pairwise differences pair_wise_diff = np.empty((nrows, ncols)) #np.zeros((nrows, ncols), dtype = float) pair_wise_diff.fill(np.nan) # Get values of Trading close column as numpy 1D array values = df['Close'].values # Compute differences for different offsets for offset in range(start_index, end_index): # Using numpy to compute vectorized difference (i.e. faster computation) diff = np.abs(values[offset:] - values[:-offset])/2.0 # Update result pair_wise_diff[offset:, offset-start_index] = diff # Place into DataFrame columns = ["EMA%d"%i for i in range(start_index, end_index)] df_result = pd.DataFrame(data = pair_wise_diff, index = np.arange(nrows), columns = columns) # Add result to df merging on index return df.join(df_result) # Get ibm closing stock pricing (777 DataFrame rows) df = dr.data.get_data_yahoo('ibm', start = '2017-09-01', end = '2020-10-02') df.reset_index(level=0, inplace = True) # create index which is 0, 1, 2, ... # Time Original post df1 = df.copy() # Copy data since operation is inplace %timeit calculateEMAs(df1, 1, 51) # Jupyter Notebook Magic method # Time Numpy Version %timeit compute_using_np(df, 1, 51) # Jupyter Notebook Magic method # No need to copy since operation is not inplace