I have the following for function:
JavaScript
x
8
1
def calculateEMAs(df,startIndex,endIndex):
2
for index,row in df.iterrows():
3
for i in range (1,51):
4
if(index-i > 0):
5
df.loc[index,"EMA%d"%i] = abs(df.iloc[index-i]["Trade Close"] - df.iloc[index]["Trade Close"])/2 #replace this with EMA formula
6
print(df)
7
8
This for loop takes a long time to calculate the values for the data frame as it has to loop 50 times for each row (it takes approximately 62 seconds)
I tried to use multiprocessor pool from this question. My code looks like this now:
JavaScript
1
24
24
1
def calculateEMAs(df,startIndex,endIndex):
2
for index,row in df.iterrows():
3
for i in range (startIndex,endIndex):
4
if(index-i > 0):
5
df.loc[index,"EMA%d"%i] = abs(df.iloc[index-i]["Trade Close"] - df.iloc[index]["Trade Close"])/2 #replace this with EMA formula
6
print(df)
7
8
def main():
9
dfClosePrice= getFileDataframe().to_frame()
10
pool = Pool()
11
time0 = time.time()
12
result1 = pool.apply_async(calculateEMAs,[dfClosePrice,1,10])
13
result2 = pool.apply_async(calculateEMAs,[dfClosePrice,10,20])
14
result3 = pool.apply_async(calculateEMAs,[dfClosePrice,20,30])
15
result4 = pool.apply_async(calculateEMAs,[dfClosePrice,30,40])
16
result5 = pool.apply_async(calculateEMAs,[dfClosePrice,40,51])
17
answer1 = result1.get()
18
answer2 = result2.get()
19
answer3 = result3.get()
20
answer4 = result4.get()
21
answer5 = result5.get()
22
print(time.time() - time0)
23
print(dfClosePrice)
24
I run the function asynchronously with different values for the for loop. this takes 19 seconds to complete and I can see the result of each function printed correctly but the final value of dfClosePirce
is a dataframe with only 1 column (Trade Close) and the new columns from each async function will not be added to the dataframe. How can I do it the right way?
Advertisement
Answer
Solution Using Numpy vectorization
Issue
- Line
if(index-i > 0):
should beif(index-i >= 0):
otherwise we miss the difference of 1 - Use ‘Close’ rather than ‘Trade Close’ (doesn’t matter for performance but avoid renaming column after pulling data from web)
Code
JavaScript
1
32
32
1
import numpy as np
2
import pandas as pd
3
4
def compute_using_np(df, start_index, end_index):
5
'''
6
Using numpy to vectorize computation
7
'''
8
nrows = len(df)
9
ncols = end_index - start_index
10
11
# container for pairwise differences
12
pair_wise_diff = np.empty((nrows, ncols)) #np.zeros((nrows, ncols), dtype = float)
13
pair_wise_diff.fill(np.nan)
14
# Get values of Trading close column as numpy 1D array
15
values = df['Close'].values
16
17
# Compute differences for different offsets
18
for offset in range(startIndex, endIndex):
19
# Using numpy to compute vectorized difference (i.e. faster computation)
20
diff = np.abs(values[offset:] - values[:-offset])/2.0
21
22
# Update result
23
pair_wise_diff[offset:, offset-startIndex] = diff
24
25
# Place into DataFrame
26
columns = ["EMA%d"%i for i in range(start_index, end_index)]
27
28
df_result = pd.DataFrame(data = pair_wise_diff, index = np.arange(nrows), columns = columns)
29
30
# Add result to df merging on index
31
return df.join(df_result)
32
Usage
JavaScript
1
2
1
df_result = compute_using_np(df, 1, 51)
2
Performance
Summary
- Posted Code: 37.9 s ± 143 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
- Numpy Code: 1.56 ms ± 27.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
- Result: 20K times speed up
Test Code
JavaScript
1
57
57
1
import pandas_datareader as dr
2
import pandas as pd
3
import numpy as np
4
5
def calculateEMAs(df, start_index, end_index):
6
'''
7
Posted code changed 1) use Python PEP 8 naming convention,
8
2) corrected conditional
9
'''
10
for index,row in df.iterrows():
11
for i in range (start_index, end_index):
12
if(index-i >= 0):
13
df.loc[index,"EMA%d"%i] = abs(df.iloc[index-i]["Close"] - df.iloc[index]["Close"])/2 #replace this with EMA formula
14
return df
15
16
def compute_using_np(df, start_index, end_index):
17
'''
18
Using numpy to vectorie computation
19
'''
20
nrows = len(df)
21
22
ncols = end_index - start_index
23
24
# container for pairwise differences
25
pair_wise_diff = np.empty((nrows, ncols)) #np.zeros((nrows, ncols), dtype = float)
26
pair_wise_diff.fill(np.nan)
27
# Get values of Trading close column as numpy 1D array
28
values = df['Close'].values
29
30
# Compute differences for different offsets
31
for offset in range(start_index, end_index):
32
# Using numpy to compute vectorized difference (i.e. faster computation)
33
diff = np.abs(values[offset:] - values[:-offset])/2.0
34
35
# Update result
36
pair_wise_diff[offset:, offset-start_index] = diff
37
38
# Place into DataFrame
39
columns = ["EMA%d"%i for i in range(start_index, end_index)]
40
41
df_result = pd.DataFrame(data = pair_wise_diff, index = np.arange(nrows), columns = columns)
42
43
# Add result to df merging on index
44
return df.join(df_result)
45
46
# Get ibm closing stock pricing (777 DataFrame rows)
47
df = dr.data.get_data_yahoo('ibm', start = '2017-09-01', end = '2020-10-02')
48
df.reset_index(level=0, inplace = True) # create index which is 0, 1, 2, ...
49
50
# Time Original post
51
df1 = df.copy() # Copy data since operation is inplace
52
%timeit calculateEMAs(df1, 1, 51) # Jupyter Notebook Magic method
53
54
# Time Numpy Version
55
%timeit compute_using_np(df, 1, 51) # Jupyter Notebook Magic method
56
# No need to copy since operation is not inplace
57