I want to take the last row at or before a specified time from several DataFrames that are sampled at different time intervals. My code is as follows:
import numpy as np
import datetime
import pandas as pd
# Seed NumPy's global generator so every run draws the same random numbers.
np.random.seed(2022)

# pandas frequency aliases, one per sampling interval.  Each alias is used
# both as the `freq` of the generated index and as the key into `datas`.
durations = [
    'T', '5T', '15T', '30T',
    'H', '2H',
    'D', 'W', 'BM',
]

# Maps a frequency alias to its DataFrame; filled by generate_data().
datas = {}

# Timestamp to look up; assigned by selecte_time().
time_selected = None
def generate_data():
    """Fill the module-level ``datas`` dict with one random frame per alias.

    For every alias in ``durations`` a DataFrame is built whose index is
    ``pd.date_range('2018-01-01', '2022-05-02', freq=alias)``.  Each frame
    has two columns: ``duration`` (the alias repeated on every row) and
    ``data`` (``np.random.random`` values scaled by 100, drawn from NumPy's
    global generator).  An existing entry under the same key is replaced.

    Returns:
        None.
    """
    first_day, last_day = '2018-01-01', '2022-05-02'
    for alias in durations:
        frame = pd.DataFrame(
            index=pd.date_range(first_day, last_day, freq=alias)
        )
        frame['duration'] = alias
        frame['data'] = np.random.random(len(frame)) * 100
        # `datas` is only mutated, never rebound, so no `global` is needed.
        datas[alias] = frame
def selecte_time():
    """Pick a random minute and store it in the module-level ``time_selected``.

    The candidates are all whole minutes from 2018-03-01 00:00 through
    2022-05-02 00:00 (both ends included).  One of them is drawn with
    ``np.random.choice``, so the result is reproducible under a fixed seed.

    Returns:
        The chosen timestamp, which is also assigned to ``time_selected``.
    """
    global time_selected
    start_dt = datetime.datetime(2018, 3, 1)
    end_dt = datetime.datetime(2022, 5, 2)
    # 'min' is the minute alias understood by old and new pandas alike; the
    # equivalent 'T' spelling is deprecated since pandas 2.2.
    idx = pd.date_range(start_dt, end_dt, freq='min')
    time_selected = np.random.choice(idx)
    return time_selected
def get_result_df():
    """Return, per interval, the last row stamped at or before ``time_selected``.

    Reads the module-level ``durations``, ``datas`` and ``time_selected``.
    Each frame is filtered with a boolean mask over its whole index and the
    final surviving row is kept, so an ``IndexError`` is raised when a frame
    has no row at or before ``time_selected``.

    Returns:
        A DataFrame with one row per entry of ``durations`` (in that order)
        and the columns ``duration`` and ``data``; each row keeps its own
        timestamp as index label.
    """
    wanted = ['duration', 'data']
    rows = []
    for alias in durations:
        frame = datas[alias]
        not_after = frame.index <= time_selected
        # Mask the frame first, then narrow to the wanted columns and take
        # the last remaining row.
        rows.append(frame[not_after][wanted].iloc[-1])
    return pd.DataFrame(rows)
def main():
    """Generate the sample frames, pick a random time and print the lookup."""
    generate_data()
    selecte_time()
    print(get_result_df())


# Run the demo only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
On my computer, get_result_df() takes 204 ms to run. How can I speed it up?
%timeit get_result_df()
204 ms ± 4.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
I optimized it, and the running time was reduced to 53ms. Is there any room for improvement?
def get_result_df():
    """Return, per interval, the last row stamped at or before ``time_selected``.

    Reads the module-level ``durations``, ``datas`` and ``time_selected``.
    The comparison against ``time_selected`` is done on the index converted
    to a NumPy array; the frame itself is only masked once, to fetch the row
    carrying the timestamp that was found.  An ``IndexError`` is raised when
    a frame has no timestamp at or before ``time_selected``.

    Returns:
        A DataFrame with one row per entry of ``durations`` (in that order)
        and the columns ``duration`` and ``data``; each row keeps its own
        timestamp as index label.
    """
    wanted = ['duration', 'data']
    rows = []
    for alias in durations:
        frame = datas[alias]
        stamps = frame.index.to_numpy()
        # Last timestamp that is not after the requested time.
        last_stamp = stamps[stamps <= time_selected][-1]
        # Fetch the row(s) carrying exactly that timestamp and keep the last.
        matching = frame[frame.index == last_stamp]
        rows.append(matching[wanted].iloc[-1])
    return pd.DataFrame(rows)
%timeit get_result_df()
53.3 ms ± 7.75 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Answers to my question on Code Review SE:
%timeit get_result_df(datas, time_selected)
5.81 ms ± 178 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Advertisement
Answer
My timings are roughly half of yours, but I see the same relative behavior. Using np.argmin is faster still; see below.
In [1]: %timeit get_result_df()
115 ms ± 3.51 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
In [2]: %timeit get_result_df2()
26.2 ms ± 387 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
Using argmin and iloc directly is faster:
def get_result_df3():
    """Return, per interval, the last row stamped at or before ``time_selected``.

    Reads the module-level ``durations``, ``datas`` and ``time_selected``.
    The lookup relies on every frame's index being sorted in ascending
    order (true for indexes built with ``pd.date_range``): the wanted row is
    then the one just before the first timestamp later than
    ``time_selected``.

    Returns:
        A DataFrame with one row per entry of ``durations`` (in that order)
        and the columns ``duration`` and ``data``; each row keeps its own
        timestamp as index label.

    Raises:
        IndexError: if a frame has no timestamp at or before
            ``time_selected``.
    """
    col = ['duration', 'data']
    rows = []
    for duration in durations:
        df = datas[duration]
        dt = df.index.to_numpy()
        on_or_before = dt <= time_selected
        # argmin yields 0 both when every timestamp qualifies and when none
        # does; without this guard the "none" case would silently return
        # the LAST row of the frame instead of failing.
        if on_or_before.size == 0 or not on_or_before[0]:
            raise IndexError(
                f"no {duration!r} data at or before {time_selected}"
            )
        # argmin of a boolean array is the position of the first False, i.e.
        # the first timestamp after `time_selected`; the row before it is the
        # answer.  When all entries are True this gives -1, the last row.
        idx = int(np.argmin(on_or_before)) - 1
        rows.append(df.iloc[idx][col])
    return pd.DataFrame(rows)
In [2]: %timeit get_result_df3()
9.62 ms ± 23.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)