Skip to content
Advertisement

How can I speed up selecting data from several different DataFrames?

I want to take the last row at or before a specified time from several DataFrames that have different time intervals. My code is as follows:

import numpy as np
import datetime

import pandas as pd

# Fix the RNG seed so the generated data and the selected time are reproducible.
np.random.seed(2022)

# Pandas frequency aliases; generate_data() builds one DataFrame per alias.
durations = [
    'T', '5T', '15T', '30T',
    'H', '2H',
    'D', 'W', 'BM',
]
# Maps frequency alias -> DataFrame; filled in by generate_data().
datas = dict()
# Set by selecte_time(); stays None until that function has run.
time_selected = None


def generate_data():
    """Fill the module-level ``datas`` dict with one random DataFrame per alias.

    For every alias in ``durations`` a frame indexed by
    ``pd.date_range('2018-01-01', '2022-05-02', freq=alias)`` is stored under
    ``datas[alias]``. Each frame has a ``'duration'`` column holding the alias
    and a ``'data'`` column of random values scaled by 100.
    """
    global durations, datas
    first_day, last_day = '2018-01-01', '2022-05-02'
    for alias in durations:
        stamps = pd.date_range(first_day, last_day, freq=alias)
        frame = pd.DataFrame(index=stamps)
        frame['duration'] = alias
        # One random draw per row; frames are drawn in ``durations`` order.
        frame['data'] = np.random.random(len(frame)) * 100
        datas[alias] = frame


def selecte_time():
    """Pick a random minute between 2018-03-01 and 2022-05-02 (inclusive).

    The value is drawn with ``np.random.choice`` from a minute-frequency
    ``pd.date_range``, stored in the module-level ``time_selected`` and also
    returned.

    Returns:
        The selected timestamp, as produced by ``np.random.choice``.
    """
    global time_selected
    start_dt = datetime.datetime(2018, 3, 1)
    end_dt = datetime.datetime(2022, 5, 2)
    # 'min' is the minute alias; the older spelling 'T' is deprecated since
    # pandas 2.2, while 'min' is accepted by older and newer versions alike.
    idx = pd.date_range(start_dt, end_dt, freq='min')
    time_selected = np.random.choice(idx)
    return time_selected


def get_result_df():
    """Return, for each frequency, the last row at or before ``time_selected``.

    Reads the module-level ``durations``, ``datas`` and ``time_selected``.
    Each frame is filtered with a boolean mask on its index and the final
    surviving row is kept.

    Returns:
        A DataFrame with one row per entry of ``durations`` (in that order)
        and the columns ``'duration'`` and ``'data'``; each row is labelled
        with the timestamp it was taken from.

    Raises:
        IndexError: if a frame has no row at or before ``time_selected``.
    """
    global durations, datas, time_selected
    wanted = ['duration', 'data']
    latest_rows = []
    for key in durations:
        frame = datas[key]
        not_after = frame.index <= time_selected
        latest_rows.append(frame[not_after][wanted].iloc[-1])
    return pd.DataFrame(latest_rows)


def main():
    """Generate the data, pick a random time and print the looked-up rows."""
    generate_data()
    selecte_time()
    print(get_result_df())


# Run the demo only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

On my computer, get_result_df() takes 204 ms to run. How can I speed it up?

%timeit get_result_df()
204 ms ± 4.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

I optimized it, and the running time was reduced to 53ms. Is there any room for improvement?

def get_result_df():
    """Return, for each frequency, the last row at or before ``time_selected``.

    Reads the module-level ``durations``, ``datas`` and ``time_selected``.
    The latest matching timestamp is found on the raw NumPy view of the
    index first; only then is the frame filtered for that exact timestamp.

    Returns:
        A DataFrame with one row per entry of ``durations`` (in that order)
        and the columns ``'duration'`` and ``'data'``; each row is labelled
        with the timestamp it was taken from.

    Raises:
        IndexError: if a frame has no timestamp at or before
            ``time_selected``.
    """
    global durations, datas, time_selected
    wanted = ['duration', 'data']
    latest_rows = []
    for key in durations:
        frame = datas[key]
        stamps = frame.index.to_numpy()
        # Last timestamp that does not lie after the selected time.
        latest = stamps[stamps <= time_selected][-1]
        exact = frame.index == latest
        latest_rows.append(frame[exact][wanted].iloc[-1])
    return pd.DataFrame(latest_rows)
%timeit get_result_df()
53.3 ms ± 7.75 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

Answers to my question on Code Review SE:

%timeit get_result_df(datas, time_selected)
5.81 ms ± 178 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

Advertisement

Answer

My timings are roughly half of yours, but I see the same behavior. It is faster using NumPy's argmin; see below.

In [1]: %timeit get_result_df()
115 ms ± 3.51 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

In [2]: %timeit get_result_df2()
26.2 ms ± 387 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

Using argmin and iloc directly is faster still:

def get_result_df3():
    """Return, for each frequency, the last row at or before ``time_selected``.

    Reads the module-level ``durations``, ``datas`` and ``time_selected``.
    The row position is located with ``np.argmin`` on the boolean mask
    ``index <= time_selected`` and then fetched directly with ``iloc``.

    Assumes every index is sorted in ascending order, as produced by
    ``pd.date_range``; the argmin trick is only valid in that case.

    Returns:
        A DataFrame with one row per entry of ``durations`` (in that order)
        and the columns ``'duration'`` and ``'data'``; each row is labelled
        with the timestamp it was taken from.

    Raises:
        IndexError: if a frame has no timestamp at or before
            ``time_selected``.
    """
    global durations, datas, time_selected
    col = ['duration', 'data']
    rows = []
    for duration in durations:
        df = datas[duration]
        dt = df.index.to_numpy()
        mask = dt <= time_selected
        # With a sorted index the mask is a run of True followed by a run of
        # False. If even the first entry is False nothing matches; without
        # this guard argmin would return 0 and ``iloc[-1]`` would silently
        # hand back the *last* row instead.
        if mask.size == 0 or not mask[0]:
            raise IndexError(
                f'no row at or before {time_selected!r} '
                f'for duration {duration!r}'
            )
        # argmin gives the position of the first False, so the row just
        # before it is the last match. When every entry is True argmin is 0
        # and the resulting -1 correctly selects the final row.
        idx = np.argmin(mask) - 1
        rows.append(df.iloc[idx][col])
    return pd.DataFrame(rows)

In [2]: %timeit get_result_df3()
9.62 ms ± 23.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

User contributions licensed under: CC BY-SA
9 People found this is helpful
Advertisement