I have a large time-indexed Pandas DataFrame
with time-series data of a couple of devices. The structure of this DataFrame
(in code below self._combined_data_frame
) looks like this:
DateTimeIndex|device_name|col1|...|colN
The DateTimeIndex
and device_name
are filled for every row, the other columns contain nan
values. Sample data is available on Google Drive:
Then there is a list with reference timestamps from another source (in code below self._lastinjection_items
). What I would like to do is to add to each item in the DataFrame
a reference timestamp such that to each timestamped item the “next smaller” reference timestamp is added. Pseudocode:
DataFrame['reference_timestamp'] = (reference_timestamps <= DataFrame.timestamp)[0]
Currently I tried to use the DataFrame.apply(lambda item: ...)
function, but I can’t even figure out how to access the timestamp of the individual items.
What would be the most Pythonic way to realize what I want?
Current code looks like this:
from cmath import inf import numpy as np import pandas as pd from pandas import DataFrame class IonSourceDataProcessor: _combined_data_frame: DataFrame = DataFrame() _input_file_name: str _lastinjection_items: [] def __init__(self, input_file_name): self._input_file_name = input_file_name def build_injection_marker(self, item): ... here I try to implement code so I get a result frame with the structure: DateTimeIndex|device_name|col1|...|colN|reference_timestamp def add_lastinjection_markers(self): ...work in progress, do not take this serious... for item in zip(self._combined_data_frame.values, self._combined_data_frame.index, self._combined_data_frame['nomen']): self.build_injection_marker(item) def get_data_grouped(self, groupby: str, axis: str or int = 0): return self._combined_data_frame.groupby(groupby, axis=axis) def integrate_trafo_contacq_rawdata(self, device_name: str, groupby_column: str, drop_columns: list): print('opening:', device_name) device_dataset: DataFrame = DataFrame(pd.read_hdf(self._input_file_name, key=device_name)).groupby(groupby_column).sum() device_dataset = device_dataset.drop(columns=drop_columns) for column in device_dataset.columns: device_dataset = device_dataset.rename(columns={column: '_'.join([column, 'sum'])}) self._lastinjection_items = pd.to_datetime(device_dataset.index.values) device_dataset['device_name'] = '_'.join([device_name.split('_')[0], 'integral']) device_dataset.index = pd.to_datetime(device_dataset.index) self._combined_data_frame = pd.concat([self._combined_data_frame, device_dataset]) return self def read_device_dataset(self, device_name: str): print('opening:', device_name) device_dataset: DataFrame = DataFrame(pd.read_hdf(self._input_file_name, key=device_name)) device_dataset['device_name'] = device_name # data_set = data_set.reset_index(drop=True) # data_set['time'] = pd.to_datetime(data_set['time']) # data_set = data_set.set_index('time') if ('current' in device_dataset.columns) and ('voltage' in device_dataset.columns): device_dataset['power'] = (device_dataset['voltage'] * device_dataset['current']) device_dataset['resistance'] = (device_dataset['voltage'] / device_dataset['current']) device_dataset['resistance'] = device_dataset['resistance'].replace([inf, -inf, np.nan], 0.0) if 'currentset' in device_dataset.columns: device_dataset['currentset_gradient'] = (np.gradient(device_dataset['currentset'])) self._combined_data_frame = pd.concat([self._combined_data_frame, device_dataset]) return self if __name__ == '__main__': processor = IonSourceDataProcessor('test_data.h5') processor = processor.integrate_trafo_contacq_rawdata('YR11DX1S1_ContAcq', 'lastinjection', ['lastextraction']) device_names = ['YRT1IN1E'] for device_name in device_names: processor = processor.read_device_dataset(device_name) processor.add_lastinjection_markers() ...
EDIT:
print (DataFrame) device_name col1 col2 ... DateTimeIndex 2016-11-30 A NaN 0.1 ... 2017-04-30 A NaN 0.2 ... 2018-01-31 A NaN 0.1 ... 2019-09-30 A NaN 0.3 ... 2020-04-30 A NaN 0.4 ... 2020-11-30 A NaN 0.2 ... reference_timestamps = ['2017-12-31','2019-01-31','2020-12-31']
Expected result:
print(DataFrame) device_name col1 col2 ... ref_timestamp DateTimeIndex 2016-11-30 A NaN 0.1 ... '2017-12-31' 2017-04-30 A NaN 0.2 ... '2017-12-31' 2018-01-31 A NaN 0.1 ... '2019-01-31' 2019-09-30 A NaN 0.3 ... '2020-12-31' 2020-04-30 A NaN 0.4 ... '2020-12-31' 2020-11-30 A NaN 0.2 ... '2020-12-31'
Advertisement
Answer
IIUC use merge_asof
:
reference_timestamps = ['2017-12-31','2019-01-31','2020-12-31'] df1 = pd.DataFrame({'reference_timestamp': pd.to_datetime(reference_timestamps)}) DataFrame = pd.merge_asof(DataFrame.reset_index(), df1, left_on='DateTimeIndex', right_on='reference_timestamp', direction='forward') print (DataFrame) DateTimeIndex device_name col reference_timestamp 0 2016-11-30 A NaN 2017-12-31 1 2017-04-30 A NaN 2017-12-31 2 2018-01-31 A NaN 2019-01-31 3 2019-09-30 A NaN 2020-12-31 4 2020-04-30 A NaN 2020-12-31 5 2020-11-30 A NaN 2020-12-31