I have a large time-indexed Pandas DataFrame with time-series data of a couple of devices. The structure of this DataFrame (in code below self._combined_data_frame) looks like this: The DateTimeIndex and device_name are filled for every row, the other columns contain nan values. Sample data is available on Google Drive: Sample data set Then there is a list with reference timestamps

Add additional timestamp to Pandas DataFrame items based on item timestamp/index

I have a large time-indexed Pandas DataFrame with time-series data of a couple of devices. The structure of this DataFrame (in code below self._combined_data_frame) looks like this:

DateTimeIndex|device_name|col1|...|colN

JavaScript
​x
 
DateTimeIndex|device_name|col1|...|colN
​

The DateTimeIndex and device_name are filled for every row, the other columns contain nan values. Sample data is available on Google Drive:

Sample data set

Then there is a list with reference timestamps from another source (in code below self._lastinjection_items). What I would like to do is to add to each item in the DataFrame a reference timestamp such that to each timestamped item the “next smaller” reference timestamp is added. Pseudocode:

DataFrame['reference_timestamp'] = (reference_timestamps <= DataFrame.timestamp)[0]

JavaScript
 
DataFrame['reference_timestamp'] = (reference_timestamps <= DataFrame.timestamp)[0]
​

Currently I tried to use the DataFrame.apply(lambda item: ...) function, but I can’t even figure out how to access the timestamp of the individual items.

What would be the most Pythonic way to realize what I want?

Current code looks like this:

from cmath import inf
import numpy as np
import pandas as pd
from pandas import DataFrame

class IonSourceDataProcessor:

    _combined_data_frame: DataFrame = DataFrame()
    _input_file_name: str
    _lastinjection_items: []

    def __init__(self, input_file_name):
        self._input_file_name = input_file_name

    def build_injection_marker(self, item):
        ... here I try to implement code so I get a result frame with the structure:
        DateTimeIndex|device_name|col1|...|colN|reference_timestamp

    def add_lastinjection_markers(self):
        ...work in progress, do not take this serious...
        for item in zip(self._combined_data_frame.values, self._combined_data_frame.index, self._combined_data_frame['nomen']):
            self.build_injection_marker(item)


    def get_data_grouped(self, groupby: str, axis: str or int = 0):
        return self._combined_data_frame.groupby(groupby, axis=axis)

    def integrate_trafo_contacq_rawdata(self, device_name: str, groupby_column: str, drop_columns: list):
        print('opening:', device_name)
        device_dataset: DataFrame = DataFrame(pd.read_hdf(self._input_file_name, key=device_name)).groupby(groupby_column).sum()
        device_dataset = device_dataset.drop(columns=drop_columns)
        for column in device_dataset.columns:
            device_dataset = device_dataset.rename(columns={column: '_'.join([column, 'sum'])})

        self._lastinjection_items = pd.to_datetime(device_dataset.index.values)

        device_dataset['device_name'] = '_'.join([device_name.split('_')[0], 'integral'])
        device_dataset.index = pd.to_datetime(device_dataset.index)

        self._combined_data_frame = pd.concat([self._combined_data_frame, device_dataset])

        return self

    def read_device_dataset(self, device_name: str):
        print('opening:', device_name)
        device_dataset: DataFrame = DataFrame(pd.read_hdf(self._input_file_name, key=device_name))
        device_dataset['device_name'] = device_name
        # data_set = data_set.reset_index(drop=True)
        # data_set['time'] = pd.to_datetime(data_set['time'])
        # data_set = data_set.set_index('time')

        if ('current' in device_dataset.columns) and ('voltage' in device_dataset.columns):
            device_dataset['power'] = (device_dataset['voltage'] * device_dataset['current'])
            device_dataset['resistance'] = (device_dataset['voltage'] / device_dataset['current'])
            device_dataset['resistance'] = device_dataset['resistance'].replace([inf, -inf, np.nan], 0.0)
        if 'currentset' in device_dataset.columns:
            device_dataset['currentset_gradient'] = (np.gradient(device_dataset['currentset']))

        self._combined_data_frame = pd.concat([self._combined_data_frame, device_dataset])

        return self


if __name__ == '__main__':

    processor = IonSourceDataProcessor('test_data.h5')

    processor = processor.integrate_trafo_contacq_rawdata('YR11DX1S1_ContAcq', 'lastinjection', ['lastextraction'])

    device_names = ['YRT1IN1E']
    for device_name in device_names:
        processor = processor.read_device_dataset(device_name)

    processor.add_lastinjection_markers()
    
    ...

JavaScript
 
from cmath import inf
import numpy as np
import pandas as pd
from pandas import DataFrame
​
class IonSourceDataProcessor:
​
    _combined_data_frame: DataFrame = DataFrame()
    _input_file_name: str
    _lastinjection_items: []
​
    def __init__(self, input_file_name):
        self._input_file_name = input_file_name
​
    def build_injection_marker(self, item):
        ... here I try to implement code so I get a result frame with the structure:
        DateTimeIndex|device_name|col1|...|colN|reference_timestamp
​
    def add_lastinjection_markers(self):
        ...work in progress, do not take this serious...
        for item in zip(self._combined_data_frame.values, self._combined_data_frame.index, self._combined_data_frame['nomen']):
            self.build_injection_marker(item)
​
​
    def get_data_grouped(self, groupby: str, axis: str or int = 0):
        return self._combined_data_frame.groupby(groupby, axis=axis)
​
    def integrate_trafo_contacq_rawdata(self, device_name: str, groupby_column: str, drop_columns: list):
        print('opening:', device_name)
        device_dataset: DataFrame = DataFrame(pd.read_hdf(self._input_file_name, key=device_name)).groupby(groupby_column).sum()
        device_dataset = device_dataset.drop(columns=drop_columns)
        for column in device_dataset.columns:
            device_dataset = device_dataset.rename(columns={column: '_'.join([column, 'sum'])})
​
        self._lastinjection_items = pd.to_datetime(device_dataset.index.values)
​
        device_dataset['device_name'] = '_'.join([device_name.split('_')[0], 'integral'])
        device_dataset.index = pd.to_datetime(device_dataset.index)
​
        self._combined_data_frame = pd.concat([self._combined_data_frame, device_dataset])
​
        return self
​
    def read_device_dataset(self, device_name: str):
        print('opening:', device_name)
        device_dataset: DataFrame = DataFrame(pd.read_hdf(self._input_file_name, key=device_name))
        device_dataset['device_name'] = device_name
        # data_set = data_set.reset_index(drop=True)
        # data_set['time'] = pd.to_datetime(data_set['time'])
        # data_set = data_set.set_index('time')
​
        if ('current' in device_dataset.columns) and ('voltage' in device_dataset.columns):
            device_dataset['power'] = (device_dataset['voltage'] * device_dataset['current'])
            device_dataset['resistance'] = (device_dataset['voltage'] / device_dataset['current'])
            device_dataset['resistance'] = device_dataset['resistance'].replace([inf, -inf, np.nan], 0.0)
        if 'currentset' in device_dataset.columns:
            device_dataset['currentset_gradient'] = (np.gradient(device_dataset['currentset']))
​
        self._combined_data_frame = pd.concat([self._combined_data_frame, device_dataset])
​
        return self
​
​
if __name__ == '__main__':
​
    processor = IonSourceDataProcessor('test_data.h5')
​
    processor = processor.integrate_trafo_contacq_rawdata('YR11DX1S1_ContAcq', 'lastinjection', ['lastextraction'])
​
    device_names = ['YRT1IN1E']
    for device_name in device_names:
        processor = processor.read_device_dataset(device_name)
​
    processor.add_lastinjection_markers()
    
    ...
​

EDIT:

print (DataFrame)
              device_name  col1  col2 ...
DateTimeIndex                 
2016-11-30              A  NaN   0.1  ...
2017-04-30              A  NaN   0.2  ...  
2018-01-31              A  NaN   0.1  ...
2019-09-30              A  NaN   0.3  ...
2020-04-30              A  NaN   0.4  ...
2020-11-30              A  NaN   0.2  ...

reference_timestamps = ['2017-12-31','2019-01-31','2020-12-31']

JavaScript
 
print (DataFrame)
              device_name  col1  col2 ...
DateTimeIndex                 
2016-11-30              A  NaN   0.1  ...
2017-04-30              A  NaN   0.2  ...  
2018-01-31              A  NaN   0.1  ...
2019-09-30              A  NaN   0.3  ...
2020-04-30              A  NaN   0.4  ...
2020-11-30              A  NaN   0.2  ...
​
reference_timestamps = ['2017-12-31','2019-01-31','2020-12-31']
​

Expected result:

print(DataFrame)

              device_name  col1  col2 ...  ref_timestamp
DateTimeIndex
2016-11-30              A  NaN   0.1  ...  '2017-12-31'
2017-04-30              A  NaN   0.2  ...  '2017-12-31'
2018-01-31              A  NaN   0.1  ...  '2019-01-31'
2019-09-30              A  NaN   0.3  ...  '2020-12-31'
2020-04-30              A  NaN   0.4  ...  '2020-12-31'
2020-11-30              A  NaN   0.2  ...  '2020-12-31'

JavaScript
 
print(DataFrame)
​
              device_name  col1  col2 ...  ref_timestamp
DateTimeIndex
2016-11-30              A  NaN   0.1  ...  '2017-12-31'
2017-04-30              A  NaN   0.2  ...  '2017-12-31'
2018-01-31              A  NaN   0.1  ...  '2019-01-31'
2019-09-30              A  NaN   0.3  ...  '2020-12-31'
2020-04-30              A  NaN   0.4  ...  '2020-12-31'
2020-11-30              A  NaN   0.2  ...  '2020-12-31'
​

In real life:

Answer

IIUC use merge_asof:

reference_timestamps = ['2017-12-31','2019-01-31','2020-12-31']

df1 = pd.DataFrame({'reference_timestamp': pd.to_datetime(reference_timestamps)})

DataFrame = pd.merge_asof(DataFrame.reset_index(), 
                          df1, 
                          left_on='DateTimeIndex', 
                          right_on='reference_timestamp',
                          direction='forward')
print (DataFrame)
  DateTimeIndex device_name  col reference_timestamp
0    2016-11-30           A  NaN          2017-12-31
1    2017-04-30           A  NaN          2017-12-31
2    2018-01-31           A  NaN          2019-01-31
3    2019-09-30           A  NaN          2020-12-31
4    2020-04-30           A  NaN          2020-12-31
5    2020-11-30           A  NaN          2020-12-31

JavaScript
 
reference_timestamps = ['2017-12-31','2019-01-31','2020-12-31']
​
df1 = pd.DataFrame({'reference_timestamp': pd.to_datetime(reference_timestamps)})
​
DataFrame = pd.merge_asof(DataFrame.reset_index(), 
                          df1, 
                          left_on='DateTimeIndex', 
                          right_on='reference_timestamp',
                          direction='forward')
print (DataFrame)
  DateTimeIndex device_name  col reference_timestamp
0    2016-11-30           A  NaN          2017-12-31
1    2017-04-30           A  NaN          2017-12-31
2    2018-01-31           A  NaN          2019-01-31
3    2019-09-30           A  NaN          2020-12-31
4    2020-04-30           A  NaN          2020-12-31
5    2020-11-30           A  NaN          2020-12-31
​

Advertisement

Answer