Skip to content
Advertisement

Concat multiple dataframe and manage those that doesn’t exist

I try to concat some dataframe – 30 dataframe of 24h data – that been created automatically with some csv, but sometimes csv doesn’t exist, so the dataframe wasn’t created (df1, fd2, df4,df8,df9,…).

And so I want to create weekly dataframe with 7 concatenated df, but the function is in error when some df is missing. But I want the weekly df created with a empty period if some is missing.

dfweek1 = pd.concat([df1, df2, df3, df4, df5, df6, df7])
dfweek2 = pd.concat([df8, df9, df10, df11, df12, df13, df14])
dfweek3 = pd.concat([df15, df16, df17, df18, df19, df20, df21])
dfweek4 = pd.concat([df22, df23, df24, df25, df26, df27, df28])

When I execute line by line for testing :

dfweek2 = pd.concat([df9, df10, df11, df12, df13, df14, df15])
Traceback (most recent call last):

  File "/tmp/ipykernel_40958/2223069184.py", line 1, in <cell line: 1>
    dfweek2 = pd.concat([df9, df10, df11, df12, df13, df14, df15])

NameError: name 'df14' is not defined

df14 is missing, so dfweek2 is not created.

######################

EDIT : here is all my code for more comprehensive, maybe (probably) I didn’t choose the best way… Ignore all the interpolating things, that for later (maybe).

######################

from datetime import date, timedelta, datetime
import pandas as pd
import argparse
import locale
locale.setlocale(locale.LC_TIME,'')

def get_arguments():

    parser = argparse.ArgumentParser()
    parser.add_argument("--date", help="Start Date in YYYY-MM-DD format", required=False, default=None)
    
    return parser.parse_args()

args = get_arguments()
random_date = args.date

#print(args.date)


startdate=pd.to_datetime(args.date,format="%Y-%m-%d").strftime("%Y-%m-%d")
enddate  = pd.to_datetime(startdate) + pd.DateOffset(months=1)
enddate = pd.to_datetime(enddate,format="%Y-%m-%d").strftime("%Y-%m-%d")

filedate = startdate
file=('consommation_horaire_'+str(filedate)+'.csv')
x = 1
df = {}
#ts = {}
#upsampled = {}
#interpol = {}

#recupération d'un mois de fichiers de 24h
while filedate != enddate :
        try :
            file=('consommation_horaire_'+str(filedate)+'.csv')
            #globals()[f"df{x}"] = pd.read_csv(file, sep=",", on_bad_lines='skip', header=0, parse_dates=["date"], names =["date","kwh"])
            globals()[f"df{x}"] = pd.read_csv(file, sep=",", on_bad_lines='skip', header=0, names =["date","kwh"])
            globals()[f"df{x}"]['date'] = pd.to_datetime(globals()[f"df{x}"]['date'], format='%d/%m/%Y %H:%M')
            globals()[f"df{x}"] = globals()[f"df{x}"].set_index('date')
            #globals()[f"ts{x}"] = globals()[f"df{x}"]['kwh']
            #globals()[f"upsampled{x}"] = globals()[f"ts{x}"].resample('60s')
            #globals()[f"interpol{x}"] = globals()[f"upsampled{x}"].interpolate(method='quadratic', order=2)
            #print(globals()[f"ts{x}"].head(1))
            x += 1
            filedate = pd.to_datetime(filedate,format="%Y-%m-%d") + timedelta(days=1)
            filedate = pd.to_datetime(filedate,format="%Y-%m-%d").strftime("%Y-%m-%d")
        except FileNotFoundError :
            print('fichier ' +file+ ' manquant')
            x += 1
            filedate = pd.to_datetime(filedate,format="%Y-%m-%d") + timedelta(days=1)
            filedate = pd.to_datetime(filedate,format="%Y-%m-%d").strftime("%Y-%m-%d")
        pass

dfweek1 = pd.concat([df1, df2, df3, df4, df5, df6, df7])
dfweek2 = pd.concat([df8, df9, df10, df11, df12, df13, df14])
dfweek3 = pd.concat([df15, df16, df17, df18, df19, df20, df21])
dfweek4 = pd.concat([df22, df23, df24, df25, df26, df27, df28])

Advertisement

Answer

I find something that’s working, even if I think that’s not the best way, here is my code, that a little long :

synthesis : The syntax globals()[f”df{x}”] is pretty useful to create new datarame automatically in loop, “df” is the name of the dataframe and “x” is its number (see below), but you have to create a dictionary first with df = {}.

Note : unlike my first post, in this code I replaced the ‘arg’ def part with a constant value ‘date’, during my development.

#########
date='2021-04-01'

startdate=pd.to_datetime(date,format="%Y-%m-%d").strftime("%Y-%m-%d")
enddate  = pd.to_datetime(startdate) + pd.DateOffset(months=1)
enddate = pd.to_datetime(enddate,format="%Y-%m-%d").strftime("%Y-%m-%d")

#creating dictionaries to generate alldataframe
filedate = startdate
x = 1
df = {}


while filedate != enddate :
    try :
        file=('consommation_horaire_'+str(filedate)+'.csv')
        globals()[f"df{x}"] = pd.read_csv(file, sep=",", on_bad_lines='skip', header=0, names =["date","kwh"])
        globals()[f"df{x}"]['date'] = pd.to_datetime(globals()[f"df{x}"]['date'], format='%d/%m/%Y %H:%M')
        globals()[f"df{x}"] = globals()[f"df{x}"].set_index('date')
        filedate = pd.to_datetime(filedate,format="%Y-%m-%d") + timedelta(days=1)
        filedate = pd.to_datetime(filedate,format="%Y-%m-%d").strftime("%Y-%m-%d")
        x += 1
    except FileNotFoundError :
        print('fichier ' +file+ ' manquant')
        x += 1
        filedate = pd.to_datetime(filedate,format="%Y-%m-%d") + timedelta(days=1)
        filedate = pd.to_datetime(filedate,format="%Y-%m-%d").strftime("%Y-%m-%d")
    pass


#creating an empty dataframe or completing it if doesn't exist or incomplete
df = {}
days=pd.to_datetime(date,format="%Y-%m-%d").days_in_month

for x in range(1,days):
    try:
        if globals()[f"df{x}"].shape[0] < 48 :
            #filling empty lines
            testdate=globals()[f"df{x}"].index[0].strftime("%Y-%m-%d")
            testdate1 = pd.to_datetime(testdate,format="%Y-%m-%d %H:%M:%S")
            testdate2 = pd.to_datetime(testdate,format="%Y-%m-%d %H:%M:%S")+ timedelta(hours=23.5)
            fillindex=pd.date_range(start=testdate1, end=testdate2, freq='30min')
            globals()[f"df{x}"]=globals()[f"df{x}"].reindex(fillindex)
            
       
        else:
            pass
        
    
    except :
        globals()[f"df{x}"] = pd.DataFrame(index=range(48),columns=range(1))
        globals()[f"df{x}"].columns = ['kwh']
        testdate=pd.to_datetime(date,format="%Y-%m-%d %H:%M:%S")+ timedelta(days=x-1)
        testdate1 = pd.to_datetime(testdate,format="%Y-%m-%d %H:%M:%S")
        testdate2 = pd.to_datetime(testdate,format="%Y-%m-%d %H:%M:%S")+ timedelta(hours=23.5)
        fillindex=pd.date_range(start=testdate1, end=testdate2, freq='30min')
        globals()[f"df{x}"].index.names = ['date']
        globals()[f"df{x}"].index = pd.DatetimeIndex(fillindex)
            
        pass
User contributions licensed under: CC BY-SA
4 People found this is helpful
Advertisement