Considering that I have CSV files which look roughly like this
# Sample input: one of the CSV files, as a DataFrame. Column names match the
# `fields` list used below ('Col1' was inconsistent with ColA/ColB/ColC).
df = pd.DataFrame({'ColA': ['A', 'B', 'C', 'D'], 'ColB': [80, 75, 70, 65]})
I am using the following script, which was suggested here:
# Combine every CSV under `path` into one file (New_File.csv), streaming each
# file in 10k-row chunks to keep memory bounded, and tagging each row with the
# source file's basename in a "username" column.
#
# Fixes vs. the original paste:
#   - `os` was used (os.path.basename) but never imported -> NameError.
#   - A dead preliminary loop computed `skip_row` from `first_one` but the
#     result was never used; removed.
#   - The file loop was duplicated; a single pass suffices. The first file is
#     written with mode="w"/header=True, subsequent chunks append headerless.
#   - The loop variable `filename` was reassigned inside the chunk loop;
#     the basename is now computed once per file into its own variable.
import glob
import os

import pandas as pd

path = r'path/'  # use your path
all_files = glob.glob(path + "/*.csv")
fields = ['ColA', 'ColB', 'ColC']

# works with this version: '1.3.4'
mode = "w"       # first write creates/truncates the output file
header = True    # header row is emitted only once
for filename in all_files:
    with pd.read_csv(
        filename,
        engine="python",
        iterator=True,
        chunksize=10_000,
        usecols=fields,
    ) as reader:
        base = os.path.basename(filename)
        for df in reader:
            df["username"] = base
            df.to_csv("New_File.csv", index=False, mode=mode, header=header)
            mode = "a"      # every subsequent chunk appends...
            header = False  # ...without repeating the header
Most of the files have all three columns, while a few of them do not have ColC. This (understandably) gives an error, which is as follows:
ValueError: Usecols do not match columns, columns expected but not found: ['ColC']
How can I put NaN in ColC while keeping the columns list unchanged?
Advertisement
Answer
Here is one alternative that checks the available columns beforehand:
import numpy as np  # required for np.nan; missing from the original snippet

# (...)
for filename in all_files:
    # Probe the header row only (nrows=0) to see which of the desired
    # columns this particular file actually contains.
    cols = pd.read_csv(filename, engine='python', nrows=0, header=0).columns
    fields_ = cols.intersection(fields)          # columns present in this file
    missed = [c for c in fields if c not in cols]  # columns to back-fill
    with pd.read_csv(
        filename,
        engine="python",
        iterator=True,
        chunksize=10_000,
        header=0,
        usecols=fields_,  # use the "dynamic" column list
    ) as reader:
        for df in reader:
            # Manually append the missing columns, filled with NaN.
            for col in missed:
                df[col] = np.nan
            # Make sure the original column order is kept.
            df = df[fields]
            # (proceed...)
            # NOTE: compute the basename into the new column directly instead
            # of clobbering the loop variable `filename` as the original did.
            df["username"] = os.path.basename(filename)
            # (...)