Considering that I have CSV files which look roughly like this:
# Minimal example of one input file's contents; column names match the
# `fields` list used below ('ColA', 'ColB', ...).
df = pd.DataFrame({'ColA': ['A', 'B', 'C', 'D'],
                   'ColB': [80, 75, 70, 65]})
I am using the following script which was suggested here
import pandas as pd
import glob
import os  # needed for os.path.basename below (missing in the original)

path = r'path/'  # use your path
all_files = glob.glob(path + "/*.csv")
fields = ['ColA', 'ColB', 'ColC']

# Combine every CSV into "New_File.csv", keeping only `fields` and tagging
# each row with a `username` column holding the source file's basename.
# NOTE: the original also had a first loop computing a `skip_row` list that
# was never used; chunked reading with header=0 already handles headers,
# so that dead code is removed here.
# works with this version: '1.3.4'
mode = "w"      # first write creates/truncates the output file...
header = True   # ...and emits the header row once
for filename in all_files:
    with pd.read_csv(
        filename,
        engine="python",
        iterator=True,
        chunksize=10_000,
        usecols=fields,
    ) as reader:
        # Compute the basename once per file instead of re-assigning the
        # loop variable inside the chunk loop (as the original did).
        base = os.path.basename(filename)
        for df in reader:
            df["username"] = base
            df.to_csv("New_File.csv", index=False, mode=mode, header=header)
            mode = "a"      # every later chunk appends...
            header = False  # ...without repeating the header
Most of the files have all three columns, while a few of them do not have ColC. This gives an error (understandably), which is as follows:
ValueError: Usecols do not match columns, columns expected but not found: ['ColC']
How can I put NaN in ColC while keeping the columns list unchanged?
Advertisement
Answer
Here is one alternative that checks the available columns beforehand:
# (...)
import numpy as np  # np.nan was used without this import in the original

for filename in all_files:
    # Read only the header row (nrows=0) to learn which of the requested
    # columns this particular file actually has.
    cols = pd.read_csv(filename, engine='python', nrows=0, header=0).columns
    # Columns we can ask read_csv for...
    fields_ = cols.intersection(fields)
    # ...and the requested columns this file is missing.
    missed = [f for f in fields if f not in cols]
    with pd.read_csv(
        filename,
        engine="python",
        iterator=True,
        chunksize=10_000,
        header=0,
        usecols=fields_,  # the "dynamic" per-file column list
    ) as reader:
        for df in reader:
            # Manually append the missing columns as NaN so every chunk has
            # the same schema. (No `if missed:` guard needed — iterating an
            # empty list is a no-op.)
            for col in missed:
                df[col] = np.nan
            # Restore the requested column order.
            df = df[fields]
            # (proceed...)
            # Don't clobber the loop variable `filename`; just compute the
            # basename where it is used.
            df["username"] = os.path.basename(filename)
            # (...)