Suppose I have CSV files that look roughly like this:
Python
x
3
1
# Rough shape of one input CSV, shown as a DataFrame. Column names match the
# `fields` list used by the script; this sample has no ColC.
df = pd.DataFrame({
    'ColA': ['A', 'B', 'C', 'D'],
    'ColB': [80, 75, 70, 65],
})
3
I am using the following script, which was suggested here:
Python
1
37
37
1
import glob
import os

import pandas as pd

path = r'path/'  # use your path
all_files = glob.glob(os.path.join(path, "*.csv"))

# Columns to keep from every input file, in output order.
fields = ['ColA', 'ColB', 'ColC']

# Reported to work with pandas '1.3.4'.

# Combine every input file into one output CSV, streaming chunk by chunk.
mode = "w"  # the first chunk written creates/truncates the output file
header = True  # only the first chunk written emits the header row
for filename in all_files:
    # Computed once per file; the loop variable itself is left untouched.
    username = os.path.basename(filename)
    with pd.read_csv(
        filename,
        engine="python",
        iterator=True,
        chunksize=10_000,
        # NOTE: a fixed column list raises ValueError for any file that
        # lacks one of these columns.
        usecols=fields,
    ) as reader:
        for df in reader:
            # Tag each row with the name of the file it came from.
            df["username"] = username
            df.to_csv("New_File.csv", index=False, mode=mode, header=header)
            # Every later chunk is appended without repeating the header.
            mode = "a"
            header = False
36
37
Most of the files have all three columns, but a few of them do not have ColC. For those files the script (understandably) fails with the following error:
ValueError: Usecols do not match columns, columns expected but not found: ['ColC']
How can I fill ColC with NaN for the files that lack it, while keeping the list of columns (`fields`) unchanged?
Advertisement
Answer
Here is one alternative that checks which columns each file actually has before reading it:
Python
1
30
30
1
# (...)
for filename in all_files:
    # Read only the header row to learn which columns this file provides.
    cols = pd.read_csv(filename, engine='python', nrows=0, header=0).columns
    # Requested columns that are present in this file, in `fields` order.
    fields_ = [col for col in fields if col in cols]

    # Computed once per file instead of rebinding the loop variable per chunk.
    username = os.path.basename(filename)

    with pd.read_csv(
        filename,
        engine="python",
        iterator=True,
        chunksize=10_000,
        header=0,
        usecols=fields_,  # Use the "dynamic" one
    ) as reader:
        for df in reader:
            # reindex adds every column the file lacks (filled with NaN) and
            # puts the columns in the order given by `fields`.
            df = df.reindex(columns=fields)

            # (proceed...)

            df["username"] = username
            # (...)
30