Considering that I have CSV files which look roughly like this:
# Minimal example of one input file's contents; column names match the
# `fields` list used below ('ColA', 'ColB', ...).
df = pd.DataFrame({'ColA': ['A', 'B', 'C', 'D'],
                   'ColB': [80, 75, 70, 65]})
I am using the following script which was suggested here
import pandas as pd
import glob
import os  # needed for os.path.basename below (missing in the original)

path = r'path/'  # use your path
all_files = glob.glob(path + "/*.csv")
fields = ['ColA', 'ColB', 'ColC']

# Combine every CSV into "New_File.csv", keeping only `fields` and tagging
# each row with a `username` column holding the source file's basename.
# NOTE: the original also had a first loop computing a `skip_row` list that
# was never used; chunked reading with header=0 already handles headers,
# so that dead code is removed here.
# works with this version: '1.3.4'
mode = "w"      # first write creates/truncates the output file...
header = True   # ...and emits the header row once
for filename in all_files:
    with pd.read_csv(
        filename,
        engine="python",
        iterator=True,
        chunksize=10_000,
        usecols=fields,
    ) as reader:
        # Compute the basename once per file instead of re-assigning the
        # loop variable inside the chunk loop (as the original did).
        base = os.path.basename(filename)
        for df in reader:
            df["username"] = base
            df.to_csv("New_File.csv", index=False, mode=mode, header=header)
            mode = "a"      # every later chunk appends...
            header = False  # ...without repeating the header
Most of the files have all three columns, while a few of them do not have ColC. This gives an error (understandably), which is as follows:
ValueError: Usecols do not match columns, columns expected but not found: ['ColC']
How can I put NaN in ColC while keeping the columns list unchanged?
Advertisement
Answer
Here is one alternative that checks the available columns beforehand:
# (...)
import numpy as np  # np.nan was used without this import in the original

for filename in all_files:
    # Read only the header row (nrows=0) to learn which of the requested
    # columns this particular file actually has.
    cols = pd.read_csv(filename, engine='python', nrows=0, header=0).columns
    # Columns we can ask read_csv for...
    fields_ = cols.intersection(fields)
    # ...and the requested columns this file is missing.
    missed = [f for f in fields if f not in cols]
    with pd.read_csv(
        filename,
        engine="python",
        iterator=True,
        chunksize=10_000,
        header=0,
        usecols=fields_,  # the "dynamic" per-file column list
    ) as reader:
        for df in reader:
            # Manually append the missing columns as NaN so every chunk has
            # the same schema. (No `if missed:` guard needed — iterating an
            # empty list is a no-op.)
            for col in missed:
                df[col] = np.nan
            # Restore the requested column order.
            df = df[fields]
            # (proceed...)
            # Don't clobber the loop variable `filename`; just compute the
            # basename where it is used.
            df["username"] = os.path.basename(filename)
            # (...)