How to make this code not to consume so much RAM memory?

I have these two function and when I run them my kernel dies so freaking quickly. What can I do to prevent it? It happens after appending about 10 files to the dataframe. Unfortunately json files are such big (approx. 150 MB per one, having dozens of them) and I have no idea how to join it together.

import os
import pandas as pd
from pandas.io.json import json_normalize
import json

def filtering_nodes(df):
    id_list = df.index.tolist()
    print("Dropping rows without 4 nodes and 3 members...")
    for x in id_list:
        if len(df['Nodes'][x]) != 4 and len(df['Members'][x]) != 3:
            df = df.drop(x)
    print("Converting to csv...")
    df.to_csv("whole_df.csv", sep='t')
    return df

def merge_JsonFiles(filename):
    result = list()
    cnt = 0
    
    df_all = None
    data_all = None
    
    for f1 in filename:
        print("Appending file: ", f1)
        with open('../../data' + f1, 'r') as infile:
            data_all = json.loads(infile.read())
        if cnt == 0:
            df_all = pd.json_normalize(data_all, record_path =['List2D'], max_level =2 ,sep = "-")
        else:
            df_all = df_all.append(pd.json_normalize(data_all, record_path =['List2D'], max_level =2 ,sep = "-"), ignore_index = True)
        cnt += 1
        
    return df_all

files = os.listdir('../../data')
df_all_test = merge_JsonFiles(files)
df_all_test_drop = filtering_nodes(df_all_test)

JavaScript
​x
 
import os
import pandas as pd
from pandas.io.json import json_normalize
import json
​
def filtering_nodes(df):
    id_list = df.index.tolist()
    print("Dropping rows without 4 nodes and 3 members...")
    for x in id_list:
        if len(df['Nodes'][x]) != 4 and len(df['Members'][x]) != 3:
            df = df.drop(x)
    print("Converting to csv...")
    df.to_csv("whole_df.csv", sep='t')
    return df
​
def merge_JsonFiles(filename):
    result = list()
    cnt = 0
    
    df_all = None
    data_all = None
    
    for f1 in filename:
        print("Appending file: ", f1)
        with open('../../data' + f1, 'r') as infile:
            data_all = json.loads(infile.read())
        if cnt == 0:
            df_all = pd.json_normalize(data_all, record_path =['List2D'], max_level =2 ,sep = "-")
        else:
            df_all = df_all.append(pd.json_normalize(data_all, record_path =['List2D'], max_level =2 ,sep = "-"), ignore_index = True)
        cnt += 1
        
    return df_all
​
files = os.listdir('../../data')
df_all_test = merge_JsonFiles(files)
df_all_test_drop = filtering_nodes(df_all_test)
​

EDIT: Due to @jlandercy answer, I’ve made this:

def merging_to_csv():
    for path in pathlib.Path("../../data/loads_data/Dane/hilti/").glob("*.json"):
        # Open source file one by one:
        with path.open() as handler:
            df = pd.json_normalize(json.load(handler), record_path =['List2D'])
        # Identify rows to drop (boolean indexing):
        q = (df["Nodes"] != 4) & (df["Members"] != 3)
        # Inplace drop (no extra copy in RAM):
        df.drop(q, inplace=True)
        # Append data to disk instead of RAM:
        df.to_csv("output.csv", mode="a", header=False)

merging_to_csv()

JavaScript
 
def merging_to_csv():
    for path in pathlib.Path("../../data/loads_data/Dane/hilti/").glob("*.json"):
        # Open source file one by one:
        with path.open() as handler:
            df = pd.json_normalize(json.load(handler), record_path =['List2D'])
        # Identify rows to drop (boolean indexing):
        q = (df["Nodes"] != 4) & (df["Members"] != 3)
        # Inplace drop (no extra copy in RAM):
        df.drop(q, inplace=True)
        # Append data to disk instead of RAM:
        df.to_csv("output.csv", mode="a", header=False)
​
merging_to_csv()
​

and I have this type of error:

KeyError                                  Traceback (most recent call last)
<ipython-input-55-cf18265ca50e> in <module>
----> 1 merging_to_csv()

<ipython-input-54-698c67461b34> in merging_to_csv()
     51         q = (df["Nodes"] != 4) & (df["Members"] != 3)
     52         # Inplace drop (no extra copy in RAM):
---> 53         df.drop(q, inplace=True)
     54         # Append data to disk instead of RAM:
     55         df.to_csv("output.csv", mode="a", header=False)

/opt/conda/lib/python3.7/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
    309                     stacklevel=stacklevel,
    310                 )
--> 311             return func(*args, **kwargs)
    312 
    313         return wrapper

/opt/conda/lib/python3.7/site-packages/pandas/core/frame.py in drop(self, labels, axis, index, columns, level, inplace, errors)
   4906             level=level,
   4907             inplace=inplace,
-> 4908             errors=errors,
   4909         )
   4910 

/opt/conda/lib/python3.7/site-packages/pandas/core/generic.py in drop(self, labels, axis, index, columns, level, inplace, errors)
   4148         for axis, labels in axes.items():
   4149             if labels is not None:
-> 4150                 obj = obj._drop_axis(labels, axis, level=level, errors=errors)
   4151 
   4152         if inplace:

/opt/conda/lib/python3.7/site-packages/pandas/core/generic.py in _drop_axis(self, labels, axis, level, errors)
   4183                 new_axis = axis.drop(labels, level=level, errors=errors)
   4184             else:
-> 4185                 new_axis = axis.drop(labels, errors=errors)
   4186             result = self.reindex(**{axis_name: new_axis})
   4187 

/opt/conda/lib/python3.7/site-packages/pandas/core/indexes/base.py in drop(self, labels, errors)
   6016         if mask.any():
   6017             if errors != "ignore":
-> 6018                 raise KeyError(f"{labels[mask]} not found in axis")
   6019             indexer = indexer[~mask]
   6020         return self.delete(indexer)

KeyError: '[ True  True  True  True  True  True  True  True  True  True  True  Truen  True  True  True  True  True  True  True  True  True  True  True  Truen  True  True  True  True  True  True  True  True  True  True  True  Truen  True  True  True  True  True  True  True  True  True  True  True  Truen  True  True  True  True  True  True  True  True  True  True  True  Truen  True  True  True  True  True  True  True  True  True  True  True  Truen  True  True  True  True  True  True  True  True  True  True  True  Truen  True  True  True  True  True  True  True  True  True  True  True  Truen  True] not found in axis'

JavaScript
 
KeyError                                  Traceback (most recent call last)
<ipython-input-55-cf18265ca50e> in <module>
----> 1 merging_to_csv()
​
<ipython-input-54-698c67461b34> in merging_to_csv()
     51         q = (df["Nodes"] != 4) & (df["Members"] != 3)
     52         # Inplace drop (no extra copy in RAM):
---> 53         df.drop(q, inplace=True)
     54         # Append data to disk instead of RAM:
     55         df.to_csv("output.csv", mode="a", header=False)
​
/opt/conda/lib/python3.7/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
    309                     stacklevel=stacklevel,
    310                 )
--> 311             return func(*args, **kwargs)
    312 
    313         return wrapper
​
/opt/conda/lib/python3.7/site-packages/pandas/core/frame.py in drop(self, labels, axis, index, columns, level, inplace, errors)
   4906             level=level,
   4907             inplace=inplace,
-> 4908             errors=errors,
   4909         )
   4910 
​
/opt/conda/lib/python3.7/site-packages/pandas/core/generic.py in drop(self, labels, axis, index, columns, level, inplace, errors)
   4148         for axis, labels in axes.items():
   4149             if labels is not None:
-> 4150                 obj = obj._drop_axis(labels, axis, level=level, errors=errors)
   4151 
   4152         if inplace:
​
/opt/conda/lib/python3.7/site-packages/pandas/core/generic.py in _drop_axis(self, labels, axis, level, errors)
   4183                 new_axis = axis.drop(labels, level=level, errors=errors)
   4184             else:
-> 4185                 new_axis = axis.drop(labels, errors=errors)
   4186             result = self.reindex(**{axis_name: new_axis})
   4187 
​
/opt/conda/lib/python3.7/site-packages/pandas/core/indexes/base.py in drop(self, labels, errors)
   6016         if mask.any():
   6017             if errors != "ignore":
-> 6018                 raise KeyError(f"{labels[mask]} not found in axis")
   6019             indexer = indexer[~mask]
   6020         return self.delete(indexer)
​
KeyError: '[ True  True  True  True  True  True  True  True  True  True  True  Truen  True  True  True  True  True  True  True  True  True  True  True  Truen  True  True  True  True  True  True  True  True  True  True  True  Truen  True  True  True  True  True  True  True  True  True  True  True  Truen  True  True  True  True  True  True  True  True  True  True  True  Truen  True  True  True  True  True  True  True  True  True  True  True  Truen  True  True  True  True  True  True  True  True  True  True  True  Truen  True  True  True  True  True  True  True  True  True  True  True  Truen  True] not found in axis'
​

What’s wrong? I’ll upload two smallest json files here: https://drive.google.com/drive/folders/1xlC-kK6NLGr0isdy1Ln2tzGmel45GtPC?usp=sharing

Answer

You are facing multiple issue in your original approach:

Multiple copy of dataframe: df = df.drop(...);
Whole information stored in RAM because of append;
Unnecessary for loop to filter rows, use boolean indexing instead.

Here is baseline snippet to solve your problem based on data sample you provided:

import json
import pathlib
import pandas as pd
    
# Iterate source files:
for path in pathlib.Path(".").glob("result*.json"):
    # Open source file one by one:
    with path.open() as handler:
        # Normalize JSON model:
        df = pd.json_normalize(json.load(handler), record_path =['List2D'], max_level=2, sep="-")
    # Apply len to list fields to identify rows to drop (boolean indexing):
    q = (df["Nodes"].apply(len) != 4) & (df["Members"].apply(len) != 3)
    # Filter and append data to disk instead of RAM:
    df.loc[~q,:].to_csv("output.csv", mode="a", header=False)

JavaScript
 
import json
import pathlib
import pandas as pd
    
# Iterate source files:
for path in pathlib.Path(".").glob("result*.json"):
    # Open source file one by one:
    with path.open() as handler:
        # Normalize JSON model:
        df = pd.json_normalize(json.load(handler), record_path =['List2D'], max_level=2, sep="-")
    # Apply len to list fields to identify rows to drop (boolean indexing):
    q = (df["Nodes"].apply(len) != 4) & (df["Members"].apply(len) != 3)
    # Filter and append data to disk instead of RAM:
    df.loc[~q,:].to_csv("output.csv", mode="a", header=False)
​

It loads file one by one in RAM then append filtered rows to disk not to RAM. Those fixes will drastically reduce RAM usage and should be kept as high as twice the biggest JSON file.

Advertisement

Answer