How to fix a memory error while importing a very large CSV file to MongoDB in Python?

Given below is the code for importing a pipe-delimited CSV file into MongoDB.

import csv
import json
from pymongo import MongoClient

url = "mongodb://localhost:27017"
client = MongoClient(url)
db = client.Office
customer = db.Customer
jsonArray = []

with open("Names.txt", "r") as csv_file:
    csv_reader = csv.DictReader(csv_file, dialect='excel', delimiter='|', quoting=csv.QUOTE_NONE)
    for row in csv_reader:
        jsonArray.append(row)  # every row accumulates in memory
    # serializing the whole list in one call is what raises MemoryError
    jsonString = json.dumps(jsonArray, indent=1, separators=(",", ":"))
    jsonfile = json.loads(jsonString)
    customer.insert_many(jsonfile)

Below is the error I get when running the above code.

Traceback (most recent call last):
  File "E:Anaconda ProjectsMongo ProjectsOffice Toolcsvtojson.py", line 16, in <module>
    jsonString = json.dumps(jsonArray, indent=1, separators=(",", ":"))
  File "C:UsersPredatoranaconda3libjson__init__.py", line 234, in dumps
    return cls(
  File "C:UsersPredatoranaconda3libjsonencoder.py", line 201, in encode
    chunks = list(chunks)
MemoryError

If I modify the code by indenting the last three lines so they run inside the for loop, the import never seems to stop: on every pass the entire jsonArray accumulated so far is serialized and inserted again, so MongoDB keeps receiving the same data over and over.

import csv
import json
from pymongo import MongoClient

url = "mongodb://localhost:27017"
client = MongoClient(url)
db = client.Office
customer = db.Customer
jsonArray = []

with open("Names.txt", "r") as csv_file:
    csv_reader = csv.DictReader(csv_file, dialect='excel', delimiter='|', quoting=csv.QUOTE_NONE)
    for row in csv_reader:
        jsonArray.append(row)
        # the whole list built so far is re-serialized and re-inserted
        # on every row, which duplicates all earlier records
        jsonString = json.dumps(jsonArray, indent=1, separators=(",", ":"))
        jsonfile = json.loads(jsonString)
        customer.insert_many(jsonfile)
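
To see why the indented version duplicates data, consider what insert_many receives on each pass. A minimal sketch with placeholder rows (not from the original file):

jsonArray = []
for row in ["row1", "row2", "row3"]:
    jsonArray.append(row)
    # pass 1 inserts [row1], pass 2 inserts [row1, row2],
    # pass 3 inserts [row1, row2, row3]; n rows trigger
    # n*(n+1)/2 inserts in total
    print("insert_many would receive:", jsonArray)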


Answer

The memory error can be avoided by serializing and inserting one record at a time instead of dumping the entire array in a single call.

import csv
import json

from pymongo import MongoClient

url_mongo = "mongodb://localhost:27017"
client = MongoClient(url_mongo)
db = client.Office
customer = db.Customer
jsonArray = []
file_txt = "Text.txt"
rowcount = 0
with open(file_txt, "r") as txt_file:
    csv_reader = csv.DictReader(txt_file, dialect="excel", delimiter="|", quoting=csv.QUOTE_NONE)
    for row in csv_reader:
        rowcount += 1
        jsonArray.append(row)
    # dump and load one record at a time so json never has to
    # serialize the whole array in a single call
    for i in range(rowcount):
        jsonString = json.dumps(jsonArray[i], indent=1, separators=(",", ":"))
        jsonfile = json.loads(jsonString)
        customer.insert_one(jsonfile)
print("Finished")

Thank You All for Your Ideas

User contributions licensed under: CC BY-SA