Skip to content
Advertisement

slow update of large JSON File

I have a large JSON file that contains serialized json dicts. I am trying to iterate this file and update based on the contents of another dict. The JSON file looks similar to:

 sample_json = {
  "targets": [
    {
      "start": 40,
      "end": 73,
      "item": "trust:team_member",
      "lookup": "the cathedral"
    },
    {
      "start": 40,
      "end": 74,
      "item": "contact",
      "lookup": "some text"
    }
  ]},{
    "targets": [
    {
      "start": 40,
      "end": 73,
      "item": "trust:team_member",
      "lookup": "the cats"
    },
    {
      "start": 40,
      "end": 74,
      "item": "music",
      "lookup": "some other music note"
    }
  ]
}

The function below, iterates each line of the JSON file and creates a “key_value” that’s a concatenation of item and lookup key value pairs.

def new_json(qle_folder, param_values):
    """Rebuild the JSON-lines documents found in *qle_folder*, attaching tags.

    Each file in the folder is expected to contain one JSON document per
    line, shaped like {"targets": [{"item": ..., "lookup": ...}, ...]}.
    For every target, a lookup key is built from its normalized
    (item, lookup) pair and resolved through *param_values*; documents with
    at least one match gain an {'items': {'Tags': {...}}} entry.

    Args:
        qle_folder: Directory whose regular files are read as JSON lines.
        param_values: Mapping whose keys are JSON-serialized dicts of the
            form '{"param_type": ..., "param_value": ...}' and whose values
            are the tag values to attach.

    Returns:
        A list of updated documents, each serialized as a JSON string with
        a trailing newline. Documents with no matching tags are skipped.
    """
    updated_docs = []
    files = [os.path.join(qle_folder, f) for f in os.listdir(qle_folder)
             if os.path.isfile(os.path.join(qle_folder, f))]

    for filename in files:
        with open(filename) as f:
            # Iterate the file lazily instead of loading every line into
            # memory with readlines(); tqdm still shows per-line progress.
            for line in tqdm(f, desc=filename):
                doc = json.loads(line)
                if not doc['targets']:
                    continue

                item_dict = defaultdict(list)
                for target in doc['targets']:
                    item = target['item'].strip().lower()
                    lookup = target['lookup'].strip().lower()
                    # The keys of param_values are JSON-serialized dicts
                    # (double-quoted), so the key must be built with
                    # json.dumps -- str(d) would produce a single-quoted
                    # Python repr that never matches.
                    key_value = json.dumps(
                        {'param_type': item, 'param_value': lookup})
                    try:
                        value = param_values[key_value]
                    except KeyError:
                        continue
                    # Normalize the stored value: strip brackets/quotes
                    # from its string form and lowercase it.
                    item_dict[item].append(str(value).strip("[]'\"").lower())

                if not item_dict:
                    continue
                doc['items'] = {'Tags': dict(item_dict)}
                updated_docs.append(json.dumps(doc) + '\n')

    return updated_docs

The function seems to run really slowly and freezes halfway through.

param_values = {'{"param_type": "trust:team_member", "param_value": "the cathedral"}': ["['test 1', 'test 3', 'test 4']"],
 '{"param_type": "contact", "param_value": "some text"}': ["['test 5', 'test 3', 'test 4']"]}

Advertisement

Answer

There are a lot of strange things in your code, I removed several of them in an attempt to give you something that isn’t slow.

Start with something that separates your logic and lets you avoid storing stuff in memory. This reads the tmp file at the end and replays it over, but you could also just do a file-level rename if you wanted (or keep the new one, etc.)

def update_qle(qle_path, tag_lookup):
    """Rewrite the JSON-lines file at *qle_path* in place.

    Each line is parsed, tagged via *tag_lookup*, and kept only when at
    least one tag matched; the surviving documents are staged in a
    temporary file and then replayed over the original.
    """
    with tempfile.TemporaryFile() as staging:
        with open(qle_path, "rb") as source:
            for raw_line in source:
                doc = json.loads(raw_line)
                if successfully_modified_in_place(doc, tag_lookup):
                    writeLine(staging, doc)
        # Rewind the staging buffer and overwrite the original file with
        # the filtered, updated documents.
        staging.seek(0)
        with open(qle_path, "wb") as destination:
            destination.write(staging.read())

This leaves successfully_modified_in_place and writeLine to be defined. Your original logic has strange behavior but I think it is supposed to look like this

def successfully_modified_in_place(data, tag_lookup):
    """Attach matching tags to *data* under data['items']['Tags'].

    Each target's (item, lookup) pair is normalized (stripped, lowercased)
    and resolved through the two-level *tag_lookup* mapping; targets with
    no match are ignored. Always sets data['items'], even when empty.

    Returns True when at least one tag was found, False otherwise.
    """
    found = {}
    for entry in data["targets"]:
        kind = entry["item"].strip().lower()
        term = entry["lookup"].strip().lower()
        if match := tag_lookup.get(kind, {}).get(term):
            found[kind] = match
    data["items"] = {"Tags": found}
    return bool(found)

def writeLine(fp, data):
    """Serialize *data* as JSON and write it to binary file *fp* as one line.

    Uses a real newline ("\n") as the JSON-lines record terminator -- the
    original text had the literal letter "n", which would glue records
    together and break the line-by-line reader.
    """
    fp.write((json.dumps(data) + "\n").encode())

A testable main function might look like this:

def main():
    """Seed a small JSON-lines file on disk and run update_qle over it."""
    sample_docs = [
        {
            "targets": [
                {"start": 40, "end": 73, "item": "trust:team_member", "lookup": "the cathedral"},
                {"start": 40, "end": 74, "item": "contact", "lookup": "some text"},
            ]
        },
        {
            "targets": [
                {"start": 40, "end": 73, "item": "trust:team_member", "lookup": "the cats"},
                {"start": 40, "end": 74, "item": "music", "lookup": "some other music note"},
            ]
        },
    ]
    path = "./data.jsonLines"
    # Write one serialized document per line.
    with open(path, "wb") as out:
        for doc in sample_docs:
            writeLine(out, doc)
    tag_lookup = {
        "trust:team_member": {"the cathedral": "test 1 test 3 test 4"},
        "contact": {"some text": "test 5 test 3 test 4"},
    }
    update_qle(path, tag_lookup)

which we would expect to generate a file like

{
  "targets": [
    {
      "start": 40,
      "end": 73,
      "item": "trust:team_member",
      "lookup": "the cathedral"
    },
    {
      "start": 40,
      "end": 74,
      "item": "contact",
      "lookup": "some text"
    }
  ],
  "items": {
    "Tags": {
      "trust:team_member": "test 1 test 3 test 4",
      "contact": "test 5 test 3 test 4"
    }
  }
}

since the second element of data has no matching tags, so will be skipped.

User contributions licensed under: CC BY-SA
3 people found this helpful
Advertisement