I have a large JSON file that contains serialized json dicts. I am trying to iterate this file and update based on the contents of another dict. The JSON file looks similar to:
# Two sample documents. The original wrote them as two comma-separated dict
# literals, which Python evaluates as a tuple; the parentheses make that explicit.
sample_json = (
    {
        "targets": [
            {
                "start": 40,
                "end": 73,
                "item": "trust:team_member",
                "lookup": "the cathedral",
            },
            {
                "start": 40,
                "end": 74,
                "item": "contact",
                "lookup": "some text",
            },
        ]
    },
    {
        "targets": [
            {
                "start": 40,
                "end": 73,
                "item": "trust:team_member",
                "lookup": "the cats",
            },
            {
                "start": 40,
                "end": 74,
                "item": "music",
                "lookup": "some other music note",
            },
        ]
    },
)
The function below iterates over each line of the JSON file and builds a "key_value" string from each target's item and lookup values.
def new_json(qle_folder, param_values):
    """Tag every JSON-lines document found in the files of *qle_folder*.

    Each regular file directly inside *qle_folder* is read line by line; every
    line is parsed as one JSON document with a ``"targets"`` list. For each
    target, a lookup key is built from its normalized ``item`` and ``lookup``
    values and looked up in *param_values*.

    Args:
        qle_folder: Directory whose regular files are processed.
        param_values: Mapping whose keys are JSON strings of the form
            ``{"param_type": <item>, "param_value": <lookup>}`` (as produced
            by ``json.dumps`` with default separators).

    Returns:
        A list of newline-terminated JSON strings, one per document that had
        at least one matching target. Each such document gains
        ``doc["items"]["Tags"]``, mapping item -> list of matched values.
        Documents without any match are omitted.

    Raises:
        KeyError: If a document has no ``"targets"`` key, or a target lacks
            ``"item"`` / ``"lookup"``.
        json.JSONDecodeError: If a line is not valid JSON.
    """
    tagged_lines = []
    filenames = [
        os.path.join(qle_folder, name)
        for name in os.listdir(qle_folder)
        if os.path.isfile(os.path.join(qle_folder, name))
    ]
    for filename in filenames:
        with open(filename) as f:
            # Read eagerly so the progress bar can show a total.
            lines = f.readlines()
        for line in tqdm(lines, total=len(lines), desc=filename):
            doc = json.loads(line)
            item_dict = defaultdict(list)
            for target in doc['targets']:
                item = target['item'].strip().lower()
                lookup = target['lookup'].strip().lower()
                # Serialize the dict itself (not its Python repr) so the key
                # uses double quotes and matches the JSON-string keys of
                # param_values.
                key_value = json.dumps(
                    {'param_type': item, 'param_value': lookup}
                )
                try:
                    value = param_values[key_value]
                except KeyError:
                    # No tags registered for this (item, lookup) pair.
                    continue
                # Flatten the stored value to text and trim the surrounding
                # list brackets and quote characters.
                value = str(value).strip("[]\"'").lower()
                item_dict[item].append(value)
            if not item_dict:
                continue
            doc['items'] = {'Tags': dict(item_dict)}
            tagged_lines.append(json.dumps(doc) + '\n')
    return tagged_lines
The function seems to run really slowly and freezes halfway through.
# Lookup table: each key is the JSON string of a {"param_type", "param_value"}
# pair; each value is a one-element list holding the stringified tag list.
param_values = {
    '{"param_type": "trust:team_member", "param_value": "the cathedral"}': [
        "['test 1', 'test 3', 'test 4']",
    ],
    '{"param_type": "contact", "param_value": "some text"}': [
        "[''test 5', 'test 3', 'test 4'']",
    ],
}
Advertisement
Answer
There are a lot of strange things in your code, I removed several of them in an attempt to give you something that isn’t slow.
Start with something that separates your logic and lets you avoid storing stuff in memory. This reads the tmp file at the end and replays it over, but you could also just do a file-level rename if you wanted (or keep the new one, etc.)
def update_qle(qle_path, tag_lookup):
    """Rewrite the JSON-lines file at *qle_path*, keeping only tagged documents.

    The file is streamed one line at a time. Each line is parsed as JSON and
    passed to ``successfully_modified_in_place``; documents for which it
    returns a truthy value are written to a temporary file with ``writeLine``.
    Afterwards the original file is truncated and the temporary file's
    contents are copied back over it.

    Args:
        qle_path: Path of the JSON-lines file to update in place.
        tag_lookup: Lookup structure forwarded unchanged to
            ``successfully_modified_in_place``.

    Raises:
        json.JSONDecodeError: If a line of the file is not valid JSON.
        OSError: If the file cannot be opened for reading or writing.
    """
    import shutil  # local import: only needed for the chunked copy-back

    with tempfile.TemporaryFile() as tmp:
        with open(qle_path, "rb") as f:
            for json_line in f:
                data = json.loads(json_line)
                if successfully_modified_in_place(data, tag_lookup):
                    writeLine(tmp, data)
        # Rewind so the copy starts at the first written record.
        tmp.seek(0)
        with open(qle_path, "wb") as f:
            # Copy in chunks rather than tmp.read(), so the whole filtered
            # file is never held in memory at once.
            shutil.copyfileobj(tmp, f)
This leaves `successfully_modified_in_place` and `writeLine` to be defined. Your original logic has strange behavior, but I think it is supposed to look like this:
def successfully_modified_in_place(data, tag_lookup):
    """Attach matching tags to *data* and report whether any were found.

    For every entry of ``data["targets"]`` the ``item`` and ``lookup`` strings
    are stripped and lower-cased, then resolved through the nested mapping
    ``tag_lookup[item][lookup]``. Missing or falsy tags are ignored. If the
    same item matches more than once, the last match wins.

    Args:
        data: Document dict with a ``"targets"`` list; mutated in place on
            success by setting ``data["items"] = {"Tags": {...}}``.
        tag_lookup: Mapping of item -> mapping of lookup -> tag.

    Returns:
        True if at least one tag was attached; False otherwise, in which case
        *data* is left untouched.

    Raises:
        KeyError: If *data* has no ``"targets"`` key or a target lacks
            ``"item"`` / ``"lookup"``.
    """
    tags = {}
    for target in data["targets"]:
        item = target["item"].strip().lower()
        lookup = target["lookup"].strip().lower()
        tag = tag_lookup.get(item, {}).get(lookup)
        if not tag:
            continue
        tags[item] = tag
    if not tags:
        # Nothing matched: report failure without modifying the document.
        return False
    data["items"] = {"Tags": tags}
    return True


def writeLine(fp, data):
    """Write *data* to the binary file object *fp* as one JSON line.

    The record is terminated with a real newline so that a reader iterating
    the file line by line gets exactly one JSON document per line.
    """
    fp.write((json.dumps(data) + "\n").encode())
A testable `main` function might look like this:
def main():
    """Write two sample documents to a JSON-lines file and update it in place.

    The documents are serialized with ``writeLine`` to ``./data.jsonLines``,
    after which ``update_qle`` is run on that file with a small lookup table.
    """
    first_document = {
        "targets": [
            {
                "start": 40,
                "end": 73,
                "item": "trust:team_member",
                "lookup": "the cathedral",
            },
            {
                "start": 40,
                "end": 74,
                "item": "contact",
                "lookup": "some text",
            },
        ]
    }
    second_document = {
        "targets": [
            {
                "start": 40,
                "end": 73,
                "item": "trust:team_member",
                "lookup": "the cats",
            },
            {
                "start": 40,
                "end": 74,
                "item": "music",
                "lookup": "some other music note",
            },
        ]
    }

    path = "./data.jsonLines"
    with open(path, "wb") as out:
        for document in (first_document, second_document):
            writeLine(out, document)

    # Nested lookup: item -> lookup text -> tag string.
    tag_lookup = {
        "trust:team_member": {"the cathedral": "test 1 test 3 test 4"},
        "contact": {"some text": "test 5 test 3 test 4"},
    }
    update_qle(path, tag_lookup)
which we would expect to generate a file like
{ "targets": [ { "start": 40, "end": 73, "item": "trust:team_member", "lookup": "the cathedral" }, { "start": 40, "end": 74, "item": "contact", "lookup": "some text" } ], "items": { "Tags": { "trust:team_member": "test 1 test 3 test 4", "contact": "test 5 test 3 test 4" } } }
since the second element of data has no matching tags and will therefore be skipped.