I am trying to build a tool which can take any JSON data and convert that into multiple data frame based on data types. I am trying to add each data frame with a relation so that we can identify which data belong to which parent element(key).
For Example :
{
"name":"Some name"
"date": "12:23:2022"
"Students":[
{
"id":",some id"
"value": "some val"
},
{
"id":",some id2"
"value": "some val2"
}, {
"id":",some id3"
"value": "some val3"
},
],
"Error":[
{
"id":",some id",
"code": "some code",
"emessage":[
{
"err_trac":"Missing syntax",
"Err_code":";"
},
{
"err_trac":"invalid syntax",
"Err_code":"="
}
]
},
{
"id":",some id2",
"code": "some code 2",
"emessage":[
{
"err_trac":"Missing syntax",
"Err_code":";"
},
{
"err_trac":"invalid syntax",
"Err_code":"="
}
]
}, {
"id":",some id3",
"code": "some code3",
"emessage":[
{
"err_trac":"Missing syntax",
"Err_code":";"
},
{
"err_trac":"invalid syntax",
"Err_code":"="
}
]
},
]
}
I wanted to have data frame such as
Run
name, date , id (uuid)
Error
id, code parent_id(id of run), id (uuid)
Students
id, value, parent_id(id of run) , id (uuid)
emessage
err_trac, Err_code , parent_id(id of Error )
And have a relations with UUID to identify which key belongs to which parents id. I am trying the flattening approach to solve this problem using python and pandas . But my solution does not works for nested JSON.
Here is what I am trying.
import json
import pandas as pd
op = {}
import uuid
def format_string(string):
return string.replace(" ", "_")
def get_prefix(prefix, key):
if not key:
return format_string(prefix)
if prefix:
return format_string(prefix + "_" + key)
else:
return key
def flatten(prefix, key, value, uid, result=[]):
if isinstance(value, str):
result.append({get_prefix(prefix, key): value})
if isinstance(value, dict):
for item in value.keys():
flatten(get_prefix(prefix, key), item, value.get(item), uid, result)
if isinstance(value, list):
if prefix:
for i in range(len(value)):
flatten(
get_prefix(prefix, key + "[{}]".format(i)),
"",
value[i],
uid,
op[key],
)
else:
for i in range(len(value)):
flatten(
get_prefix(prefix, key + "[{}]".format(i)),
"",
value[i],
uid,
result,
)
res = {key: val for d in result for key, val in d.items()}
df = pd.DataFrame.from_dict(res, orient="index")
df["uuid"] = uid
op["result"] = df
return result
def solution() -> str:
f = open("example-input/sample.json", "r")
if f:
str_val = json.load(f)
print("j")
for key, value in str_val.items():
# pd_op = pd.json_normalize(str_val)
# print(pd_op.columns)
# for x in pd_op["run.tip usage"]:
# print(x[0])
# break
flatten("", key, str_val.get(key), uuid.uuid4())
return op
print(solution())
Update
The reason I wanted to create multiple dataframe is to put this data into Datalake and later access it via Athena in AWS. Once I get the dataframe I can move them into SQL tables.
Advertisement
Answer
The structure you are describing – a JSON of an indefinitely defined number of nested JSONs – fits exactly with a tree data structure. Since we are looking to store the ID of the parent JSON in each dataframe, we will approach this with BFS (breadth first search) a.k.a. level order traversal. This is a common graph traversal algorithm well suited to this kind of problem.
If a element has an id of None, it indicates it is the root or top level element.
import json
import pandas as pd
import uuid
def nested_json_list_df(file : str):
dict_list = []
def bfs(json_dict : dict, node_name : str, parent_uuid : uuid.UUID):
"""Breadth First Search a.k.a. Level order Traversal
"""
# Create parent node
uuid_val = uuid.uuid1()
out_dict = {'id': node_name, 'uuid': uuid_val, 'parent id': parent_uuid}
# Search child nodes
for key, val in json_dict.items():
# If a child node is a dict itself, it is a sub-nested JSON
if isinstance(val, dict):
bfs(val, key, uuid_val)
# A list of single-nested dicts is simply a new entry
# A list containing dicts within dicts is interpreted
# as another nested JSON
elif isinstance(val, list):
new_val = []
for v in val:
if isinstance(v, dict):
new_dict = dict()
for key2, val2 in v.items():
# Indicates nested JSONs
if isinstance(val2, dict):
bfs(val2, key, uuid_val)
else:
new_dict[key2] = val2
new_val.append(new_dict)
else:
new_val.append(v)
uuid2 = uuid.uuid1()
out_dict2 = {'id': key, 'uuid': uuid2, 'parent id': parent_uuid, key : new_val}
dict_list.append({key : out_dict2})
else:
out_dict[key] = val
dict_list.append({node_name : out_dict})
return dict_list
## Run BFS ##
with open(file) as f:
json_dict = json.load(f)
df_list = []
for d in bfs(json_dict, file, None):
df_list.append(pd.DataFrame(d))
return df_list
df_list = nested_json_list_df('temp.json')
for df in df_list:
print(df)
Output:
Students
Students [{'id': ',some id', 'value': 'some val'}, {'id...
id Students
parent id None
uuid 2d68cce3-c7f7-11ec-81a3-b0227ae68aa0
Error
Error [{'id': ',some id', 'code': 'some code', 'emes...
id Error
parent id None
uuid 2d68cce4-c7f7-11ec-b0da-b0227ae68aa0
temp.json
date 12:23:2022
id temp.json
name Some name
parent id None
uuid 2d68cce2-c7f7-11ec-bcd8-b0227ae68aa0