Skip to content
Advertisement

extract value from a list of json in pyspark

I have a dataframe where a column is in the form of a list of json. I want to extract a specific value (score) from the column and create independent columns.

raw_data = [{"user_id" : 1234, "col" : [{"id":14577120145280,"score":64.71,"Elastic_position":0},{"id":14568530280240,"score":88.53,"Elastic_position":1},{"id":14568530119661,"score":63.75,"Elastic_position":2},{"id":14568530205858,"score":62.79,"Elastic_position":3},{"id":14568530414899,"score":60.88,"Elastic_position":4}]}]

df = pd.DataFrame.from_dict(raw_data)

I want to explode my result dataframe as:

enter image description here

Advertisement

Answer

Assuming you have your json looks like this

# a.json
# {
#     "user_id" : 1234,
#     "col" : [
#         {"id":14577120145280,"score":64.71,"Elastic_position":0},
#         {"id":14568530280240,"score":88.53,"Elastic_position":1},
#         {"id":14568530119661,"score":63.75,"Elastic_position":2},
#         {"id":14568530205858,"score":62.79,"Elastic_position":3},
#         {"id":14568530414899,"score":60.88,"Elastic_position":4}
#     ]
# }

You can read it, flatten it, then pivot it like so

from pyspark.sql import functions as F
from pyspark.sql import types as T

schema = T.StructType([
    T.StructField('user_id', T.IntegerType()),
    T.StructField('col', T.StringType()),
])

df = spark.read.json('a.json', multiLine=True, schema=schema)
df.show(10, False)

# +-------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
# |user_id|col                                                                                                                                                                                                                                                                                           |
# +-------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
# |1234   |[{"id":14577120145280,"score":64.71,"Elastic_position":0},{"id":14568530280240,"score":88.53,"Elastic_position":1},{"id":14568530119661,"score":63.75,"Elastic_position":2},{"id":14568530205858,"score":62.79,"Elastic_position":3},{"id":14568530414899,"score":60.88,"Elastic_position":4}]|
# +-------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+


df.printSchema()
# root
#  |-- user_id: integer (nullable = true)
#  |-- col: string (nullable = true)

(df
    # this will parse your JSON string to JSON object
    .withColumn('col', F.from_json(
        F.col('col'),
        T.ArrayType(T.StructType([
            T.StructField('id', T.LongType()),
            T.StructField('score', T.DoubleType()),
            T.StructField('Elastic_position', T.IntegerType()),
        ]))
    ))
 
    .select('user_id', F.explode('col'))
    .groupBy('user_id')
    .pivot('col.Elastic_position')
    .agg(F.first('col.score'))
    .show(10, False)
)

# Output
# +-------+-----+-----+-----+-----+-----+
# |user_id|0    |1    |2    |3    |4    |
# +-------+-----+-----+-----+-----+-----+
# |1234   |64.71|88.53|63.75|62.79|60.88|
# +-------+-----+-----+-----+-----+-----+
User contributions licensed under: CC BY-SA
2 People found this is helpful
Advertisement