I have a dataframe where a column is in the form of a list of json. I want to extract a specific value (score) from the column and create independent columns.
raw_data = [{"user_id" : 1234, "col" : [{"id":14577120145280,"score":64.71,"Elastic_position":0},{"id":14568530280240,"score":88.53,"Elastic_position":1},{"id":14568530119661,"score":63.75,"Elastic_position":2},{"id":14568530205858,"score":62.79,"Elastic_position":3},{"id":14568530414899,"score":60.88,"Elastic_position":4}]}] df = pd.DataFrame.from_dict(raw_data)
I want to explode my result dataframe as:
Advertisement
Answer
Assuming you have your json looks like this
# a.json # { # "user_id" : 1234, # "col" : [ # {"id":14577120145280,"score":64.71,"Elastic_position":0}, # {"id":14568530280240,"score":88.53,"Elastic_position":1}, # {"id":14568530119661,"score":63.75,"Elastic_position":2}, # {"id":14568530205858,"score":62.79,"Elastic_position":3}, # {"id":14568530414899,"score":60.88,"Elastic_position":4} # ] # }
You can read it, flatten it, then pivot it like so
from pyspark.sql import functions as F from pyspark.sql import types as T schema = T.StructType([ T.StructField('user_id', T.IntegerType()), T.StructField('col', T.StringType()), ]) df = spark.read.json('a.json', multiLine=True, schema=schema) df.show(10, False) # +-------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ # |user_id|col | # +-------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ # |1234 |[{"id":14577120145280,"score":64.71,"Elastic_position":0},{"id":14568530280240,"score":88.53,"Elastic_position":1},{"id":14568530119661,"score":63.75,"Elastic_position":2},{"id":14568530205858,"score":62.79,"Elastic_position":3},{"id":14568530414899,"score":60.88,"Elastic_position":4}]| # +-------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ df.printSchema() # root # |-- user_id: integer (nullable = true) # |-- col: string (nullable = true) (df # this will parse your JSON string to JSON object .withColumn('col', F.from_json( F.col('col'), T.ArrayType(T.StructType([ T.StructField('id', T.LongType()), T.StructField('score', T.DoubleType()), T.StructField('Elastic_position', T.IntegerType()), ])) )) .select('user_id', F.explode('col')) .groupBy('user_id') .pivot('col.Elastic_position') .agg(F.first('col.score')) .show(10, False) ) # Output # +-------+-----+-----+-----+-----+-----+ # |user_id|0 |1 |2 |3 |4 | # +-------+-----+-----+-----+-----+-----+ # |1234 |64.71|88.53|63.75|62.79|60.88| # +-------+-----+-----+-----+-----+-----+