I want to add new column “Null_Values” in PySpark dataframe as below
=======================================================| ID | Maths | Science | English | NUll_Values | =======================================================| 11 | 80 | NULL | 89 | Science | 12 | NULL | NULL | 89 | Maths,Science | 13 | 90 | 95 | 85 | NULL | 14 | NULL | NULL | NULL | Maths,Science,English | =======================================================|
Advertisement
Answer
df = df.withColumn( "NUll_Values", F.concat_ws(",", *[F.when(F.col(c).isNull(), c) for c in df.columns]) ) df.show(truncate=False) # +---+-----+-------+-------+---------------------+ # |ID |Maths|Science|English|NUll_Values | # +---+-----+-------+-------+---------------------+ # |11 |80 |null |89 |Science | # |12 |null |null |89 |Maths,Science | # |13 |90 |95 |85 | | # |14 |null |null |null |Maths,Science,English| # +---+-----+-------+-------+---------------------+
or
df = df.withColumn( "NUll_Values", F.concat_ws(",", *[F.when(F.col(c).isNull(), c) for c in df.columns]) ).replace("", None, subset=["NUll_Values"]) df.show(truncate=False) # +---+-----+-------+-------+---------------------+ # |ID |Maths|Science|English|NUll_Values | # +---+-----+-------+-------+---------------------+ # |11 |80 |null |89 |Science | # |12 |null |null |89 |Maths,Science | # |13 |90 |95 |85 |null | # |14 |null |null |null |Maths,Science,English| # +---+-----+-------+-------+---------------------+