Skip to content
Advertisement

Column with column names for nulls in row

I want to add new column “Null_Values” in PySpark dataframe as below

=======================================================|
ID | Maths | Science | English | NUll_Values           |
=======================================================|
11 | 80    | NULL    | 89      | Science               |
12 | NULL  | NULL    | 89      | Maths,Science         |
13 | 90    | 95      | 85      | NULL                  |
14 | NULL  | NULL    | NULL    | Maths,Science,English |
=======================================================|

Advertisement

Answer

df = df.withColumn(
    "NUll_Values",
    F.concat_ws(",", *[F.when(F.col(c).isNull(), c) for c in df.columns])
)

df.show(truncate=False)
#  +---+-----+-------+-------+---------------------+
#  |ID |Maths|Science|English|NUll_Values          |
#  +---+-----+-------+-------+---------------------+
#  |11 |80   |null   |89     |Science              |
#  |12 |null |null   |89     |Maths,Science        |
#  |13 |90   |95     |85     |                     |
#  |14 |null |null   |null   |Maths,Science,English|
#  +---+-----+-------+-------+---------------------+

or

df = df.withColumn(
    "NUll_Values",
    F.concat_ws(",", *[F.when(F.col(c).isNull(), c) for c in df.columns])
).replace("", None, subset=["NUll_Values"])

df.show(truncate=False)
#  +---+-----+-------+-------+---------------------+
#  |ID |Maths|Science|English|NUll_Values          |
#  +---+-----+-------+-------+---------------------+
#  |11 |80   |null   |89     |Science              |
#  |12 |null |null   |89     |Maths,Science        |
#  |13 |90   |95     |85     |null                 |
#  |14 |null |null   |null   |Maths,Science,English|
#  +---+-----+-------+-------+---------------------+
User contributions licensed under: CC BY-SA
1 People found this is helpful
Advertisement