I am trying to run the same function on a Spark DataFrame instead of a pandas DataFrame.
JavaScript
x
6
1
def check_value(df):
    """Report whitespace-only and null values for every column of a pandas DataFrame.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        A list with one message per column, in column order. Each message
        gives the percentage (0-100, rounded to 2 decimals) of values whose
        string form consists only of whitespace, and the number of null values.
    """
    row_count = df.shape[0]
    lista = []
    for column in df.columns:
        values = df[column]
        blank_count = sum(str(item).isspace() for item in values)
        # Guard against a frame that has columns but no rows (would divide by zero).
        # The message says "%", so scale the fraction to a real percentage.
        blank_pct = round(blank_count / row_count * 100, 2) if row_count else 0.0
        lista.append(
            'Column name: db_transactions/{} has {} % of white_space charachter and {} nullvalues'.format(
                column, blank_pct, pd.isna(values).sum()
            )
        )
    return lista
6
Advertisement
Answer
A direct translation would require a separate `collect` call for each column calculation.
I suggest you compute all the column calculations as a single-row dataframe and then collect that one row. Here's an example.
JavaScript
1
14
14
1
# input dataframe, say `data_sdf`
2
# the blank values can have none or multiple whitespaces - ' ', '', ' ', etc.
3
# +--------+--------+
4
# | chars1| chars2|
5
# +--------+--------+
6
# | | |
7
# | | |
8
# | blah | blah |
9
# | blah| blah|
10
# | blah | blah |
11
# | blah| blah|
12
# | null| |
13
# +--------+--------+
14
Calculate percentage of whitespace values and number of null values for all columns.
JavaScript
1
11
11
1
# One aggregated row with two metrics per input column:
#   <col>_wspace - fraction of rows whose trimmed value is the empty string
#                  (null values never match, but still count in the denominator)
#   <col>_null   - number of null values
# `func` is presumably `pyspark.sql.functions` - confirm against the file's imports.
# The call is wrapped in parentheses so it can span several lines without backslashes.
calc_sdf = data_sdf.select(
    *[
        (func.sum((func.trim(func.col(colname)) == '').cast('int')) / func.count('*')).alias(colname + '_wspace')
        for colname in data_sdf.columns
    ],
    *[
        func.sum(func.col(colname).isNull().cast('int')).alias(colname + '_null')
        for colname in data_sdf.columns
    ]
)
5
6
# +------------------+-------------------+-----------+-----------+
7
# | chars1_wspace| chars2_wspace|chars1_null|chars2_null|
8
# +------------------+-------------------+-----------+-----------+
9
# |0.2857142857142857|0.42857142857142855| 1| 0|
10
# +------------------+-------------------+-----------+-----------+
11
We can convert the calculated fields into a dictionary for easy use when building `lista`.
JavaScript
1
9
1
# `calc_sdf` is a select of aggregates only, so it has a single row;
# collect it and turn that Row into a plain {column_name: value} dict.
calc_dict = calc_sdf.collect()[0].asDict()
4
5
# {'chars1_null': 1,
6
# 'chars1_wspace': 0.2857142857142857,
7
# 'chars2_null': 0,
8
# 'chars2_wspace': 0.42857142857142855}
9
Use `calc_dict` to build `lista`.
JavaScript
1
12
12
1
# Build one report line per column from the collected metrics in `calc_dict`.
# The whitespace ratio is NULL (None) when a column has no non-null values,
# so fall back to 0.0 before scaling it to a percentage.
lista = [
    'Column name: db_transactions/{} has {} % of white_space character and {} nullvalues'.format(
        colname,
        round((calc_dict[colname + '_wspace'] or 0.0) * 100, 2),
        calc_dict[colname + '_null'],
    )
    for colname in data_sdf.columns
]
9
10
# ['Column name: db_transactions/chars1 has 28.57 % of white_space character and 1 nullvalues',
11
# 'Column name: db_transactions/chars2 has 42.86 % of white_space character and 0 nullvalues']
12