
pandas assign across multiple columns functionally

Is there a way, in pandas, to apply a function to a chosen set of columns while keeping a strictly functional pipeline (no side effects, no assignment before the result, the result of the function depending only on its arguments, and without dropping the other columns)? I.e., what is the equivalent of across in R?

import pandas as pd
df = (
    pd.DataFrame({
        "column_a": [0, 3, 4, 2, 1],
        "column_b": [1, 2, 4, 5, 18],
        "column_c": [2, 4, 25, 25, 26],
        "column_d": [2, 4, -1, 5, 2],
        "column_e": [-1, -7, -8, -9, 3],
    })
    .assign(column_a=lambda df: df["column_a"] + 20)
    .assign(column_c=lambda df: df["column_c"] + 20)
    .assign(column_e=lambda df: df["column_e"] / 3)
    .assign(column_b=lambda df: df["column_b"] / 3)
)
print(df)

# column_a  column_b  column_c  column_d  column_e
# 0        20  0.333333        22         2 -0.333333
# 1        23  0.666667        24         4 -2.333333
# 2        24  1.333333        45        -1 -2.666667
# 3        22  1.666667        45         5 -3.000000
# 4        21  6.000000        46         2  1.000000

In R, I would have written:

library(dplyr)
df <-
tibble(
  column_a = c(0,3,4,2,1),
  column_b = c(1,2,4,5,18),
  column_c = c(2,4,25,25,26),
  column_d = c(2,4,-1,5,2),
  column_e = c(-1,-7,-8,-9,3)
) %>%
  mutate(across(c(column_a,column_c),~.x + 20),
         across(c(column_e,column_b),~.x / 3))

# # A tibble: 5 × 5
#   column_a column_b column_c column_d column_e
#      <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
# 1       20    0.333       22        2   -0.333
# 2       23    0.667       24        4   -2.33 
# 3       24    1.33        45       -1   -2.67 
# 4       22    1.67        45        5   -3    
# 5       21    6           46        2    1 


Answer

One option is to unpack the computation within assign:

(df
.assign(**df.loc(axis=1)[['column_a', 'column_c']].add(20), 
        **df.loc[:, ['column_e', 'column_b']].div(3))
)
   column_a  column_b  column_c  column_d  column_e
0        20  0.333333        22         2 -0.333333
1        23  0.666667        24         4 -2.333333
2        24  1.333333        45        -1 -2.666667
3        22  1.666667        45         5 -3.000000
4        21  6.000000        46         2  1.000000
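
This works because a DataFrame behaves like a mapping of column labels to Series: ** unpacking calls keys() and __getitem__, so the subset expands into one keyword argument per column. A minimal illustration (the subset name is just for demonstration):

subset = df[['column_a', 'column_c']].add(20)
print(list(subset.keys()))
# ['column_a', 'column_c']
# df.assign(**subset) is therefore equivalent to
# df.assign(column_a=subset['column_a'], column_c=subset['column_c'])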

For readability purposes, I’d suggest splitting it up:

first = df.loc(axis=1)[['column_a', 'column_c']].add(20)
second = df.loc[:, ['column_e', 'column_b']].div(3)
df.assign(**first, **second)

   column_a  column_b  column_c  column_d  column_e
0        20  0.333333        22         2 -0.333333
1        23  0.666667        24         4 -2.333333
2        24  1.333333        45        -1 -2.666667
3        22  1.666667        45         5 -3.000000
4        21  6.000000        46         2  1.000000
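
Note that first and second are still computed from the original df outside the chain. If you want the whole computation to stay inside a single side-effect-free chain, the same unpacking can be wrapped in pipe with a lambda; a small sketch of that variant:

(df
.pipe(lambda d: d.assign(**d[['column_a', 'column_c']].add(20)))
.pipe(lambda d: d.assign(**d[['column_e', 'column_b']].div(3)))
)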

Another option, still using the unpacking idea, is to iterate through the columns and build the mapping based on a pattern:

mapper = {key: value.add(20) if key.endswith(('a', 'c'))
          else value.div(3) if key.endswith(('e', 'b'))
          else value
          for key, value in df.items()}

df.assign(**mapper)
   column_a  column_b  column_c  column_d  column_e
0        20  0.333333        22         2 -0.333333
1        23  0.666667        24         4 -2.333333
2        24  1.333333        45        -1 -2.666667
3        22  1.666667        45         5 -3.000000
4        21  6.000000        46         2  1.000000
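
If the endswith pattern feels too implicit, the same comprehension can be driven by an explicit column-to-function mapping; a sketch, where the funcs dict is only an illustration:

funcs = {
    'column_a': lambda s: s + 20,
    'column_c': lambda s: s + 20,
    'column_e': lambda s: s / 3,
    'column_b': lambda s: s / 3,
}

df.assign(**{col: fn(df[col]) for col, fn in funcs.items()})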

You can dump it into a function and then pipe it:

def func(f):
    mapp = {}
    for key, value in f.items():
        if key in ('column_a', 'column_c'):
            value = value + 20
        elif key in ('column_e', 'column_b'):
            value = value / 3
        mapp[key] = value
    return f.assign(**mapp)

df.pipe(func)

   column_a  column_b  column_c  column_d  column_e
0        20  0.333333        22         2 -0.333333
1        23  0.666667        24         4 -2.333333
2        24  1.333333        45        -1 -2.666667
3        22  1.666667        45         5 -3.000000
4        21  6.000000        46         2  1.000000

We can take the function declaration a step further for easier reuse:

def across(df, columns, func):
    result = func(df.loc[:, columns])
    return df.assign(**result)

(df
.pipe(across, ['column_a', 'column_c'], lambda df: df + 20)
.pipe(across, ['column_e', 'column_b'], lambda df: df / 3)
) 
   column_a  column_b  column_c  column_d  column_e
0        20  0.333333        22         2 -0.333333
1        23  0.666667        24         4 -2.333333
2        24  1.333333        45        -1 -2.666667
3        22  1.666667        45         5 -3.000000
4        21  6.000000        46         2  1.000000
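
If you want the helper to accept a selector function as well as an explicit list (in the spirit of dplyr's tidyselect helpers), it could be extended to take a callable applied to each column name; this predicate-based variant is a sketch, not part of the answer above:

def across(df, columns, func):
    # columns can be a list of labels or a predicate applied to each column name
    if callable(columns):
        columns = [col for col in df.columns if columns(col)]
    return df.assign(**func(df.loc[:, columns]))

(df
.pipe(across, lambda name: name in ('column_a', 'column_c'), lambda df: df + 20)
.pipe(across, ['column_e', 'column_b'], lambda df: df / 3)
)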

pyjanitor has a transform_columns function that can be handy for this:

# pip install pyjanitor
import janitor  # importing registers transform_columns on DataFrame

(df
.transform_columns(['column_a', 'column_c'], lambda df: df + 20)
.transform_columns(['column_e', 'column_b'], lambda df: df / 3)
)
   column_a  column_b  column_c  column_d  column_e
0        20  0.333333        22         2 -0.333333
1        23  0.666667        24         4 -2.333333
2        24  1.333333        45        -1 -2.666667
3        22  1.666667        45         5 -3.000000
4        21  6.000000        46         2  1.000000