
Convert a pandas df to defaultdict and look for matching positive and negative numbers

I have a dataframe like this:

from collections import defaultdict
import pandas as pd

d = {'id': [1,1,1,1,2,2,3,3,3,4,4,4,4],
     'label':['A','A','B','G','A','BB','C','C','A','BB','B','AA','AA']
    ,'amount':[2,-12,12,-12,5,-5,2,3,5,3,3,10,-10]}
df = pd.DataFrame(d)
print(df)
    id  label  amount
0    1      A       2
1    1      A     -12
2    1      B      12
3    1      G     -12
4    2      A       5
5    2     BB      -5
6    3      C       2
7    3      C       3
8    3      A       5
9    4     BB       3
10   4      B       3
11   4     AA      10
12   4     AA     -10

What I want to do now is convert the df into a defaultdict and keep only the matching positive and negative numbers per id, so I want my desired dict to look something like this:

defaultdict(list,
            {1: [{'label': 'B', 'amount': 12},
                 {'label': 'A', 'amount': -12},
                 {'label': 'G', 'amount': -12}],
             2: [{'label': 'A', 'amount': 5}, {'label': 'BB', 'amount': -5}],
             4: [{'label': 'AA', 'amount': 10},
                 {'label': 'AA', 'amount': -10}]})

I tried to do this below but I get this error:

dd = defaultdict(list)
for index, row in df.iterrows():
    if row['amount'].isin(-row['amount']):
        dd[row["id"]].append(
            {  "label": row["label"],
                'amount':row['amount'] })



----> 3     if row['amount'].isin(-row['amount']):
      4         dd[row["id"]].append(
      5             {  "label": row["label"],

AttributeError: 'int' object has no attribute 'isin'

Any idea how to solve this would be much appreciated. Thanks!


Answer

In pandas, iterrows is not recommended, check this answer. An alternative pandas-only solution with the same logic as your solution – per group test whether each amount has at least one matching negative counterpart, filter by boolean indexing and convert to a dictionary per id with DataFrame.to_dict:

import numpy as np

def f(x):
    # for each amount in the group, test whether its exact negative also occurs
    a = x.to_numpy()
    return np.any(a == -a[:, None], axis=1)


d = (df[df.groupby('id')['amount'].transform(f)]
       .groupby('id')[['label','amount']]
       .apply(lambda x: x.to_dict(orient='records'))
       .to_dict())
print (d)
{1: [{'label': 'A', 'amount': -12}, 
     {'label': 'B', 'amount': 12},
     {'label': 'G', 'amount': -12}], 
 2: [{'label': 'A', 'amount': 5},
     {'label': 'BB', 'amount': -5}], 
 4: [{'label': 'AA', 'amount': 10},
     {'label': 'AA', 'amount': -10}]}
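
To see how f works, here is a small sketch over the amounts of the id 1 group from the sample data: the pairwise comparison flags a row whenever some other amount in the same group is its exact negative.

import numpy as np

# amounts of the id 1 group in the sample data
a = np.array([2, -12, 12, -12])

# entry [i, j] is True when a[j] equals -a[i]
print(a == -a[:, None])
# [[False False False False]
#  [False False  True False]
#  [False  True False  True]
#  [False False  True False]]

# a row is kept when any amount in the group cancels it out
print(np.any(a == -a[:, None], axis=1))
# [False  True  True  True]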

Or it is possible to filter all rows duplicated by absolute amount, but it is also necessary to test that both a positive and a negative value exist per group with DataFrameGroupBy.nunique:

f = lambda x: x.to_dict(orient='records')
df1 = df.assign(amount = df['amount'].abs(), new=np.sign(df['amount']))
m = (df1.groupby(['id','amount'])['new'].transform('nunique').gt(1) & 
     df1.duplicated(['id','amount'], keep=False))

d = df[m].groupby('id')[['label','amount']].apply(f).to_dict()
print (d)
{1: [{'label': 'A', 'amount': -12}, 
     {'label': 'B', 'amount': 12},
     {'label': 'G', 'amount': -12}],
 2: [{'label': 'A', 'amount': 5}, 
     {'label': 'BB', 'amount': -5}],
 4: [{'label': 'AA', 'amount': 10}, 
     {'label': 'AA', 'amount': -10}]}
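
For clarity, a minimal sketch of this second approach on the sample df, with the two conditions split into named intermediates (both_signs, dup are just spelled-out helper names): the new column holds only the sign of the original amount, so a duplicated absolute amount only passes when both signs occur within the id.

df1 = df.assign(amount=df['amount'].abs(), new=np.sign(df['amount']))

# both signs present for this (id, absolute amount) pair ...
both_signs = df1.groupby(['id', 'amount'])['new'].transform('nunique').gt(1)
# ... and the pair occurs more than once within the id
dup = df1.duplicated(['id', 'amount'], keep=False)

m = both_signs & dup
print(m.tolist())
# [False, True, True, True, True, True, False, False, False, False, False, True, True]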

If the DataFrame is small and performance is not important, you can test the values per id group; for the test use == with Series.any:

from collections import defaultdict

df['negative_amount'] = - df['amount']
d = defaultdict(list)

for i, g in df.groupby('id'):
    for index, row in g.iterrows():
        if (g['negative_amount'] == row['amount']).any():
            d[i].append({'label': row['label'], 'amount': row['amount']})
            
print (d)
defaultdict(<class 'list'>, {1: [{'label': 'A', 'amount': -12},
                                 {'label': 'B', 'amount': 12},
                                 {'label': 'G', 'amount': -12}], 
                             2: [{'label': 'A', 'amount': 5}, 
                                 {'label': 'BB', 'amount': -5}],
                             4: [{'label': 'AA', 'amount': 10},
                                 {'label': 'AA', 'amount': -10}]})
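
If the defaultdict(list) from the question is needed rather than the plain dict returned by the first two solutions, the plain dict d can simply be wrapped (the last solution already builds a defaultdict):

from collections import defaultdict

# wrap the plain dict so that ids without a matching pair return an empty list
dd = defaultdict(list, d)
print(dd[4])   # [{'label': 'AA', 'amount': 10}, {'label': 'AA', 'amount': -10}]
print(dd[3])   # [] - id 3 has no matching positive/negative amounts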

Performance on 10k rows and 1k groups; the last solution is the slowest:

np.random.seed(123)

N = 10000
d = {'id': np.random.randint(1000, size=N),
     'label':np.random.choice(['A','A','B','G','A','BB','C','C','A','BB','B','AA','AA'], size=N),
     'amount':np.random.choice([2,-12,12,-12,5,-5,2,3,5,3,3,10,-10], size=N)}
df = pd.DataFrame(d).sort_values('id', ignore_index=True)
print(df)

In [47]: %%timeit
    ...: def f(x):
    ...:     a = x.to_numpy()
    ...:     return np.any(a == - a[:, None], axis=1)
    ...: 
    ...: d = (df[df.groupby('id')['amount'].transform(f)]
    ...:         .groupby('id')[['label','amount']]
    ...:         .apply(lambda x: x.to_dict(orient='records'))
    ...:         .to_dict())
    ...:         
225 ms ± 1.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [48]: %%timeit
    ...: f = lambda x: x.to_dict(orient='records')
    ...: df1 = df.assign(amount = df['amount'].abs(), new=np.sign(df['amount']))
    ...: m = (df1.groupby(['id','amount'])['new'].transform('nunique').gt(1) & 
    ...:      df1.duplicated(['id','amount'], keep=False))
    ...: 
    ...: d = df[m].groupby('id')[['label','amount']].apply(f).to_dict()
    ...: 
124 ms ± 9.31 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

In [49]: %%timeit
    ...: df['negative_amount'] = - df['amount']
    ...: d = defaultdict(list)
    ...: 
    ...: for i, g in df.groupby('id'):
    ...:     for  index,row in g.iterrows():
    ...:         if (g['negative_amount'] == row['amount']).any():
    ...:             d[i].append({  "label": row["label"], 'amount':row['amount'] })
    ...:             
3.51 s ± 366 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)