I have a dataset of duplicate rows (keyed on ID). The dataset contains both general information and emails. I'm trying to concatenate the emails (where a cell contains the character @) and then remove the duplicates.
My current code is a modification of Eric Ed Lohmar's code and gives the following output. My issue is that I'm not able to exclude "noise" data, so I get values like ', nan', ', , 0' and '-,' in my final result.
How do I append only cells that contain email addresses? I thought I could skip appending cells that lack the character @, by using a wildcard and replacing this part:
if row['Store1_Email']: # <- not working
with any of these attempts, but nothing is working:
1.
if str('**@**') in row['Store1_Email']: # <- not working
Error:
Traceback (most recent call last):
File "g:/Till/till_duplicate.py", line 35, in <module>
if str('**@**') in row['Store1_Email']:
TypeError: argument of type 'float' is not iterable
PS G:\Till>
2.
if df_merged_duplicates[df_merged_duplicates.loc[i, 'Store1_Email'].str.contains('@')]:
Error:
Traceback (most recent call last):
File "g:/Till/till_duplicate.py", line 35, in <module>
if df_merged_duplicates[df_merged_duplicates.loc[i, 'Store1_Email'].str.contains('@')]:
AttributeError: 'str' object has no attribute 'str'
PS G:\Till>
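Both tracebacks point to the same cause: empty cells are read in as NaN, which is a float, so neither in nor .str works on them directly. As a minimal sketch (my illustration, not from the original post), a type-safe test inside the loop would look like:

value = row['Store1_Email']                  # may be a str or a float NaN
if isinstance(value, str) and '@' in value:  # False for NaN, '0' and '-'
    print('looks like an email')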
Full Code:
import pandas as pd
import os
from datetime import datetime
import time
from shutil import copyfile
from functools import reduce
import numpy as np
import glob
# Settings
path_data_sources = 'G:/Till/'
# print(path_data_sources + 'test_duplicates - Copy.xlsx')
## https://stackoverflow.com/questions/36271413/pandas-merge-nearly-duplicate-rows-based-on-column-value
# df_merged_duplicates = pd.read_excel(path_data_sources + 'test_duplicates - Source.xlsx', sheet_name="Sheet1", dtype=str)
data = {'ID': ['001', '001', '002', '002', '003', '003', '004', '004', '005', '005', '006', '006', '007', '007', '008', '008', '009', '009', '010', '010', '011', '011', '012', '012', '013', '013', '014', '014'],
        'Header 1': ['AA', 'AA', 'BB', 'BB', 'CC', 'CC', 'DD', 'DD', 'EE', 'EE', 'FF', 'FF', 'GG', 'GG', 'HH', 'HH', 'II', 'II', 'JJ', 'JJ', 'KK', 'KK', 'LL', 'LL', 'MM', 'MM', 'NN', 'NN'],
        'Header 2': [np.nan] * 28,
        'Header 3': [np.nan] * 28,
        'Header 4': [np.nan] * 28,
        'Header 5': [np.nan] * 28,
        'Store1_Email': ['Email@company1.com', np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 'Email@company2.com', 'Email@company2.com', 'Email@company3.com', 'Email@company3.com', np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 'Email@company4.com', 'Email@company4.com', np.nan, np.nan, np.nan, np.nan],
        'Header 7': [np.nan] * 28,
        'Header 8': [np.nan] * 28,
        'Header 9': [np.nan] * 28,
        'Store2_Email': [np.nan] * 28,
        'Header 11': [np.nan] * 28,
        'Header 12': [np.nan] * 28,
        'Store3_Email': [np.nan] * 28,
        'Header 14': [np.nan] * 28,
        'Header 15': [np.nan] * 28,
        'Header 16': [np.nan] * 28,
        'Header 17': [np.nan] * 28,
        'Store4_Email': ['Email2@company2.com', '0', np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 'Email2@company3.com', '-', np.nan, np.nan, '-', 'Email2@company4.com', np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
        'Header 19': [np.nan] * 28}
df_merged_duplicates = pd.DataFrame(data)
print(df_merged_duplicates)
df_merged_duplicates = df_merged_duplicates.sort_values(by=['ID']) # sort ID column
# Store 1 emails, merge
cmnts = {}
for i, row in df_merged_duplicates.iterrows():
    while True:
        try:
            if row['Store1_Email']: # <- not working
                cmnts[row['ID']].append(row['Store1_Email'])
            else:
                cmnts[row['ID']].append(np.nan)
            break
        except KeyError:
            cmnts[row['ID']] = []

# Store 2 emails, merge
cmnts2 = {}
for i, row in df_merged_duplicates.iterrows():
    while True:
        try:
            if row['Store2_Email']: # <- not working
                cmnts2[row['ID']].append(row['Store2_Email'])
            else:
                cmnts2[row['ID']].append(np.nan)
            break
        except KeyError:
            cmnts2[row['ID']] = []

# Store 3 emails, merge
cmnts3 = {}
for i, row in df_merged_duplicates.iterrows():
    while True:
        try:
            if row['Store3_Email']: # <- not working
                cmnts3[row['ID']].append(row['Store3_Email'])
            else:
                cmnts3[row['ID']].append(np.nan)
            break
        except KeyError:
            cmnts3[row['ID']] = []

# Store 4 emails, merge
cmnts4 = {}
for i, row in df_merged_duplicates.iterrows():
    while True:
        try:
            if row['Store4_Email']: # <- not working
                cmnts4[row['ID']].append(row['Store4_Email'])
            else:
                cmnts4[row['ID']].append(np.nan)
            break
        except KeyError:
            cmnts4[row['ID']] = []
df_merged_duplicates.drop_duplicates('ID', inplace=True)
df_merged_duplicates['Store1_Email'] = [', '.join(map(str, v)) for v in cmnts.values()]
df_merged_duplicates['Store2_Email'] = [', '.join(map(str, v)) for v in cmnts2.values()]
df_merged_duplicates['Store3_Email'] = [', '.join(map(str, v)) for v in cmnts3.values()]
df_merged_duplicates['Store4_Email'] = [', '.join(map(str, v)) for v in cmnts4.values()]
print(df_merged_duplicates)
df_merged_duplicates.to_excel(path_data_sources + 'test_duplicates_ny.xlsx', index=False)
Answer
I would use a “split-apply-combine” approach. In pandas you can use the groupby function for this, then apply a function that combines the email addresses within each group (in this case, grouping by the ID column).
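As a minimal sketch of that pattern (the toy frame below is illustrative, not the question's data):

import pandas as pd

toy = pd.DataFrame({'ID': ['001', '001', '002'],
                    'Email': ['a@x.com', 'b@x.com', 'c@y.com']})
# split by ID, apply a join to each group, combine back to one row per ID
print(toy.groupby('ID')['Email'].apply(', '.join))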
I wrote a function to combine the email addresses for a given column:
def combine_emails(series):
    # astype(str) turns NaN into the string 'nan', so missing values and
    # noise such as '0' or '-' are all dropped by the '@' test
    strs = [s for s in series.astype(str).values if '@' in s]
    combined_emails = ",".join(strs)
    if combined_emails != '':
        return combined_emails
    else:
        return np.nan
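A quick check with the kinds of “noise” values from the question (assuming the imports and the function above) shows that only real addresses survive:

s = pd.Series(['Email2@company2.com', '0', np.nan, '-'])
print(combine_emails(s))  # Email2@company2.com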
Then I wrote a function that takes the first row of each grouped dataframe and calls the combine function on the email columns to populate that row's email values:
def combine_duplicate_rows(df):
    # copy the first row so the assignments below don't trigger
    # pandas' SettingWithCopyWarning
    first_row = df.iloc[0].copy()
    for email_col in ['Store1_Email', 'Store2_Email', 'Store3_Email', 'Store4_Email']:
        first_row[email_col] = combine_emails(df[email_col])
    return first_row
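Taking the first row works here because, within an ID group, only the email columns differ; all the other columns are identical across the duplicate rows.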
Then you can apply combine_duplicate_rows to your groups and you get the solution:
In [71]: df.groupby('ID').apply(combine_duplicate_rows)
Out[71]:
ID Header 1 Header 2 Header 3 Header 4 Header 5 Store1_Email Header 9 Store2_Email Header 12 Store3_Email Header 17 Store4_Email
ID
1 1 AA NaN NaN NaN NaN Email@company1.com NaN NaN NaN NaN NaN Email2@company2.com
2 2 BB NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 3 CC NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 4 DD NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
5 5 EE NaN NaN NaN NaN Email@company2.com,Email@company2.com NaN NaN NaN NaN NaN NaN
6 6 FF NaN NaN NaN NaN Email@company3.com,Email@company3.com NaN NaN NaN NaN NaN NaN
7 7 GG NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
8 8 HH NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
9 9 II NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN Email2@company3.com
10 10 JJ NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
11 11 KK NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN Email2@company4.com
12 12 LL NaN NaN NaN NaN Email@company4.com,Email@company4.com NaN NaN NaN NaN NaN NaN
13 13 MM NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
14 14 NN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
You are then left with a duplicate ID column, but you can simply delete it:
del df['ID']
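Alternatively (my suggestion, not part of the original approach), you can drop the duplicated group index in one step instead of deleting the column afterwards:

result = df.groupby('ID').apply(combine_duplicate_rows).reset_index(drop=True)
print(result)  # ID survives only as a regular column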