Fill The "na" Values With Unique "na" Identifier When Doing Pandas Merge
I want to merge two pandas dataframe. df1 = A B 2 11 2 13 2 15 2 19 2 25 2 35 2 41 2 47 2 46 2 51 3 9 3 15 3 17 3 23 3 25 3 29 5 4 5 23 5 2
Solution 1:
IIUC
New = df_update[df_update.C == 'na']
s=New.reset_index().groupby('A').apply(lambda x : x['index'].diff().ne(1)).cumsum()
df_update.loc[df_update.C == 'na','C']+='_'+s.astype(str).str.pad(2,fillchar='0').values
df_update
Out[124]:
A B C
0211 abc
1213 cdd
2215 na_01
3219 na_01
4225 na_01
5235 cdd
6241 cdd
7247 cdd
8246 na_02
9251 na_02
1039 cdd
11315 cdd
12317 cdd
13323 cdd
14325 na_03
15329 na_03
1654 na_04
17523 na_04
18528 na_04
Solution 2:
Attempt 1
def labels(d):
mask = d.C.isnull().values
a = d.A.values
c = d.C.values.copy()
i = np.flatnonzero(mask)
f, u = pd.factorize([
(a_, c_) for a_, c_ in zip(a[mask], (~mask).cumsum()[mask])
])
c[i] = [f'na_{g+1:02d}' for g in f]
return c
df1.merge(df2, 'left').assign(C=labels)
A B C
0211 abc
1213 cdd
2215 na_01
3219 na_01
4225 na_01
5235 cdd
6241 cdd
7247 cdd
8246 na_02
9251 na_02
1039 cdd
11315 cdd
12317 cdd
13323 cdd
14325 na_03
15329 na_03
1654 na_04
17523 na_04
18528 na_04
Attempt 2 also Python 3.6
def labeler():
tracker = {}
return lambda k: tracker.setdefault(k, len(tracker) + 1)
def fill(d):
c_ = labeler()
return [
f'na_{c_((a, g)):02d}' if pd.isna(c) else c
for a, c, g in zip(d.A, d.C, d.C.notna().cumsum())
]
df1.merge(df2, 'left').assign(C=fill)
A B C
0211 abc
1213 cdd
2215 na_01
3219 na_01
4225 na_01
5235 cdd
6241 cdd
7247 cdd
8246 na_02
9251 na_02
1039 cdd
11315 cdd
12317 cdd
13323 cdd
14325 na_03
15329 na_03
1654 na_04
17523 na_04
18528 na_04
Attempt 3 Another alternative. Not sure what I like better.
def labeler(d):
mask = d.C.notna()
csum = mask.cumsum()
tups = list(zip(d.A, csum, d.C, ~mask))
trac = dict(map(reversed, enumerate(
pd.unique([t[:2] for t in tups if t[-1]]), 1
)))
return list(map(
lambda t: f'na_{trac.get(t[:2]):02d}' if t[:2] in trac else t[2], tups
))
df1.merge(df2, 'left').assign(C=labeler)
A B C
0211 abc
1213 na_01
2215 na_01
3219 na_01
4225 na_01
5235 cdd
6241 cdd
7247 na_02
8246 na_02
9251 na_02
1039 cdd
11315 cdd
12317 cdd
13323 na_03
14325 na_03
15329 na_03
1654 na_04
17523 na_04
18528 na_04
Solution 3:
You can merge
first both DataFrame
s by left join and then for each group A
count NaN
s, which are replaced by fillna
:
df = df1.merge(df2, how='left')
isna = df['C'].isnull()
count_nans =(isna.ne(isna.groupby(df['A']).shift()) & isna).cumsum().astype(str).str.zfill(2)
df['C'] = df['C'].fillna('na_' + count_nans)
print (df)
A B C
0 2 11 abc
1 2 13 cdd
2 2 15 na_01
3 2 19 na_01
4 2 25 na_01
5 2 35 cdd
6 2 41 cdd
7 2 47 cdd
8 2 46 na_02
9 2 51 na_02
10 3 9 cdd
11 3 15 cdd
12 3 17 cdd
13 3 23 cdd
14 3 25 na_03
15 3 29 na_03
16 5 4 na_04
17 5 23 na_04
18 5 28 na_04
Post a Comment for "Fill The "na" Values With Unique "na" Identifier When Doing Pandas Merge"