再快一点
%%cython# using cython in jupyter notebook# in another cell run `%load_ext Cython`from collections import defaultdictimport numpy as npdef cg(x): cnt = defaultdict(lambda: 0) for j in x.tolist(): cnt[j] += 1 yield cnt[j]def fastcount(x): return [i for i in cg(x)]df1['cc'] = fastcount(df1.key.values)df2['cc'] = fastcount(df2.key.values)df1.merge(df2, how='outer').drop('cc', 1)更快的答案; 不可扩展
def fastcount(x):
    """Return the 1-based running occurrence count of each element of ``x``.

    Builds a boolean matrix of shape (number of unique values, len(x)), so
    memory use grows with the product of the two; this is why the approach
    does not scale. Assumes ``x`` is one-dimensional.
    """
    unq, inv = np.unique(x, return_inverse=True)
    # Row i marks the positions at which x equals the i-th unique value.
    mask = np.arange(len(unq))[:, None] == inv
    # Running count along each row, kept only at the marked positions; each
    # column has exactly one marked row, so the column sum picks that count.
    return (mask.cumsum(1) * mask).sum(0)


# 'cc' numbers the repeated keys inside each frame; it exists in both frames,
# so it takes part in the merge and is dropped from the result afterwards.
df1['cc'] = fastcount(df1.key.values)
df2['cc'] = fastcount(df2.key.values)
# Pass ``axis`` by keyword: the positional form ``drop('cc', 1)`` is rejected
# by pandas >= 2.0.
df1.merge(df2, how='outer').drop('cc', axis=1)

# Old answer
# Number the repeated keys inside each frame (cumcount is the 0-based
# position of the row within its 'key' group).
df1['cc'] = df1.groupby('key').cumcount()
df2['cc'] = df2.groupby('key').cumcount()
# 'cc' exists in both frames, so it takes part in the merge; drop it from the
# result. Pass ``axis`` by keyword: the positional form ``drop('cc', 1)`` is
# rejected by pandas >= 2.0.
df1.merge(df2, how='outer').drop('cc', axis=1)


