为了提高性能,可以使用 numexpr 来计算布尔掩码:
import numexpr
import numpy as np
import pandas as pd
import perfplot

# ---------------------------------------------------------------------------
# Snippet 1: one-off timing of boolean filtering on a large frame.
# ---------------------------------------------------------------------------
np.random.seed(125)
N = 40000000
# The pandas class is spelled `DataFrame`; `pd.Dataframe` raises
# AttributeError.
df = pd.DataFrame({'A': np.random.randint(10, size=N)})


def ne(df):
    """Return the rows of *df* whose column ``A`` is greater than 5.

    The boolean mask is computed by ``numexpr.evaluate`` on the column's
    underlying NumPy array and then used to index the DataFrame.
    """
    # numexpr looks up the name `x` used in the expression string in the
    # calling frame, so the local must keep this exact name.
    x = df.A.values
    return df[numexpr.evaluate('(x > 5)')]


print(ne(df))

# Timings reported with the original snippet (IPython session):
#
# In [138]: %timeit (ne(df))
# 1 loop, best of 3: 494 ms per loop
#
# In [139]: %timeit df[df.A > 5]
# 1 loop, best of 3: 536 ms per loop
#
# In [140]: %timeit df.query('A > 5')
# 1 loop, best of 3: 781 ms per loop
#
# In [141]: %timeit df[df.eval('A > 5')]
# 1 loop, best of 3: 770 ms per loop

# ---------------------------------------------------------------------------
# Snippet 2: the same four approaches benchmarked over a range of frame
# lengths with perfplot.  `ne` is intentionally redefined here so that every
# kernel takes the frame as its single argument `x`.
# ---------------------------------------------------------------------------
np.random.seed(125)


def ne(x):
    """Filter *x* to rows with ``A > 5`` using a numexpr-computed mask."""
    # Keep the DataFrame bound to `x` and give the raw array its own name.
    # Rebinding `x` to the array would make this kernel index (and return)
    # the ndarray instead of the DataFrame, unlike the other kernels.
    values = x.A.values
    return x[numexpr.evaluate('(values > 5)')]


def be(x):
    """Filter *x* to rows with ``A > 5`` using plain boolean indexing."""
    return x[x.A > 5]


def q(x):
    """Filter *x* to rows with ``A > 5`` using ``DataFrame.query``."""
    return x.query('A > 5')


def ev(x):
    """Filter *x* to rows with ``A > 5`` using a ``DataFrame.eval`` mask."""
    return x[x.eval('A > 5')]


def make_df(n):
    """Build an *n*-row DataFrame with one column ``A`` of ints in [0, 10)."""
    return pd.DataFrame(np.random.randint(10, size=n), columns=['A'])


# Run every kernel on frames of length 2**2 .. 2**24 and plot the timings on
# log-log axes.  The comparison of kernel outputs is switched off, exactly as
# in the original call.
perfplot.show(
    setup=make_df,
    kernels=[ne, be, q, ev],
    n_range=[2**k for k in range(2, 25)],
    logx=True,
    logy=True,
    equality_check=False,
    xlabel='len(df)',
)


