这是一个用于计算频率分布的描述统计量的小函数:
# from __future__ import division (for Python 2)
def descriptives_from_agg(values, freqs):
    """Compute ``describe()``-style summary statistics from a frequency table.

    Parameters
    ----------
    values : array-like
        The distinct observed values, in any order.
    freqs : array-like
        How many times each entry of ``values`` occurs. Must have the same
        shape as ``values``.

    Returns
    -------
    pandas.Series
        Indexed by ``'count'``, ``'mean'``, ``'std'``, ``'min'``, ``'25%'``,
        ``'50%'``, ``'75%'`` and ``'max'``.

        ``std`` is the sample standard deviation (divisor ``count - 1``); it
        is NaN when the total count is not greater than 1.

        The quartiles are taken without interpolation: each one is the
        smallest value whose cumulative frequency reaches the requested share
        of the total count. This can differ from an interpolating quantile.

    Raises
    ------
    ValueError
        If ``values`` and ``freqs`` differ in shape, or if they are empty.

    Notes
    -----
    NaN entries are not handled.
    """
    values = np.asarray(values)
    freqs = np.asarray(freqs)
    if values.shape != freqs.shape:
        raise ValueError("values and freqs must have the same shape")
    if values.size == 0:
        raise ValueError("values and freqs must not be empty")

    # Sort by value so the cumulative frequencies form an empirical CDF.
    order = np.argsort(values)
    values = values[order]
    freqs = freqs[order]

    count = freqs.sum()
    mean = (values * freqs).sum() / count

    # Two-pass weighted sum of squared deviations. The shortcut
    # E[x^2] - mean^2 subtracts two nearly equal large numbers and loses all
    # precision (or even goes negative) when values are large relative to
    # their spread.
    if count > 1:
        variance = (freqs * (values - mean) ** 2).sum() / (count - 1)
    else:
        # Sample variance is undefined for a single observation.
        variance = np.nan
    std = np.sqrt(variance)

    minimum = np.min(values)
    maximum = np.max(values)

    # First position where the cumulative count reaches q * count.
    cumcount = np.cumsum(freqs)
    targets = np.array([0.25, 0.50, 0.75]) * count
    q1, q2, q3 = values[np.searchsorted(cumcount, targets)]

    idx = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
    return pd.Series([count, mean, std, minimum, q1, q2, q3, maximum],
                     index=idx)
演示:
np.random.seed(0)
val = np.random.normal(100, 5, 1000).astype(int)
pd.Series(val).describe()
Out:
count    1000.000000
mean       99.274000
std         4.945845
min        84.000000
25%        96.000000
50%        99.000000
75%       103.000000
max       113.000000
dtype: float64

vc = pd.value_counts(val)
descriptives_from_agg(vc.index, vc.values)
Out:
count    1000.000000
mean       99.274000
std         4.945845
min        84.000000
25%        96.000000
50%        99.000000
75%       103.000000
max       113.000000
dtype: float64
请注意,这不能处理NaN,并且未经适当测试。



