Pandas_Python

总览

1. 单级索引

2. 多级索引

3. 索引设定

4. 常用索引型函数

5. 重复数值处理

6. 抽样函数

细节

1. 单级索引

# loc: label-based indexing
# Row selection
df.loc[1103]
df.loc[[1102,2304]]
df.loc[1304:2103].head() # loc slices are inclusive on both ends
df.loc[2402::-1].head()
# Column selection
df.loc[:,'Height'].head()
df.loc[:,['Height','Math']].head()
df.loc[:,'Height':'Math'].head()
# Rows and columns together
df.loc[1102:2401:3,'Height':'Math'].head()
# Callable indexers: the callable's return value is used as the indexer
df.loc[lambda x:x['Gender']=='M'].head()
def f(x):
    """Return a fixed list of two row labels; the argument is ignored."""
    return [1101,1103]
df.loc[f]
# Boolean indexing
df.loc[df['Address'].isin(['street_7','street_4'])].head()
# A plain list of booleans works as well; the membership test already yields
# True/False, so no conditional expression is needed
df.loc[[i[-1] in ('4','7') for i in df['Address'].values]].head()



# iloc: position-based indexing
# Row selection
df.iloc[3]
df.iloc[3:5]
# Column selection
df.iloc[:,3].head()
df.iloc[:,7::-2].head()
# Rows and columns together
df.iloc[3::4,7::-2].head()
# Callable indexer
df.iloc[lambda x:[3]].head()
# iloc only accepts integers, integer lists and boolean lists; note how
# .values below turns the boolean Series into a plain boolean array first
df.iloc[(df['School']=='S_1').values].head()




# The [] operator (mainly for column selection and the matching boolean indexing; not recommended for selecting rows)
# On a Series
s[1101]  # a Series has no columns, so a row label can be used directly
s[0:4]
s[lambda x: x.index[16::-6]]
s[s>80]
s_float[2:] # with a float index this slice compares against the float labels, not positional offsets
# On a DataFrame
# Rows
df[1:2] # row slicing here only accepts positions, not row labels; bare labels are reserved for column selection
row = df.index.get_loc(1102) # unless the label's position is looked up first, like this
df[row:row+1] # get_loc takes an index value and returns its actual position within the index
df[3:5]
# Columns
df['School'].head()
df[['School','Math']].head()
df['School':'Height'].head() # indexing like this is incorrect
df[lambda x:['Math','Physics']].head()
df[df['Gender']=='F'].head()




# Boolean indexing
# and / or / not (each comparison is parenthesised before combining)
df[(df['Gender']=='F')&(df['Address']=='street_2')].head()
df[(df['Math']>85)|(df['Address']=='street_7')].head()
df[~((df['Math']>75)|(df['Address']=='street_1'))].head()
# loc combined with boolean masks on both rows and columns
df.loc[df['Math']>60,df.columns=='Physics'].head()
# isin: per-column masks combined with &, or one dict-based isin reduced with all(1)
df[df['Address'].isin(['street_1','street_4'])&df['Physics'].isin(['A','A+'])]
df[df[['Address','Physics']].isin({'Address':['street_1','street_4'],'Physics':['A','A+']}).all(1)]
# at and iat: single-value access by label (at) and by position (iat)
df.at[1101,'School']
df.iat[0,0]




# Interval indexing
# Building intervals
pd.interval_range(start=0,end=5)
pd.interval_range(start=0,periods=8,freq=5)
# Binning values into intervals
math_interval = pd.cut(df['Math'],bins=[0,40,60,80,100])
# Join the binned column back (suffixed '_interval'), keep the two Math columns,
# and make the interval column the index
df_i = df.join(math_interval,rsuffix='_interval')[['Math','Math_interval']].reset_index().set_index('Math_interval')
# Look up by scalar value(s) against the interval index
df_i.loc[65].head()
df_i.loc[[65,90]].head()
# The original author notes this form was not fully understood. As written, it
# converts the index to interval dtype and keeps the rows whose interval
# overlaps pd.Interval(70, 85).
df_i[df_i.index.astype('interval').overlaps(pd.Interval(70, 85))].head()

2. 多级索引

# Creating a MultiIndex
# From a list of tuples: each tuple is one (Upper, Lower) label pair
tuples = [('A','a'),('A','b'),('B','a'),('B','b')]
mul_index = pd.MultiIndex.from_tuples(tuples, names=('Upper', 'Lower'))
# Build a one-column frame whose rows are labelled by the two-level index
pd.DataFrame({'Score':['perfect','good','fair','bad']},index=mul_index)

# From zip: pair up two parallel label lists into tuples
L1 = list('AABB')
L2 = list('abab')
tuples = list(zip(L1,L2))
mul_index = pd.MultiIndex.from_tuples(tuples, names=('Upper', 'Lower'))
# Build a one-column frame whose rows are labelled by the two-level index
pd.DataFrame({'Score':['perfect','good','fair','bad']},index=mul_index)

# From nested lists: each inner list is one (Upper, Lower) label pair
arrays = [['A','a'],['A','b'],['B','a'],['B','b']]
mul_index = pd.MultiIndex.from_tuples(arrays, names=('Upper', 'Lower'))
# Build a one-column frame whose rows are labelled by the two-level index
pd.DataFrame({'Score':['perfect','good','fair','bad']},index=mul_index)

# Cartesian product via from_product: every Upper label is paired with every Lower label
L1 = list('AB')
L2 = list('ab')
pd.MultiIndex.from_product(
    [L1, L2],
    names=('Upper', 'Lower'),
)

# In practice the most common case is probably indexing by existing columns
# Use columns of df as the MultiIndex
df_using_mul = df.set_index(['Class','Address'])
df_using_mul.head()
# The index columns can also be declared up front, when the file is read
pd.read_csv('data/table.csv',index_col=['Address','School']).head()



# Slicing a MultiIndex
# Basic selection
df_using_mul.sort_index().loc['C_2','street_5'] # sort_index sorts the index first; without it loc is slow and pandas emits a PerformanceWarning
# Note: per the original author, set_index here yields many groups of five rows,
# so first-level labels repeat throughout the index; after sort_index, equal
# first-level labels are brought together
df_using_mul.loc['C_2','street_5']
# MultiIndex.is_lexsorted() was deprecated and later removed from pandas;
# is_monotonic_increasing is the documented replacement
df_using_mul.index.is_monotonic_increasing
df_using_mul.sort_index().index.is_monotonic_increasing # checks whether the index is already sorted
# Without sorting, multi-level slicing is not possible at all; the original
# author notes that the next line raises an error
df_using_mul.loc[('C_2','street_5'):]
# After sorting, both levels are ordered, which is what gives a tuple-to-tuple slice its meaning
df_using_mul.sort_index().loc[('C_2','street_6'):('C_3','street_4')]
# Using the bare label 'C_3' as the end point selects every row under C_3
df_using_mul.sort_index().loc[('C_2','street_7'):'C_3'].head(20)
# Selecting specific (level 1, level 2) pairs
df_using_mul.sort_index().loc[[('C_2','street_7'),('C_3','street_2')]]
# Filtering on the first and second level at the same time
df_using_mul.sort_index().loc[(['C_2','C_3'],['street_4','street_7']),:]
# Slicing with pd.IndexSlice on a frame that has a MultiIndex on both axes
L1,L2 = ['A','B','C'],['a','b','c']
mul_index1 = pd.MultiIndex.from_product([L1,L2],names=('Upper', 'Lower'))
L3,L4 = ['D','E','F'],['d','e','f']
mul_index2 = pd.MultiIndex.from_product([L3,L4],names=('Big', 'Small'))
# 9x9 frame of random values: rows labelled by (Upper, Lower), columns by (Big, Small)
df_s = pd.DataFrame(np.random.rand(9,9),index=mul_index1,columns=mul_index2)
df_s
idx=pd.IndexSlice
# Rows: first level from 'B' onward, combined with a boolean mask on column ('D','d') > 0.3
# Columns: boolean mask keeping the columns whose sum exceeds 4
df_s.loc[idx['B':,df_s['D']['d']>0.3],idx[df_s.sum()>4]]
# Make sure you thoroughly understand how the indexing expression above works


# Reordering the levels of a MultiIndex
# swaplevel exchanges two levels (here level 1 and level 0 of the row index)
df_using_mul.swaplevel(i=1,j=0,axis=0).sort_index().head()
df_muls = df.set_index(['School','Class','Address'])
# reorder_levels takes the full new order of level names
df_muls.reorder_levels(['Address','School','Class'],axis=0).sort_index().head()

3. 索引设定

# Setting the index with reindex
# Rows: request the given row labels
df.reindex(index=[1101,1203,1206,2402])
# Columns: request the given column labels
df.reindex(columns=['Height','Gender','Average']).head()

Ref: https://github.com/yeayee/joyful-pandas 很棒的pandas教程，强烈推荐大家star这个项目，作者读过很多Pandas的英文原著。这一系列的文章前期主要是依据这个教程做一个cheat sheet, 后续会在此基础上添加更多其他书籍以及教程里面的常用函数以及代码技巧。

Pandas

Python相关栏目本月热门文章