层次化索引
import numpy as np
import pandas as pd
data = pd.Series(np.random.randn(9),
index=[['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],
[1, 2, 3, 1, 3, 1, 2, 2, 3]])
data
a 1 1.222970
2 -0.235655
3 1.304342
b 1 1.415799
3 0.119285
c 1 2.678416
2 0.382792
d 2 -0.291932
3 -0.687770
dtype: float64
data['a']
1 1.222970
2 -0.235655
3 1.304342
dtype: float64
data['b':'c']
b 1 1.415799
3 0.119285
c 1 2.678416
2 0.382792
dtype: float64
data.loc[['b','d']]
b 1 1.415799
3 0.119285
d 2 -0.291932
3 -0.687770
dtype: float64
data.loc[:,2]
a -0.235655
c 0.382792
d -0.291932
dtype: float64
frame = pd.Dataframe({'a': range(7), 'b': range(7, 0, -1),
'c': ['one', 'one', 'one', 'two', 'two','two', 'two'],
'd': [0, 1, 2, 0, 1, 2, 3]})
frame
| a | b | c | d |
|---|
| 0 | 0 | 7 | one | 0 |
|---|
| 1 | 1 | 6 | one | 1 |
|---|
| 2 | 2 | 5 | one | 2 |
|---|
| 3 | 3 | 4 | two | 0 |
|---|
| 4 | 4 | 3 | two | 1 |
|---|
| 5 | 5 | 2 | two | 2 |
|---|
| 6 | 6 | 1 | two | 3 |
|---|
frame2 = frame.set_index(['c','d'])
frame2
| | a | b |
|---|
| c | d | | |
|---|
| one | 0 | 0 | 7 |
|---|
| 1 | 1 | 6 |
|---|
| 2 | 2 | 5 |
|---|
| two | 0 | 3 | 4 |
|---|
| 1 | 4 | 3 |
|---|
| 2 | 5 | 2 |
|---|
| 3 | 6 | 1 |
|---|
frame.set_index(['c','d'],drop=False)
| | a | b | c | d |
|---|
| c | d | | | | |
|---|
| one | 0 | 0 | 7 | one | 0 |
|---|
| 1 | 1 | 6 | one | 1 |
|---|
| 2 | 2 | 5 | one | 2 |
|---|
| two | 0 | 3 | 4 | two | 0 |
|---|
| 1 | 4 | 3 | two | 1 |
|---|
| 2 | 5 | 2 | two | 2 |
|---|
| 3 | 6 | 1 | two | 3 |
|---|
frame2.reset_index()
| c | d | a | b |
|---|
| 0 | one | 0 | 0 | 7 |
|---|
| 1 | one | 1 | 1 | 6 |
|---|
| 2 | one | 2 | 2 | 5 |
|---|
| 3 | two | 0 | 3 | 4 |
|---|
| 4 | two | 1 | 4 | 3 |
|---|
| 5 | two | 2 | 5 | 2 |
|---|
| 6 | two | 3 | 6 | 1 |
|---|
数据合并
left = pd.Dataframe({'key': ['K0', 'K1', 'K2', 'K3'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.Dataframe({'key': ['K0', 'K1', 'K2', 'K3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
left
| A | B | key |
|---|
| 0 | A0 | B0 | K0 |
|---|
| 1 | A1 | B1 | K1 |
|---|
| 2 | A2 | B2 | K2 |
|---|
| 3 | A3 | B3 | K3 |
|---|
right
| C | D | key |
|---|
| 0 | C0 | D0 | K0 |
|---|
| 1 | C1 | D1 | K1 |
|---|
| 2 | C2 | D2 | K2 |
|---|
| 3 | C3 | D3 | K3 |
|---|
pd.merge(left,right)
| A | B | key | C | D |
|---|
| 0 | A0 | B0 | K0 | C0 | D0 |
|---|
| 1 | A1 | B1 | K1 | C1 | D1 |
|---|
| 2 | A2 | B2 | K2 | C2 | D2 |
|---|
| 3 | A3 | B3 | K3 | C3 | D3 |
|---|
pd.merge(left,right,on='key')
| A | B | key | C | D |
|---|
| 0 | A0 | B0 | K0 | C0 | D0 |
|---|
| 1 | A1 | B1 | K1 | C1 | D1 |
|---|
| 2 | A2 | B2 | K2 | C2 | D2 |
|---|
| 3 | A3 | B3 | K3 | C3 | D3 |
|---|
#处理重复列名
df_obj1 = pd.Dataframe({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
'data' : np.random.randint(0,10,7)})
df_obj2 = pd.Dataframe({'key': ['a', 'b', 'd'],
'data' : np.random.randint(0,10,3)})
df_obj1
| data | key |
|---|
| 0 | 1 | b |
|---|
| 1 | 7 | b |
|---|
| 2 | 1 | a |
|---|
| 3 | 2 | c |
|---|
| 4 | 8 | a |
|---|
| 5 | 5 | a |
|---|
| 6 | 0 | b |
|---|
df_obj2
print(pd.merge(df_obj1, df_obj2, on='key', suffixes=('_left', '_right')))
data_left key data_right
0 1 b 4
1 7 b 4
2 0 b 4
3 1 a 0
4 8 a 0
5 5 a 0
#按索引连接
df_obj1 = pd.Dataframe({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
'data1' : np.random.randint(0,10,7)})
df_obj2 = pd.Dataframe({'data2' : np.random.randint(0,10,3)}, index=['a', 'b', 'd'])
df_obj1
| data1 | key |
|---|
| 0 | 0 | b |
|---|
| 1 | 7 | b |
|---|
| 2 | 7 | a |
|---|
| 3 | 2 | c |
|---|
| 4 | 9 | a |
|---|
| 5 | 7 | a |
|---|
| 6 | 5 | b |
|---|
df_obj2
print(pd.merge(df_obj1, df_obj2, left_on='key', right_index=True))
data1 key data2
0 0 b 8
1 7 b 8
6 5 b 8
2 7 a 1
4 9 a 1
5 7 a 1
print(pd.merge(df_obj1, df_obj2, left_on='key', right_index=True))
data1 key data2
0 0 b 8
1 7 b 8
6 5 b 8
2 7 a 1
4 9 a 1
5 7 a 1
join
left2 = pd.Dataframe([[1., 2.], [3., 4.], [5., 6.]],
index=['a', 'c', 'e'],
columns=['语文', '数学'])
right2 = pd.Dataframe([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
index=['b', 'c', 'd', 'e'],
columns=['英语', '综合'])
left2
| 语文 | 数学 |
|---|
| a | 1.0 | 2.0 |
|---|
| c | 3.0 | 4.0 |
|---|
| e | 5.0 | 6.0 |
|---|
right2
| 英语 | 综合 |
|---|
| b | 7.0 | 8.0 |
|---|
| c | 9.0 | 10.0 |
|---|
| d | 11.0 | 12.0 |
|---|
| e | 13.0 | 14.0 |
|---|
pd.merge(left2,right2,how='outer',left_index=True,right_index=True)
| 语文 | 数学 | 英语 | 综合 |
|---|
| a | 1.0 | 2.0 | NaN | NaN |
|---|
| b | NaN | NaN | 7.0 | 8.0 |
|---|
| c | 3.0 | 4.0 | 9.0 | 10.0 |
|---|
| d | NaN | NaN | 11.0 | 12.0 |
|---|
| e | 5.0 | 6.0 | 13.0 | 14.0 |
|---|
# 按照索引合并 但是要求没有重叠的列
left2.join(right2,how='outer')
| 语文 | 数学 | 英语 | 综合 |
|---|
| a | 1.0 | 2.0 | NaN | NaN |
|---|
| b | NaN | NaN | 7.0 | 8.0 |
|---|
| c | 3.0 | 4.0 | 9.0 | 10.0 |
|---|
| d | NaN | NaN | 11.0 | 12.0 |
|---|
| e | 5.0 | 6.0 | 13.0 | 14.0 |
|---|
pd.concat
df1 = pd.Dataframe(np.arange(6).reshape(3,2),index=list('abc'),columns=['one','two'])
df2 = pd.Dataframe(np.arange(4).reshape(2,2)+5,index=list('ac'),columns=['one','two'])
df1
df2
pd.concat([df1,df2]) #默认外连接
pd.concat([df1,df2],axis=1)
| one | two | one | two |
|---|
| a | 0 | 1 | 5.0 | 6.0 |
|---|
| b | 2 | 3 | NaN | NaN |
|---|
| c | 4 | 5 | 7.0 | 8.0 |
|---|
重塑和轴向旋转
重塑层次化索引
data = pd.Dataframe(np.arange(6).reshape((2, 3)),
index=pd.Index(['老王', '小刘'], name='姓名'),
columns=pd.Index(['语文', '数学', '英语'],name='科目'))
data
r = data.stack()
r
姓名 科目
老王 语文 0
数学 1
英语 2
小刘 语文 3
数学 4
英语 5
dtype: int32
type(r)
pandas.core.series.Series
r.unstack()
r.unstack(0)
r.unstack('姓名')
a1 = pd.Series(np.arange(4),index=list('abcd'))
a1
a 0
b 1
c 2
d 3
dtype: int32
a2 = pd.Series([4,5,6],index=list('cde'))
a2
c 4
d 5
e 6
dtype: int64
s1 = pd.concat([a1,a2],keys=['data1','data2'])
s1
data1 a 0
b 1
c 2
d 3
data2 c 4
d 5
e 6
dtype: int64
type(s1)
pandas.core.series.Series
s1.unstack()
| a | b | c | d | e |
|---|
| data1 | 0.0 | 1.0 | 2.0 | 3.0 | NaN |
|---|
| data2 | NaN | NaN | 4.0 | 5.0 | 6.0 |
|---|
# stack()默认过滤缺失数据
s1.unstack().stack()
data1 a 0.0
b 1.0
c 2.0
d 3.0
data2 c 4.0
d 5.0
e 6.0
dtype: float64
s1.unstack().stack(dropna=False)
data1 a 0.0
b 1.0
c 2.0
d 3.0
e NaN
data2 a NaN
b NaN
c 4.0
d 5.0
e 6.0
dtype: float64
轴向旋转
df3 = pd.Dataframe({'date':['2018-11-22','2018-11-22','2018-11-23','2018-11-23','2018-11-24'],
'class':['a','b','b','c','c'],
'values':[5,3,2,6,1]},columns=['date','class','values'])
df3
| date | class | values |
|---|
| 0 | 2018-11-22 | a | 5 |
|---|
| 1 | 2018-11-22 | b | 3 |
|---|
| 2 | 2018-11-23 | b | 2 |
|---|
| 3 | 2018-11-23 | c | 6 |
|---|
| 4 | 2018-11-24 | c | 1 |
|---|
df3.pivot('date','class','values')
| class | a | b | c |
|---|
| date | | | |
|---|
| 2018-11-22 | 5.0 | 3.0 | NaN |
|---|
| 2018-11-23 | NaN | 2.0 | 6.0 |
|---|
| 2018-11-24 | NaN | NaN | 1.0 |
|---|
df3.pivot('date','class')
| values |
|---|
| class | a | b | c |
|---|
| date | | | |
|---|
| 2018-11-22 | 5.0 | 3.0 | NaN |
|---|
| 2018-11-23 | NaN | 2.0 | 6.0 |
|---|
| 2018-11-24 | NaN | NaN | 1.0 |
|---|
df3.set_index(['date','class']).unstack('class')
| values |
|---|
| class | a | b | c |
|---|
| date | | | |
|---|
| 2018-11-22 | 5.0 | 3.0 | NaN |
|---|
| 2018-11-23 | NaN | 2.0 | 6.0 |
|---|
| 2018-11-24 | NaN | NaN | 1.0 |
|---|