pandas-进阶（4）

## 第八部分数据转换

### 第一节轴和元素替换

```python
# 2、替换值
# Swap every 3 for 1024.
df.replace(to_replace=3, value=1024)
# Swap both 0 and 7 for 2048.
df.replace(to_replace=[0, 7], value=2048)
# Dictionary form: each key is replaced by its value (0 -> 512, NaN -> 998).
df.replace(to_replace={0: 512, np.nan: 998})
# Only inside column 'Python', turn the value 2 into -1024.
df.replace(to_replace={'Python': 2}, value=-1024)
```

### 第二节 map Series元素改变

```python
# 1、map批量元素改变，Series专有
# Dictionary mapping: 1 -> 'Hello', 5 -> 'World', 7 -> 'AI'.
df['Keras'].map({1: 'Hello', 5: 'World', 7: 'AI'})
# Anonymous-function mapping: True where the value is at least 5.
df['Python'].map(lambda x: x >= 5)


def convert(x):
    """Classify x by its remainder modulo 3.

    Returns True when the remainder is 0, False when it is 1, and
    None for any other remainder.
    """
    if x % 3 == 0:
        return True
    elif x % 3 == 1:
        return False
    return None


# Named-function mapping.
df['Tensorflow'].map(convert)
```

### 第三节 apply元素改变。既支持 Series，也支持 DataFrame

```python
import numpy as np
import pandas as pd

# The class is spelled DataFrame; pd.Dataframe raises AttributeError.
df = pd.DataFrame(data=np.random.randint(0, 10, size=(10, 3)),
                  index=list('ABCDEFHIJK'),
                  columns=['Python', 'Tensorflow', 'Keras'])
df.iloc[4, 2] = None  # inject one missing value (fifth row, third column)

# 1. apply: general-purpose transformation.
# On a Series the function receives one element at a time.
df['Keras'].apply(lambda x: x > 5)
# On a DataFrame the function receives a whole column (axis=0) or a
# whole row (axis=1) as a Series.
df.apply(lambda x: x.median(), axis=0)  # median of each column


def convert(x):
    """Return (mean rounded to one decimal, count) for the Series x."""
    return (x.mean().round(1), x.count())


df.apply(convert, axis=1)  # per-row mean and count

# 2. Element-wise mapping over every cell of a DataFrame.
# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1, so pick
# whichever one the installed version provides.
elementwise = df.map if hasattr(df, 'map') else df.applymap
elementwise(lambda x: x + 100)
```

### 第四节 transform变形金刚

```python
# 1、一列执行多项计算
# 1. Several functions applied to one column (Series.transform with a list).
df['Python'].transform([np.sqrt, np.exp])


def convert(x):
    """Scale x by 10 when its mean is above 5, otherwise by -10.

    A new object is returned instead of multiplying in place, so the
    data passed in is left untouched.
    """
    if x.mean() > 5:
        return x * 10
    return x * -10


# 2. A different function per column, selected by column name.
# NOTE(review): np.max / np.min are passed to transform here, not to an
# aggregation -- confirm this is the intended demonstration.
df.transform({'Python': convert, 'Tensorflow': np.max, 'Keras': np.min})
```

### 第五节重排随机抽样哑变量

```python
import numpy as np
import pandas as pd

# The class is spelled DataFrame; pd.Dataframe raises AttributeError.
df = pd.DataFrame(data=np.random.randint(0, 10, size=(10, 3)),
                  index=list('ABCDEFHIJK'),
                  columns=['Python', 'Tensorflow', 'Keras'])

ran = np.random.permutation(10)  # the positions 0..9 in random order
df.take(ran)  # rows in shuffled order
df.take(np.random.randint(0, 10, size=15))  # 15 random positions, repeats allowed

# Dummy variables (one-hot encoding): one column per distinct value,
# 1 where the value occurs and 0 where it does not.
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b']})
# dtype=int keeps the 1/0 output described above; newer pandas versions
# return booleans by default.
pd.get_dummies(df, prefix='', prefix_sep='', dtype=int)
```

## 第九部分数据重塑

```python
import numpy as np
import pandas as pd

# The class is spelled DataFrame; pd.Dataframe raises AttributeError.
df = pd.DataFrame(data=np.random.randint(0, 100, size=(10, 3)),
                  index=list('ABCDEFHIJK'),
                  columns=['Python', 'Tensorflow', 'Keras'])
df.T  # transpose

# Two-level row index: student letter x exam ('期中' / '期末').
df2 = pd.DataFrame(data=np.random.randint(0, 100, size=(20, 3)),
                   index=pd.MultiIndex.from_product([list('ABCDEFHIJK'),
                                                     ['期中', '期末']]),
                   columns=['Python', 'Tensorflow', 'Keras'])
df2.unstack(level=-1)  # move an index level into the columns; `level` picks which
df2.stack()  # move the columns into the row index
df2.stack().unstack(level=1)  # subjects become rows, the exam level becomes columns

# Arithmetic on a multi-level index.
df2.mean()  # mean per subject
# mean(level=...) was removed in pandas 2.0; group by the index level instead.
df2.groupby(level=0).mean()  # per subject and student: mean over both exams
df2.groupby(level=1).mean()  # per subject and exam: mean over all students
```

## 第十部分数学和统计方法

pandas对象拥有一组常用的数学和统计方法。它们属于汇总统计：对Series汇总计算可获取mean、max等单个值，对DataFrame按行或按列汇总计算则返回一个Series。

### 第一节简单统计指标

```python
import numpy as np
import pandas as pd

# The class is spelled DataFrame; pd.Dataframe raises AttributeError.
df = pd.DataFrame(data=np.random.randint(0, 100, size=(20, 3)),
                  index=list('ABCDEFHIJKLMNOPQRSTU'),
                  columns=['Python', 'Tensorflow', 'Keras'])
# 1. Basic summary statistics
df.count()  # number of non-NA values
df.max(axis=0)  # maximum along axis 0, i.e. one value per column
df.min()  # minimum, axis 0 by default
df.median()  # median
df.sum()  # sum
df.mean(axis=1)  # mean along axis 1, i.e. one value per row
df.quantile(q=[0.2, 0.4, 0.8])  # quantiles
df.describe()  # count, mean, std, min, quartiles and max of the numeric columns
```

### 第二节索引标签、位置获取

```Python
# 2. Locating extreme values
# Integer position of the smallest 'Python' value.
df.Python.argmin()
# Integer position of the largest 'Keras' value.
df.Keras.argmax()
# Index label of the largest value in each column.
df.idxmax()
# Index label of the smallest value in each column.
df.idxmin()
```

### 第三节更多统计指标

```Python
# 3. Further statistics
# How often each value occurs in 'Python'.
df.Python.value_counts()
# Distinct values of 'Keras'.
df.Keras.unique()
# Running sum and running product.
df.cumsum()
df.cumprod()
# Standard deviation and variance.
df.std()
df.var()
# Running minimum and running maximum.
df.cummin()
df.cummax()
# Difference between consecutive rows.
df.diff()
# Relative change between consecutive rows.
df.pct_change()
```

### 第四节高级统计指标

```python
# 4. Advanced statistics
# Covariance between every pair of columns.
df.cov()
# Covariance between 'Python' and 'Keras' only.
df.Python.cov(df.Keras)
# Correlation coefficient between every pair of columns.
df.corr()
# Correlation of each column with 'Tensorflow'.
df.corrwith(df.Tensorflow)
```

协方差：$Cov(X,Y) = \frac{\sum\limits_{i=1}^{n}(X_i - \overline{X})(Y_i - \overline{Y})}{n-1}$

相关性系数：$r(X,Y) = \frac{Cov(X,Y)}{\sqrt{Var[X]Var[Y]}}$

## 第十一部分数据排序

```python
import numpy as np
import pandas as pd

# The class is spelled DataFrame; pd.Dataframe raises AttributeError.
df = pd.DataFrame(data=np.random.randint(0, 30, size=(30, 3)),
                  index=list('qwertyuioijhgfcasdcvbnerfghjcf'),
                  columns=['Python', 'Keras', 'Pytorch'])
# 1. Sorting by index / column names
df.sort_index(axis=0, ascending=True)  # by row index, ascending
df.sort_index(axis=1, ascending=False)  # by column name, descending
# 2. Sorting by values
df.sort_values(by=['Python'])  # by 'Python'
df.sort_values(by=['Python', 'Keras'])  # by 'Python' first, then by 'Keras'

# 3. Rows holding the n largest / n smallest values of a column
df.nlargest(10, columns='Keras')  # the 10 rows with the largest 'Keras'
df.nsmallest(5, columns='Python')  # the 5 rows with the smallest 'Python'
```

## 第十二部分分箱操作

分箱操作就是将连续数据转换为分类对应物的过程。比如将连续的身高数据划分为：矮中高。

分箱操作分为等距分箱和等频分箱。

分箱操作也叫面元划分或者离散化。

```python
import numpy as np
import pandas as pd

# The class is spelled DataFrame; pd.Dataframe raises AttributeError.
df = pd.DataFrame(data=np.random.randint(0, 150, size=(100, 3)),
                  columns=['Python', 'Tensorflow', 'Keras'])
# 1. Equal-width binning
pd.cut(df.Python, bins=3)
# Binning with explicit edges
pd.cut(df.Keras,  # data to bin
       bins=[0, 60, 90, 120, 150],  # bin edges
       right=False,  # False: [left, right); True: (left, right]
       labels=['不及格', '中等', '良好', '优秀'])  # category label of each bin

# 2. Equal-frequency binning
pd.qcut(df.Python, q=4,  # four quantile-based bins
        labels=['差', '中', '良', '优'])  # category label of each bin
```

## 第十三部分分组聚合

![](./images/groupby.png)

### 第一节分组

```python
import numpy as np
import pandas as pd

# Sample data: 300 rows with sex, class and five subject scores.
# The class is spelled DataFrame; pd.Dataframe raises AttributeError.
df = pd.DataFrame(data={'sex': np.random.randint(0, 2, size=300),  # 0 male, 1 female
                        'class': np.random.randint(1, 9, size=300),  # classes 1..8
                        'Python': np.random.randint(0, 151, size=300),  # Python score
                        'Keras': np.random.randint(0, 151, size=300),  # Keras score
                        'Tensorflow': np.random.randint(0, 151, size=300),
                        'Java': np.random.randint(0, 151, size=300),
                        'C++': np.random.randint(0, 151, size=300)})
df['sex'] = df['sex'].map({0: '男', 1: '女'})  # turn the 0/1 codes into labels

# 1. Grouping yields an iterable of (group name, group data) pairs
# 1.1 Group first, then select columns
g = df.groupby(by='sex')[['Python', 'Java']]  # one grouping key
for name, data in g:
    print('组名：', name)
    print('数据：', data)
df.groupby(by=['class', 'sex'])[['Python']]  # several grouping keys
# 1.2 Group a single column by other columns
df['Python'].groupby(df['class'])  # one grouping key
df['Keras'].groupby([df['class'], df['sex']])  # several grouping keys
# 1.3 Group the columns by dtype
# groupby(axis=1) is deprecated in recent pandas; grouping the rows of the
# transposed frame groups the original columns.
df.T.groupby(df.dtypes)
# 1.4 Group the columns through a dict of column name -> group label
m = {'sex': 'category', 'class': 'category', 'Python': 'IT', 'Keras': 'IT',
     'Tensorflow': 'IT', 'Java': 'IT', 'C++': 'IT'}
for name, data in df.T.groupby(m):
    print('组名', name)
    print('数据', data.T)  # transpose back so the group's columns are columns again
```

### 第二节分组聚合

```python
# 2. Calling an aggregation directly on the grouped object
# Mean of every other column per sex, rounded to one decimal.
df.groupby('sex').mean().round(1)
# Maximum of 'Python' and 'Keras' per (class, sex) pair.
df.groupby(['class', 'sex'])[['Python', 'Keras']].max()
# Group sizes: how many rows fall into each (class, sex) pair.
df.groupby(['class', 'sex']).size()
# Basic descriptive statistics per (class, sex) pair.
df.groupby(['class', 'sex']).describe()
```

### 第三节分组聚合apply、transform

![](./images/apply.png)

![](./images/transform.png)

```python
# 3. apply / transform with a single function after grouping
# apply returns one result per group. The lambda calls the group's own
# mean() so the result stays per column; np.mean(group) reduces a whole
# DataFrame to a single number on pandas >= 2.0.
df.groupby(by=['class', 'sex'])[['Python', 'Keras']].apply(lambda group: group.mean()).round(1)


def normalization(x):
    """Min-max scale x: (x - min) / (max - min).

    When every value of x is equal, max - min is 0 and the division
    does not produce a usable scaled value.
    """
    return (x - x.min()) / (x.max() - x.min())


# transform returns the full data: the result has the same shape as the
# selected columns of the original DataFrame.
df.groupby(by=['class', 'sex'])[['Python', 'Tensorflow']].transform(normalization).round(3)
```

### 第四节分组聚合agg

![](./images/agg.png)

```python
# 4. agg: several summary statistics at once
# String names are used for the aggregations; passing np.max / np.min /
# np.median to agg triggers deprecation warnings on recent pandas.
# Several statistics applied to every selected column.
df.groupby(by=['class', 'sex'])[['Tensorflow', 'Keras']].agg(['max', 'min', 'count'])
# Different statistics per column; each (label, function) pair names the
# resulting column.
df.groupby(by=['class', 'sex'])[['Python', 'Keras']].agg({'Python': [('最大值', 'max'), ('最小值', 'min')],
                                                            'Keras': [('计数', 'count'), ('中位数', 'median')]})
```

### 第五节透视表pivot_table

```python
# 5、透视表
# 透视表也是一种分组聚合运算
def count(x):
    """Return the number of entries in x (custom aggregation for the pivot table)."""
    return len(x)


# String names replace np.max / np.min / np.median / np.mean, which trigger
# deprecation warnings when passed as aggregators on recent pandas.
df.pivot_table(values=['Python', 'Keras', 'Tensorflow'],  # columns to aggregate
               index=['class', 'sex'],  # grouping keys of the pivot
               aggfunc={'Python': [('最大值', 'max')],  # aggregations per column
                        'Keras': [('最小值', 'min'), ('中位数', 'median')],
                        'Tensorflow': [('最小值', 'min'), ('平均值', 'mean'), ('计数', count)]})
```

## 第十四部分时间序列

### 第一节时间戳操作

```python
# 1. Construction
pd.Timestamp('2020-8-24 12')  # a point in time
pd.Period('2020-8-24', freq='M')  # a span of time (monthly frequency)
# An explicit MonthEnd offset replaces the 'M' alias, which is deprecated
# for timestamps in recent pandas.
index = pd.date_range('2020.08.24', periods=5, freq=pd.offsets.MonthEnd())  # several timestamps
pd.period_range('2020.08.24', periods=5, freq='M')  # several periods
ts = pd.Series(np.random.randint(0, 10, size=5), index=index)  # Series indexed by timestamps

# 2. Conversion
# The strings use different layouts. pandas 2.x expects one format per
# to_datetime call on a list, so each string is parsed on its own.
parsed = pd.DatetimeIndex([pd.to_datetime(text) for text in
                           ['2020.08.24', '2020-08-24', '24/08/2020', '2020/8/24']])
pd.to_datetime([1598582232], unit='s')  # epoch seconds
dt = pd.to_datetime([1598582420401], unit='ms')  # epoch milliseconds, read as UTC
dt + pd.DateOffset(hours=8)  # the same moment as UTC+8 wall-clock time
dt + pd.DateOffset(days=100)  # the date 100 days later
```

### 第二节时间戳索引

```python
index = pd.date_range("2020-8-24", periods=200, freq="D")
ts = pd.Series(range(len(index)), index=index)
# Lookup with strings
ts['2020-08-30']  # one date
ts['2020-08-24':'2020-09-3']  # slice between two dates
ts['2020-08']  # year and month only
ts['2020']  # year only
# Lookup with Timestamp objects
ts[pd.Timestamp('2020-08-30')]
ts[pd.Timestamp('2020-08-24'):pd.Timestamp('2020-08-30')]  # slice
ts[pd.date_range('2020-08-24', periods=10, freq='D')]

# Attributes of a timestamp index
ts.index.year  # year
ts.index.dayofweek  # day of the week
# Week number within the year. DatetimeIndex.weekofyear was removed in
# pandas 2.0; isocalendar().week is its replacement.
ts.index.isocalendar().week
```

### 第三节时间序列常用方法

在做时间序列相关的工作时，经常要对时间做一些移动/滞后、频率转换、采样等相关操作，我们来看下这些操作如何使用

```python
# ISO date strings are used throughout: '8/1/2020' and '1/8/2020' are read
# month-first, which is easy to misread next to the day-first '24/08/2020'.
index = pd.date_range('2020-08-01', periods=365, freq='D')
ts = pd.Series(np.random.randint(0, 500, len(index)), index=index)

# 1. Shifting
ts.shift(periods=2)  # move the values two rows later
ts.shift(periods=-2)  # move the values two rows earlier

# Shifting the dates instead of the values
ts.shift(periods=2, freq=pd.offsets.Day())  # by days
# Series.tshift was removed in pandas 2.0 and offsets.MonthOffset no longer
# exists; shift(freq=...) with a one-month DateOffset does the same job.
ts.shift(periods=1, freq=pd.DateOffset(months=1))  # by months

# 2. Frequency conversion
ts.asfreq(pd.offsets.Week())  # daily -> weekly
ts.asfreq(pd.offsets.MonthEnd())  # daily -> monthly
ts.asfreq(pd.offsets.Hour(), fill_value=0)  # daily -> hourly; new slots get fill_value

# 3. Resampling
# resample aggregates along the date dimension: minutes, hours, business
# days, weeks, months, years, ...
ts.resample('2W').sum()  # totals per two weeks
# Totals per three-month bin, then a running total. The offset object
# replaces the '3M' alias, which is deprecated in recent pandas.
ts.resample(pd.offsets.MonthEnd(3)).sum().cumsum()

# 4. Resampling a DataFrame
d = {'price': [10, 11, 9, 13, 14, 18, 17, 19],
     'volume': [50, 60, 40, 100, 50, 100, 40, 50],
     'week_starting': pd.date_range('2020-08-24', periods=8, freq='W')}
df1 = pd.DataFrame(d)  # the class is spelled DataFrame, not Dataframe
df1.resample(pd.offsets.MonthEnd(), on='week_starting').sum()
df1.resample(pd.offsets.MonthEnd(), on='week_starting').agg({'price': 'mean', 'volume': 'sum'})

days = pd.date_range('2020-01-08', periods=4, freq='D')
data2 = {'price': [10, 11, 9, 13, 14, 18, 17, 19],
         'volume': [50, 60, 40, 100, 50, 100, 40, 50]}
df2 = pd.DataFrame(data2,
                   index=pd.MultiIndex.from_product([days, ['morning', 'afternoon']]))
df2.resample('D', level=0).sum()  # daily totals over the first index level
```

### 第四节时区表示

```python
import pytz

# Names of the commonly used time zones known to pytz.
pytz.common_timezones

# Five daily timestamps without time-zone information.
index = pd.date_range('8/1/2012 00:00', periods=5, freq='D')
ts = pd.Series(np.random.randn(len(index)), index)
# Attach a time zone to the index.
ts = ts.tz_localize(tz='UTC')
# Convert to another time zone.
ts.tz_convert(tz='Asia/Shanghai')
```

## 第十五部分数据可视化

```bash
pip install matplotlib -i https://pypi.tuna.tsinghua.edu.cn/simple
```

```python
import numpy as np
import pandas as pd

# The class is spelled DataFrame; pd.Dataframe raises AttributeError.

# 1. Line chart
df1 = pd.DataFrame(data=np.random.randn(1000, 4),
                   index=pd.date_range(start='2012-06-27', periods=1000),  # ISO form of 27/6/2012
                   columns=list('ABCD'))
df1.cumsum().plot()

# 2. Bar chart
df2 = pd.DataFrame(data=np.random.rand(10, 4),
                   columns=list('ABCD'))
df2.plot.bar(stacked=True)  # stacked: whether the bars are stacked

# 3. Pie chart
df3 = pd.DataFrame(data=np.random.rand(4, 2),
                   index=list('ABCD'),
                   columns=['One', 'Two'])
df3.plot.pie(subplots=True, figsize=(8, 8))

# 4. Scatter plot
df4 = pd.DataFrame(np.random.rand(50, 4), columns=list('ABCD'))
df4.plot.scatter(x='A', y='B')  # A against B
# A-C and B-D scatter plots drawn on the same axes
ax = df4.plot.scatter(x='A', y='C', color='DarkBlue', label='Group 1');
df4.plot.scatter(x='B', y='D', color='DarkGreen', label='Group 2', ax=ax)
# Bubble chart: marker size taken from column C
df4.plot.scatter(x='A', y='B', s=df4['C'] * 200)

# 5. Area chart
df5 = pd.DataFrame(data=np.random.rand(10, 4),
                   columns=list('ABCD'))
df5.plot.area(stacked=True);  # stacked: whether the areas are stacked

# 6. Box plot
df6 = pd.DataFrame(data=np.random.rand(10, 5),
                   columns=list('ABCDE'))
df6.plot.box()

# 7. Histogram
df7 = pd.DataFrame({'A': np.random.randn(1000) + 1, 'B': np.random.randn(1000),
                    'C': np.random.randn(1000) - 1})
df7.plot.hist(alpha=0.5)  # semi-transparent histograms
df7.plot.hist(stacked=True)  # stacked histogram
df7.hist(figsize=(8, 8))  # one subplot per column
```

pandas-进阶（4）

Python相关栏目本月热门文章