数据处理分析模块 Pandas（3）

1 NumPy索引切片玫瑰花操作

import numpy as np
import matplotlib.pyplot as plt

img = plt.imread('./rose.jpg')
img.shape # (height, width, colour channels)
# 615 pixels high
# 650 pixels wide
# 3 colour channels: red, green, blue
img

输出：

array([[[246, 246, 246],
        [246, 246, 246],
        [246, 246, 246],
        ...,
        [246, 246, 246],
        [246, 246, 246],
        [246, 246, 246]],

       [[246, 246, 246],
        [246, 246, 246],
        [246, 246, 246],
        ...,
        [246, 246, 246],
        [246, 246, 246],
        [246, 246, 246]],

       [[246, 246, 246],
        [246, 246, 246],
        [246, 246, 246],
        ...,
        [246, 246, 246],
        [246, 246, 246],
        [246, 246, 246]],

       ...,

       [[246, 246, 246],
        [246, 246, 246],
        [246, 246, 246],
        ...,
        [246, 246, 246],
        [246, 246, 246],
        [246, 246, 246]],

       [[246, 246, 246],
        [246, 246, 246],
        [246, 246, 246],
        ...,
        [246, 246, 246],
        [246, 246, 246],
        [246, 246, 246]],

       [[246, 246, 246],
        [246, 246, 246],
        [246, 246, 246],
        ...,
        [246, 246, 246],
        [246, 246, 246],
        [246, 246, 246]]], dtype=uint8)

img.ndim

输出：3

plt.imshow(img)

# The photo is a red rose.
# A blue rose ("Blue Enchantress") is the expensive variety.
# Channel order red, green, blue ---> blue, green, red
plt.imshow(img[:,:,::-1]) # the first two ':' keep every row and every column; '::-1' reverses the channel axis

# Channel order red, green, blue ---> green, red, blue
# Fancy (integer-list) indexing on the last axis
plt.imshow(img[:,:,[1,0,2]]) # the first two ':' keep every row and every column; [1,0,2] picks the channels in the new order

2 pandas

2.1 数据结构

* Python在数据处理和准备方面一直做得很好，但在数据分析和建模方面就差一些。pandas帮助填补了这一空白，使您能够在Python中执行整个数据分析工作流程，而不必切换到更特定于领域的语言，如R。
* 与出色的 jupyter工具包和其他库相结合，Python中用于进行数据分析的环境在性能、生产率和协作能力方面都是卓越的。
* pandas是 Python 的核心数据分析支持库，提供了快速、灵活、明确的数据结构，旨在简单、直观地处理关系型、标记型数据。pandas是Python进行数据分析的必备高级工具。
* pandas的主要数据结构是 **Series**（一维数据）与 **DataFrame**（二维数据），这两种数据结构足以处理金融、统计、社会科学、工程等领域里的大多数案例。
* 处理数据一般分为几个阶段：数据整理与清洗、数据分析与建模、数据可视化与制表，Pandas 是处理数据的理想工具。
* pip install pandas -i https://pypi.tuna.tsinghua.edu.cn/simple

一维结构Series

import pandas as pd

# A Series built with an explicit label for every value.
s = pd.Series([0, 3, 5, 7], index=list('abcd'))

# Unlike a NumPy array, which only has the natural positions 0..n-1,
# a Series pairs every value with an index label.
# Without an explicit index the default labels 0..n-1 are used.
s = pd.Series([0, 3, 5, 7])

输出：

0    0
1    3
2    5
3    7
dtype: int64

二维结构DataFrame

import pandas as pd

import numpy as np

# Similar to an Excel sheet: labelled rows and labelled columns.
# The class is spelled ``DataFrame``; ``pd.Dataframe`` raises AttributeError.
pd.DataFrame(data = np.random.randint(0,150,size = (5,3)),
             columns=['Python','En','Math'],
             index = list('ABCDE'),dtype=np.float32)

输出：

Python En Math
A 140.0 129.0 49.0
B 66.0 120.0 134.0
C 19.0 47.0 35.0
D 84.0 103.0 32.0
E 103.0 9.0 91.0

	Python	En	Math
A	140.0	129.0	49.0
B	66.0	120.0	134.0
C	19.0	47.0	35.0
D	84.0	103.0	32.0
E	103.0	9.0	91.0

list('ABCDE')

输出：['A', 'B', 'C', 'D', 'E']

import pandas as pd

import numpy as np

# Similar to an Excel sheet.
# Second way to build a DataFrame: the dictionary keys become the column labels.
df = pd.DataFrame(data = {'Python':np.random.randint(100,150,size = 5),
                     'En':np.random.randint(0,150,size = 5),
                     'Math':np.random.randint(0,150,size = 5)},index = list('ABCDE'))
#df.sort_index(ascending=False)  # ascending=False sorts the row labels in reverse order
# When index is not given, the default 0..n-1 labels are used.
df
输出：

  Python En	Math
A	102	70	107
B	116	6	107
C	142	87	45
D	132	145	55
E	130	72	51

2.2 数据查看

import numpy as np
import pandas as pd
# Build a labelled 2-D DataFrame of shape (150, 3)
df = pd.DataFrame(data = np.random.randint(0,151,size = (150,3)),
                   index = None,# default row labels
                   columns=['Python','Math','En'])# column labels

df['Python'] = df['Python'].astype(np.int16)# astype converts a column to another dtype
# Inspect attributes, a preview and summary statistics
df.head(10) # first 10 rows (5 when no argument is given)
df.tail() # last 5 rows (the default); pass a number for a different count
df.shape # (number of rows, number of columns)
df.dtypes # dtype of each column
df.index # row labels
df.columns # column labels
df.values # the data as a 2-D ndarray
df.describe() # summary of numeric columns: count, mean, std, min, quartiles, max
df.info() # column labels, dtypes, non-null counts and memory usage

2.3 数据输入，保存和输出

import numpy as np
import pandas as pd
# Salary table: five rows A..E, one column per profession.
df = pd.DataFrame(data = np.random.randint(0,50,size = [5,5]),
               columns=['IT','化工','生物','教师','士兵'],index = list('ABCDE'))
display(df)
# Save to the current directory as salary.csv (comma-separated values).
# The file name must match the path that read_csv loads afterwards.
df.to_csv('./salary.csv',
          sep = ',', # field separator, comma by default
          header = True,# write the column labels
          index = True) # write the row labels; on load they come back as an ordinary column unless index_col is given

#输出：

  IT 化工	生物	教师	士兵
A	9	31	41	38	18
B	31	11	6	24	18
C	12	0	32	1	9
D	28	44	18	10	13
E	14	3	17	9	26

并将文件保存在目录

# Read the file back
pd.read_csv('./salary.csv',
            sep = ',',# comma is the default separator
            header = [0],# row to use as the column labels
            index_col=0) # column to use as the row labels
输出：
	IT	化工	生物	教师	士兵
A	31	8	46	30	28
B	43	33	37	12	46
C	41	0	45	38	9
D	38	34	40	25	46
E	0	35	46	36	15

import numpy as np
import pandas as pd
# df1: salary table; df2: exam scores for computing subjects.
df1 = pd.DataFrame(data = np.random.randint(0,50,size = [50,5]),
               columns=['IT','化工','生物','教师','士兵'])
df2 = pd.DataFrame(data = np.random.randint(0,50,size = [150,3]),
                   columns=['Python','Tensorflow','Keras'])

# Save to the current directory as salary.xlsx
df1.to_excel('./salary.xlsx',
            sheet_name = 'salary',# name of the worksheet inside the workbook
            header = True,# write the column labels
            index = False) # do not write the row labels


# Read the Excel file back
pd.read_excel('./salary.xlsx',
              sheet_name='salary',# which worksheet to read; the first one by default
              header = 0,# use the first row as the column labels
              names = list('ABCDE'),# replace the column labels with A..E
              index_col = 3)# the column at position 3 (D after renaming) becomes the row index
输出：
	A	B	C	E
D				
29	24	26	21	37
8	2	43	24	47
18	21	34	2	12
14	28	15	48	29
31	0	25	15	28
48	20	9	8	3
16	37	0	16	16
41	39	41	8	19
37	1	19	36	42
23	23	42	16	5
6	17	25	9	20
38	42	35	25	31
46	13	28	9	39
12	22	42	38	29
42	41	47	35	2
46	25	45	17	6
39	31	20	47	8
9	36	48	29	46
48	19	9	29	33
34	18	39	0	12
34	15	7	16	34
19	11	34	30	43
1	37	8	42	47
9	7	28	16	28
13	37	37	5	16
48	36	39	38	20
17	27	41	28	10
33	5	6	25	30
11	33	2	20	23
46	36	32	32	18
26	6	36	11	34
24	37	44	17	7
19	13	21	35	15
22	22	36	22	34
19	25	32	44	2
11	48	15	16	10
1	17	17	32	3
15	12	8	46	5
38	40	17	45	47
21	0	44	41	46
28	11	12	19	42
37	26	32	27	1
33	18	24	42	33
5	17	7	26	41
44	18	23	19	7
20	41	10	2	29
49	10	15	20	49
9	30	2	49	22
41	43	4	39	41
18	30	38	46	13

向一个文件中存入多个表

# Store several worksheets in a single Excel file
with pd.ExcelWriter('./data.xlsx') as writer:
    df1.to_excel(writer,sheet_name='salary',index = False)
    df2.to_excel(writer,sheet_name='score',index = False)
pd.read_excel('./data.xlsx',
              sheet_name='score') # read the worksheet with the given name
输出：

  Python Tensorflow	 Keras
0	43	26	11
1	35	14	20
2	18	38	13
3	46	12	27
4	28	20	6
...	...	...	...
145	37	49	12
146	36	28	10
147	38	20	31
148	11	21	39
149	45	41	27

MySQL 的保存与读取

import pandas as pd
# SQLAlchemy是Python编程语言下的一款开源软件。提供了SQL工具包及对象关系映射（ORM）工具
from sqlalchemy import create_engine
# Exam scores for computing subjects.
df = pd.DataFrame(data = np.random.randint(0,50,size = [150,3]),
                   columns=['Python','Tensorflow','Keras'])
# Database connection (demo credentials only; never hard-code real passwords)
conn = create_engine('mysql+pymysql://root:root@localhost/AIOT?charset=UTF8MB4')
# Save to the database
df.to_sql('data',# table name in the database
          conn,# database connection
          index = False,# do not write the row labels
          if_exists = 'append')# append when the table already exists; the default 'fail' raises ValueError

# Load from the database
# pd.read_sql('select * from data limit 10', # SQL query against the table written above
#             conn, # database connection
#             index_col='Python') # column to use as the row index

2.4 数据选择

# Selection works much like NumPy fancy indexing.
df = pd.DataFrame(np.random.randint(0,150,size = (1000,3)),
                  columns = ['Python','En','Math'])
df
输出：
	Python	En	Math
0	9	53	115
1	39	47	130
2	19	79	15
3	17	41	69
4	134	145	149
...	...	...	...
995	137	4	53
996	50	2	19
997	54	18	146
998	81	116	36
999	24	53	40

# Selecting columns
df['Python'] # a single label returns a Series
df.Python # attribute access to the same column
df[['Python','Math']] # a list of labels returns a DataFrame
df[['En']] # a one-element list still returns a DataFrame

输出：
	En
0	53
1	47
2	79
3	41
4	145
...	...
995	4
996	2
997	18
998	116
999	53

行获取

# Small table with row labels A..E, used for the row-selection examples.
df2 = pd.DataFrame(np.random.randint(0,150,size = (5,3)),
                  index=list('ABCDE'),
                  columns=['Python','Math','En'])
df2
输出：

  Python Math En
A	5	61	66
B	122	85	0
C	105	142	86
D	129	41	139
E	54	43	79

df2.loc['A'] # 行索引，指定的A~E
df2.loc[['A','D']]
输出：

  Python Math En
A	5	61	66
D	129	41	139

df2.iloc[0] # 自然数索引 0 ~ n
df2.iloc[[0,3]]
输出：
	Python	Math	En
A	5	61	66
D	129	41	139

获取具体行数值

df2['Math']['B'] # Method 1: two separate lookups (column, then row), because plain [] on a DataFrame only accepts column labels

# df2['Math','B']  # does not work: [] treats ('Math','B') as one column label
输出：
85

df2.loc['B']['Math']#方法二

输出：
85

# loc 表示，先获取行，再获取列
df2.loc['B','Math']#方法三
输出：
85

# iloc 表示，先获取行，再获取列
df2.iloc[1,1]#方法四
输出：
85

df2.loc['A':'C','Math':]# Method 1: label slices select rows A through C and every column from Math onwards
输出：

   Math	En
A	61	66
B	85	0
C	142	86

df2.iloc[2:4,[0,-1]]#方法二
输出：
  Python En
C	105	86
D	129	139

3 数据筛选

3.1 数据操作

# 20 rows of random scores for three subjects.
df=pd.DataFrame(np.random.randint(0,50,size=(20,3)),columns=['python','math','en'])
display(df)
输出：
  python math en
0	35	21	40
1	38	21	5
2	22	0	49
3	7	11	40
4	42	48	46
5	9	17	38
6	1	29	8
7	28	49	2
8	46	44	14
9	4	12	48
10	11	27	28
11	32	27	48
12	13	41	42
13	17	23	48
14	21	35	28
15	28	48	2
16	13	10	19
17	16	46	43
18	46	39	13
19	42	23	3

#增加一列
df['物理']=np.random.randint(0,50,size=20)
df
输出：

  python math en 物理
0	35	21	40	14
1	38	21	5	18
2	22	0	49	1
3	7	11	40	8
4	42	48	46	43
5	9	17	38	36
6	1	29	8	47
7	28	49	2	22
8	46	44	14	1
9	4	12	48	44
10	11	27	28	21
11	32	27	48	40
12	13	41	42	26
13	17	23	48	16
14	21	35	28	2
15	28	48	2	6
16	13	10	19	22
17	16	46	43	39
18	46	39	13	8
19	42	23	30	16

3.2 将python都增加10分

df['python']+=10
df
输出：
  python math en 物理
0	45	21	40	14
1	48	21	5	18
2	32	0	49	1
3	17	11	40	8
4	52	48	46	43
5	19	17	38	36
6	11	29	8	47
7	38	49	2	22
8	56	44	14	1
9	14	12	48	44
10	21	27	28	21
11	42	27	48	40
12	23	41	42	26
13	27	23	48	16
14	31	35	28	2
15	38	48	2	6
16	23	10	19	22
17	26	46	43	39
18	56	39	13	8
19	52	23	30	16

3.3 将math索引是2和3的，分数变成100

# Assign through a single .loc call. Chained indexing such as
# df['math'][2] = 100 writes to an intermediate object and is not guaranteed
# to update df (it never does under pandas Copy-on-Write).
df.loc[2,'math']=100 # one row
df.loc[[2,3],'math']=100 # several rows at once
df
输出：
  python math en 物理
0	45	21	40	14
1	48	21	5	18
2	32	100	49	1
3	17	100	40	8
4	52	48	46	43
5	19	17	38	36
6	11	29	8	47
7	38	49	2	22
8	56	44	14	1
9	14	12	48	44
10	21	27	28	21
11	42	27	48	40
12	23	41	42	26
13	27	23	48	16
14	31	35	28	2
15	38	48	2	6
16	23	10	19	22
17	26	46	43	39
18	56	39	13	8
19	52	23	30	16

3.4 批量修改多个数据

df.loc[[2,3],['math','en']]=1024#注意两个中括号之间加‘，’
df
输出：
  python math en 物理
0	45	21	40	14
1	48	21	5	18
2	32	1024	1024	1
3	17	1024	1024	8
4	52	48	46	43
5	19	17	38	36
6	11	29	8	47
7	38	49	2	22
8	56	44	14	1
9	14	12	48	44
10	21	27	28	21
11	42	27	48	40
12	23	41	42	26
13	27	23	48	16
14	31	35	28	2
15	38	48	2	6
16	23	10	19	22
17	26	46	43	39
18	56	39	13	8
19	52	23	30	16

3.4 批量修改多个数据

df.loc[[2,3],['math','en']]=1024#注意两个中括号之间加‘，’
df
输出：
 python	math en	物理
0	45	21	40	14
1	48	21	5	18
2	32	1024	1024	1
3	17	1024	1024	8
4	52	48	46	43
5	19	17	38	36
6	11	29	8	47
7	38	49	2	22
8	56	44	14	1
9	14	12	48	44
10	21	27	28	21
11	42	27	48	40
12	23	41	42	26
13	27	23	48	16
14	31	35	28	2
15	38	48	2	6
16	23	10	19	22
17	26	46	43	39
18	56	39	13	8
19	52	23	30	16

按条件筛选（df[cond]）得到的是原数据的副本，对它做运算不会修改原数据

cond=df['物理']<10
#特别说明从原来数据中复制！！！
#两者没有关系
df[cond]-100
输出：
	python	math	en	物理
2	-68	924	924	-99
3	-83	924	924	-92
8	-44	-56	-86	-99
14	-69	-65	-72	-98
15	-62	-52	-98	-94
18	-44	-61	-87	-92

df[cond]#原数据
输出：

python	math	en	物理
2	32	1024	1024	1
3	17	1024	1024	8
8	56	44	14	1
14	31	35	28	2
15	38	48	2	6
18	56	39	13	8

按条件筛选时，使用 loc 赋值可以修改原数据

df.loc[cond]-=100
df[cond]
输出：

python	math	en	物理
2	-68	924	924	-99
3	-83	924	924	-92
8	-44	-56	-86	-99
14	-69	-65	-72	-98
15	-62	-52	-98	-94
18	-44	-61	-87	-92

4 数据集成

4.1 方式一concat

import pandas as pd
import numpy as np
# df1 and df2: exam scores for the same subjects, different students (A..J and K..T).
df1 = pd.DataFrame(data = np.random.randint(0,150,size = [10,3]),
                  index = list('ABCDEFGHIJ'),# row labels: students
                  columns=['Python','Tensorflow','Keras']) # subjects
df2 = pd.DataFrame(data = np.random.randint(0,150,size = [10,3]),
                  index = list('KLMNOPQRST'),# row labels: students
                  columns=['Python','Tensorflow','Keras']) # subjects
# df3: two further subjects for the same students as df1.
df3 = pd.DataFrame(data = np.random.randint(0,150,size = (10,2)),
                  index = list('ABCDEFGHIJ'),
                  columns=['PyTorch','Paddle'])
display(df1,df2,df3)
#输出：

Python	Tensorflow	Keras
A	99	128	68
B	137	71	15
C	108	62	58
D	15	87	4
E	87	77	65
F	17	7	117
G	25	94	126
H	24	16	104
I	116	57	122
J	58	132	91
Python	Tensorflow	Keras
K	125	111	130
L	80	67	60
M	127	121	4
N	120	101	92
O	74	97	120
P	61	78	93
Q	93	33	40
R	32	110	65
S	23	20	47
T	105	7	3
PyTorch	Paddle
A	127	112
B	93	41
C	20	56
D	48	123
E	104	149
F	94	145
G	124	98
H	36	128
I	42	149
J	73	107

#合并时，列增加
pd.concat([df1,df3],axis=1)
输出：

Python	Tensorflow	Keras	PyTorch	Paddle
A	99	128	68	127	112
B	137	71	15	93	41
C	108	62	58	20	56
D	15	87	4	48	123
E	87	77	65	104	149
F	17	7	117	94	145
G	25	94	126	124	98
H	24	16	104	36	128
I	116	57	122	42	149
J	58	132	91	73	107

4.3 方式二插入

df = pd.DataFrame(data = np.random.randint(0,151,size = (10,3)),
                  index = list('ABCDEFGHIJ'),
                  columns = ['Python','Keras','Tensorflow'])
df.insert(loc = 1,column='Pytorch',value=1024) # insert a constant column at position 1, in place
df
#输出：
Python	Pytorch	Keras	Tensorflow
A	63	1024	127	92
B	78	1024	41	45
C	123	1024	113	61
D	52	1024	135	129
E	37	1024	96	145
F	148	1024	52	23
G	50	1024	9	120
H	80	1024	115	117
I	64	1024	114	148
J	129	1024	54	93

df.insert(loc=2,column='music',value=150)
df
#输出：
Python	Pytorch	music	Keras	Tensorflow
A	63	1024	150	127	92
B	78	1024	150	41	45
C	123	1024	150	113	61
D	52	1024	150	135	129
E	37	1024	150	96	145
F	148	1024	150	52	23
G	50	1024	150	9	120
H	80	1024	150	115	117
I	64	1024	150	114	148
J	129	1024	150	54	93

4.4 在python后面插入一列

# Insert a new column directly after 'Python':
# 1. take the column labels,
# 2. convert them to a plain list,
# 3. use list.index to find the position of 'Python',
# 4. add 1 to get the slot right after it and keep it in a variable.

index = list(df.columns).index('Python') + 1
df.insert(
    loc=index,
    column='spr',
    value=np.random.randint(0, 151, size=10),
)
df
#输出：
Python	spr	Pytorch	music	Keras	Tensorflow
A	63	112	1024	150	127	92
B	78	59	1024	150	41	45
C	123	96	1024	150	113	61
D	52	76	1024	150	135	129
E	37	137	1024	150	96	145
F	148	119	1024	150	52	23
G	50	127	1024	150	9	120
H	80	145	1024	150	115	117
I	64	9	1024	150	114	148
J	129	111	1024	150	54	93

4.5 Join SQL风格合并

# Table 1 records name and weight.
df1 = pd.DataFrame(data = {'name':['softpo','Daniel','Brandon','Ella'],'weight':[70,55,75,65]})
# Table 2 records name and height.
df2 = pd.DataFrame(data = {'name':['softpo','Daniel','Brandon','Cindy'],'height':[172,170,170,166]})
# Table 3 holds the same data as table 2, but its key column is called '名字'.
df3 = pd.DataFrame(data = {'名字':['softpo','Daniel','Brandon','Cindy'],'height':[172,170,170,166]})
display(df1,df2,df3)
#输出：
name	weight
0	softpo	70
1	Daniel	55
2	Brandon	75
3	Ella	65
name	height
0	softpo	172
1	Daniel	170
2	Brandon	170
3	Cindy	166
名字	height
0	softpo	172
1	Daniel	170
2	Brandon	170
3	Cindy	166

df1体重和df2身高进行合并

pd.merge(df1,df2)# merge joins the two tables on the column they have in common
# here the shared column is name; names present in only one table (Ella, Cindy) are absent from the result
#输出：
name	weight	height
0	softpo	70	172
1	Daniel	55	170
2	Brandon	75	170

# When the key columns have different names, name them with the left_on / right_on parameters
pd.merge(df1,df3,left_on='name',right_on='名字')

#输出：
   name	weight	名字	height
0	softpo	70	softpo	172
1	Daniel	55	Daniel	170
2	Brandon	75	Brandon	170

10名同学，计算每个人的平均分，合并

# Scores of ten students (A..J) in three subjects.
df4=pd.DataFrame(data=np.random.randint(0,151,size=(10,3)),
                 index=list('ABCDEFGHIJ'),
                 columns=['Python','Keras','Tensorflow'])

df4
#输出：
	Python	Keras	Tensorflow
A	38	48	68
B	96	2	118
C	69	103	55
D	135	86	13
E	25	109	65
F	32	29	63
G	110	13	77
H	18	111	19
I	91	47	53
J	38	3	103

# Mean score of each student: average across the columns (axis=1), rounded to one decimal place
s=df4.mean(axis=1).round(1)
# Wrap the Series in a one-column DataFrame so that it can be combined with df4
df5=pd.DataFrame(s,columns=['平均值'])
df5
#输出：

   平均值
A	51.3
B	72.0
C	75.7
D	78.0
E	66.3
F	41.3
G	66.7
H	49.3
I	63.7
J	48.0

# Method 1: merge, using the row index of both tables as the join key
#pd.merge(df4,df5,left_index=True,right_index=True)

# Method 2: insert the averages as a new column
# df4.insert(loc=3,column='平均分',value=df5)
# df4

# Method 3: concatenate the two tables column-wise
pd.concat([df4,df5],axis=1)

#输出：
  Python	Keras	Tensorflow	平均值
A	38	48	68	51.3
B	96	2	118	72.0
C	69	103	55	75.7
D	135	86	13	78.0
E	25	109	65	66.3
F	32	29	63	41.3
G	110	13	77	66.7
H	18	111	19	49.3
I	91	47	53	63.7
J	38	3	103	48.0

5 数据清洗

# Sample data containing duplicates, a missing colour (None) and a missing price (NaN).
# np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0.
df = pd.DataFrame(data = {'color':['red','blue','red','green','blue',None,'red'],
                          'price':[10,20,10,15,20,0,np.nan]})
df
输出：

   color	price
0	red	    10.0
1	blue	20.0
2	red	    10.0
3	green	15.0
4	blue	20.0
5	None	0.0
6	red	NaN

# 1、重复数据过滤
df.duplicated() # 判断是否存在重复数据
df.drop_duplicates() # 删除重复数据
#输出：
	color	price
0	red	    10.0
1	blue	20.0
3	green	15.0
5	None	0.0
6	red	    NaN

# 2、空数据过滤
df.isnull() # 判断是否存在空数据，存在返回True，否则返回False
#输出：
	color	price
0	False	False
1	False	False
2	False	False
3	False	False
4	False	False
5	True	False
6	False	True

df.dropna(how = 'any') # 删除空数据
#输出：
  color	   price
0	red	    10.0
1	blue	20.0
2	red	    10.0
3	green	15.0
4	blue	20.0

df.fillna(value=1111) # 填充空数据
#输出：

   color   price
0	red	   10.0
1	blue	20.0
2	red	   10.0
3	green	15.0
4	blue	20.0
5	1111	0.0
6	red	   1111.0

指定行或列进行删除

# 3、指定行或者列过滤
del df['color'] # 直接删除某列
df
#输出：

   price
0	10.0
1	20.0
2	10.0
3	15.0
4	20.0
5	0.0
6	NaN

df.drop(labels = ['price'],axis = 1)# 删除指定列
#输出：

0
1
2
3
4
5
6

#inplace=True替换：替换原来的数据，修改原数据
df.drop(labels = [0,1,5],axis = 0) # 删除指定行
#输出：

   price
2	10.0
3	15.0
4	20.0
6	NaN

5.1 异常值的过滤

df2 = pd.DataFrame(data = np.random.randn(10000,3)) # samples from the standard normal distribution
# 3-sigma rule for outliers, where sigma is the standard deviation
# cond = (df2.abs() > 3*df2.std()).any(axis = 1)
# index = df2[cond].index # row labels of the rows that contain an outlier
# df2.drop(labels=index,axis = 0) # drop those rows by label

df2.mean()
#输出：
0    0.000174
1   -0.010116
2   -0.003753
dtype: float64

df2.std()#标准差
#输出：
0    0.993306
1    0.988248
2    0.998491
dtype: float64

绝对值大于 3σ 的数据视为异常值，σ 表示标准差

#比较运算
#异常值为少数
cond=df2.abs()> 3 * df2.std()#abs()函数可以数据都变成正值
cond.sum()
#输出：
0    26
1    26
2    26
dtype: int64

df2[cond[0]]#获取第一列的异常
#输出：
       0	         1	       2
190	3.054631	-0.853145	-0.755782
403	-3.045133	0.244970	-0.971150
561	3.190410	-0.167448	-1.099278
760	3.024192	-0.215106	1.744897
993	3.111947	0.810609	0.453292
1473	-3.173945	0.287238	-1.145017
1619	3.683752	-1.215306	1.263824
1624	-3.271255	0.089374	-0.218108
1939	3.384108	-0.306942	0.147536
2261	-3.457696	1.776478	-1.511388
2530	3.348100	-1.556387	-0.513249
3550	-3.816976	-0.055152	0.512013
4090	3.241326	-0.596062	-0.465360
4786	-3.363480	1.094183	-1.343750
4882	3.230330	0.357062	-0.244336
5101	-3.449064	0.643582	0.644510
5403	-3.025487	-0.863910	-1.087042
5966	3.097428	-0.148177	0.562554
6179	3.038740	1.002613	-0.234714
6290	3.134905	-0.728645	0.693016
7447	-3.283206	-0.016250	0.935852
7781	3.079952	1.250359	-1.150737
8521	3.237060	1.311595	0.063898
8627	3.129588	-0.194632	-1.122567
8636	-3.022962	-0.338643	0.405656
8813	3.938810	-0.459422	-1.090727

#获取所有的异常值
#方法1
cond_0=cond[0]
cond_1=cond[1]
cond_2=cond[2]
cond_s=cond_0|cond_1|cond_2
df2[cond_s]
#输出：
       0	           1	   2
190	3.054631	-0.853145	-0.755782
403	-3.045133	0.244970	-0.971150
432	0.447692	0.531759	3.199177
451	-1.656428	0.023364	-3.265311
561	3.190410	-0.167448	-1.099278
...	...	...	...
9390	0.143425	-0.387227	-3.099210
9466	-0.085263	-3.458459	0.522463
9801	-0.231697	0.398771	3.093271
9854	0.705503	3.110385	-1.190745
9933	1.189320	-3.109043	-0.286718

# Method 2
cond=df2.abs()> 3 * df2.std()

cond_=cond.any(axis=1)# any(axis=1): a row is True when at least one of its columns is True
df2[cond_]
#输出：
        0	          1      	2
190	3.054631	-0.853145	-0.755782
403	-3.045133	0.244970	-0.971150
432	0.447692	0.531759	3.199177
451	-1.656428	0.023364	-3.265311
561	3.190410	-0.167448	-1.099278
...	...	...	...
9390	0.143425	-0.387227	-3.099210
9466	-0.085263	-3.458459	0.522463
9801	-0.231697	0.398771	3.093271
9854	0.705503	3.110385	-1.190745
9933	1.189320	-3.109043	-0.286718

6 数据转换

6.1 轴和元素的替换

import numpy as np
import pandas as pd
df = pd.DataFrame(data = np.random.randint(0,10,size = (10,3)),
                  index = list('ABCDEFHIJK'),
                  columns=['Python','Tensorflow','Keras'])
# An integer column cannot hold a missing value. Convert it to float first
# rather than relying on the deprecated implicit upcast on assignment.
df['Keras'] = df['Keras'].astype(np.float64)
df.iloc[4,2] = np.nan # missing value
df
#输出：

  Python Tensorflow	Keras
A	5	4	0.0
B	8	2	4.0
C	9	4	1.0
D	3	5	9.0
E	2	2	NaN
F	5	1	7.0
H	7	1	5.0
I	7	0	0.0
J	9	0	3.0
K	8	6	3.0

#1、重命名轴索引
df.rename(index = {'A':'AA','B':'BB'},columns = {'Python':'人工智能'}) 
#输出：
人工智能	Tensorflow	Keras
AA	5	4	0.0
BB	8	2	4.0
C	9	4	1.0
D	3	5	9.0
E	2	2	NaN
F	5	1	7.0
H	7	1	5.0
I	7	0	0.0
J	9	0	3.0
K	8	6	3.0

# 2. Replacing values (each call returns a new DataFrame; df itself is not modified)
df.replace(3,1024) # every 3 becomes 1024
df.replace([0,7],2048) # every 0 and every 7 becomes 2048
df.replace({0:512,np.nan:998}) # dict of old -> new: 0 becomes 512, missing values become 998
df.replace({'Python':2},-1024) # only in column Python: 2 becomes -1024

#输出：
	Python	Tensorflow	Keras
A	5	4	0.0
B	8	2	4.0
C	9	4	1.0
D	3	5	9.0
E	-1024	2	NaN
F	5	1	7.0
H	7	1	5.0
I	7	0	0.0
J	9	0	3.0
K	8	6	3.0

6.2 map Series元素改变

# Random single-digit scores for ten students in three subjects.
df = pd.DataFrame(data = np.random.randint(0,10,size = (10,3)),
                  index = list('ABCDEFHIJK'),
                  columns=['Python','Tensorflow','Keras'])
df
#输出：
 Python	Tensorflow	Keras
A	8	2	8
B	5	9	2
C	9	2	8
D	1	5	2
E	9	4	8
F	3	1	3
H	3	1	1
I	7	5	9
J	5	8	3
K	9	8	2

# 1、map批量元素改变，Series专有
df['Keras'].map({9:'Hello',2:'World',7:'AI'}) # 字典映射
#输出：
A      NaN
B    World
C      NaN
D    World
E      NaN
F      NaN
H      NaN
I    Hello
J      NaN
K    World
Name: Keras, dtype: object

#方法2
def conver(x):
    """Map the scores 9, 2 and 7 to words; return any other value unchanged."""
    if x == 9:
        return 'hellow'
    if x == 2:
        return 'world'
    if x == 7:
        return 'ai'
    return x
    
df['Keras'].map(conver)
#输出：
A         8
B     world
C         8
D     world
E         8
F         3
H         1
I    hellow
J         3
K     world
Name: Keras, dtype: object

匿名函数（lambda）

df['Python'].map(lambda x:100 if x >=5 else -10000) # 隐式函数映射
#输出：
A      100
B      100
C      100
D   -10000
E      100
F   -10000
H   -10000
I      100
J      100
K      100
Name: Python, dtype: int64

6.3 apply元素改变。既支持 Series，也支持 DataFrame

df = pd.DataFrame(data = np.random.randint(0,10,size = (10,3)),
                  index = list('ABCDEFHIJK'),
                  columns=['Python','Tensorflow','Keras'])
# An integer column cannot hold a missing value. Convert it to float first
# rather than relying on the deprecated implicit upcast on assignment.
df['Keras'] = df['Keras'].astype(np.float64)
df.iloc[4,2] = np.nan # missing value
df
#输出：
Python	Tensorflow	Keras
A	5	0	8.0
B	4	9	3.0
C	6	4	2.0
D	0	8	0.0
E	9	5	NaN
F	6	0	6.0
H	6	7	8.0
I	6	5	8.0
J	6	2	5.0
K	3	3	9.0

def conver(x):
    """Return the grade label for score ``x``: below 6, below 8, or anything else."""
    if x < 6:
        return '不及格'
    return '中等' if x < 8 else '优秀'

result=df['Python'].apply(conver)
result
#输出：
A    不及格
B    不及格
C     中等
D    不及格
E     优秀
F     中等
H     中等
I     中等
J     中等
K    不及格
Name: Python, dtype: object

#将上面列插入
index=list(df.columns).index('Python')+1
df.insert(loc=index,column='等级',value=result)
df
#输出：
	Python	等级	Tensorflow	Keras
A	5	不及格	0	8.0
B	4	不及格	9	3.0
C	6	中等	4	2.0
D	0	不及格	8	0.0
E	9	优秀	5	NaN
F	6	中等	0	6.0
H	6	中等	7	8.0
I	6	中等	5	8.0
J	6	中等	2	5.0
K	3	不及格	3	9.0

6.4 多行转换

def conver(x):
    """Return the grade label for score ``x``.

    Scores below 6 map to '不及格', scores from 6 up to (but not including) 8
    map to '中等', and every other score maps to '优秀'.  A missing score
    (``None`` or NaN) is returned unchanged, so that a column containing
    missing values does not have them graded as '优秀'.
    """
    # NaN is the only value that compares unequal to itself.
    if x is None or x != x:
        return x
    if x < 6:
        return '不及格'
    if x < 8:
        return '中等'
    return '优秀'

# Grade every subject and insert each grade column right after its score column.
for col in ['Python','Tensorflow','Keras']:
    result=df[col].apply(conver) # grade label for every score in this column
    
    index=list(df.columns).index(col)+1 # position directly after the score column
    df.insert(loc=index,column=col+'等级',value=result) # new column is named '<subject>等级'
df
#输出：
  Python	Python等级	Tensorflow	Tensorflow等级	Keras	Keras等级	等级
A	8	    优秀	           2	       不及格	       8	优秀	不及格
B	5	   不及格	       9	        优秀         	2	不及格	不及格
C	9	   优秀	           2	        不及格	        8	优秀	中等
D	1	  不及格	           5	        不及格	        2	不及格	不及格
E	9	   优秀	           4	        不及格	        8	优秀	优秀
F	3	    不及格	       1	        不及格	        3 	不及格	中等
H	3	    不及格	       1	        不及格	       1	不及格	中等
I	7	     中等	       5	        不及格	       9	优秀	中等
J	5	    不及格          8	         优秀	       3	不及格	中等
K	9	     优秀	       8	         优秀	       2	不及格	不及格

数据处理分析模块 Pandas（3）

Python相关栏目本月热门文章