pandas的练习网页不能扒下来啊,所以练习的时候需要自己根据结果创建一个csv、json、excel、数据库等文件喽。
创建数据帧、连接、合并
import pandas as pd
# Demo: build DataFrames, concatenate them, and merge them on a key column.
# NOTE: the pandas class is spelled ``DataFrame``; ``pd.Dataframe`` raises
# AttributeError.

# Start from an empty frame and add one column at a time.
dataframe = pd.DataFrame()
dataframe['name'] = ['zhangqi','liufen','zhouchaiyao']
dataframe['age'] = [25,22,23]
dataframe['driver'] = [True,True,False]
print(dataframe)
print('--------------------------------------')

# Build two frames with identical columns from dicts of lists.
data1 = {
    'id':['1','2','3'],
    'name':['lex','parse','semanter'],
    'age':[30,29,28]
}
dataframe_1 = pd.DataFrame(data1,columns=['id','name','age'])

data2 = {
    'id':['9','8','7'],
    'name':['ying','die','you'],
    'age':[22,22,22]
}
dataframe_2 = pd.DataFrame(data2,columns=['id','name','age'])

# A single row expressed as a Series; none of the calls below use it.
r = pd.Series([13,'lover','15'],index=['id','name','age'])

# axis=0 stacks the rows of both frames; each frame keeps its own index labels.
rows_joined = pd.concat([dataframe_1,dataframe_2],axis=0)
print(rows_joined)
# axis=1 places the frames side by side, aligned on the index.
columns_joined = pd.concat([dataframe_1,dataframe_2],axis=1)
print(columns_joined)
print('--------------------------------------')

# Default (inner) merge on 'id': the two frames share no id, so it is empty.
merged_inner = pd.merge(dataframe_1,dataframe_2,on='id')
print(merged_inner)
print('--------------------------------------')

# Outer merge keeps the ids of both frames.
merged_outer = pd.merge(dataframe_1,dataframe_2,on='id',how='outer')
print(merged_outer)
print('--------------------------------------')

# Left merge keeps only the ids of dataframe_1.
merged_left = pd.merge(dataframe_1,dataframe_2,on='id',how='left')
print(merged_left)
print('--------------------------------------')

# Right merge keeps only the ids of dataframe_2.
merged_right = pd.merge(dataframe_1,dataframe_2,on='id',how='right')
print(merged_right)
print('--------------------------------------')
          name  age  driver
0      zhangqi   25    True
1       liufen   22    True
2  zhouchaiyao   23   False
--------------------------------------
  id      name  age
0  1       lex   30
1  2     parse   29
2  3  semanter   28
0  9      ying   22
1  8       die   22
2  7       you   22
  id      name  age id  name  age
0  1       lex   30  9  ying   22
1  2     parse   29  8   die   22
2  3  semanter   28  7   you   22
--------------------------------------
Empty DataFrame
Columns: [id, name_x, age_x, name_y, age_y]
Index: []
--------------------------------------
  id    name_x  age_x name_y  age_y
0  1       lex   30.0    NaN    NaN
1  2     parse   29.0    NaN    NaN
2  3  semanter   28.0    NaN    NaN
3  9       NaN    NaN   ying   22.0
4  8       NaN    NaN    die   22.0
5  7       NaN    NaN    you   22.0
--------------------------------------
  id    name_x  age_x name_y  age_y
0  1       lex     30    NaN    NaN
1  2     parse     29    NaN    NaN
2  3  semanter     28    NaN    NaN
--------------------------------------
  id name_x  age_x name_y  age_y
0  9    NaN    NaN   ying     22
1  8    NaN    NaN    die     22
2  7    NaN    NaN    you     22
--------------------------------------
游览数据帧
import pandas as pd
# Demo: select rows by position (iloc) and by label (loc).
# NOTE: the pandas class is spelled ``DataFrame``; ``pd.Dataframe`` raises
# AttributeError.
data1 = {
    'id':['1','2','3'],
    'name':['lex','parse','semanter'],
    'age':[30,29,28]
}
dataframe_1 = pd.DataFrame(data1,columns=['id','name','age'])

# Third row (positions are zero-based).
third_row = dataframe_1.iloc[2]
print(third_row)
print('---------------------------------------')

# Every row from position 1 onward, i.e. the last two rows.
last_two_rows = dataframe_1.iloc[1:]
print(last_two_rows)
print('---------------------------------------')

# Use the non-numeric 'name' values as the index; passing the Series itself
# keeps 'name' as a regular column as well.
mmmm = dataframe_1.set_index(dataframe_1['name'])
print(mmmm)
print('---------------------------------------')

# Label-based lookup on the string index goes through loc.
lex_row = mmmm.loc['lex']
print(lex_row)
id 3
name semanter
age 28
Name: 2, dtype: object
---------------------------------------
id name age
1 2 parse 29
2 3 semanter 28
---------------------------------------
id name age
name
lex 1 lex 30
parse 2 parse 29
semanter 3 semanter 28
---------------------------------------
id 1
name lex
age 30
Name: lex, dtype: object
根据条件来选择行数据
import pandas as pd
# Demo: select rows with boolean conditions.
# NOTE: the pandas class is spelled ``DataFrame``; ``pd.Dataframe`` raises
# AttributeError.
data1 = {
    'id':['1','2','3','4','5','6'],
    'name':['lex','parse','semanter','w','lex','e'],
    'age':[30,29,28,27,42,12]
}
dataframe_1 = pd.DataFrame(data1,columns=['id','name','age'])

# Rows whose name equals 'lex'.
is_lex = dataframe_1['name'] == 'lex'
lex_rows = dataframe_1[is_lex].head(3)
print(lex_rows)

# name == 'lex' AND age > 4. Each condition needs its own parentheses because
# & binds tighter than the comparisons. The numeric test uses 'age': the 'id'
# column holds strings, so comparing it with a number would fail.
older_than_four = dataframe_1['age'] > 4
lex_older_rows = dataframe_1[is_lex & older_than_four].head(3)
print(lex_older_rows)
  id name  age
0  1  lex   30
4  5  lex   42
  id name  age
0  1  lex   30
4  5  lex   42
替换数据、重命名列
import pandas as pd
# Demo: replace values in a column and rename a column.
# NOTE: the pandas class is spelled ``DataFrame``; ``pd.Dataframe`` raises
# AttributeError.
data1 = {
    'id':['1','2','3','4','5','6'],
    'name':['lex','parse','semanter','w','lex','e'],
    'age':[30,29,28,27,42,12]
}
dataframe_1 = pd.DataFrame(data1,columns=['id','name','age'])

# replace() returns a new Series; dataframe_1 itself is not modified.
replaced_names = dataframe_1['name'].replace('lex','sex')
print(replaced_names.head(13))

# rename() likewise returns a new frame with the 'id' column relabelled.
renamed = dataframe_1.rename(columns={'id':'primary id'})
print(renamed.head(15))
0         sex
1       parse
2    semanter
3           w
4         sex
5           e
Name: name, dtype: object
  primary id      name  age
0          1       lex   30
1          2     parse   29
2          3  semanter   28
3          4         w   27
4          5       lex   42
5          6         e   12
最值、平均值、总和、计数
import pandas as pd
# Demo: max, min, mean, sum and count of a column.
# NOTE: the pandas class is spelled ``DataFrame``; ``pd.Dataframe`` raises
# AttributeError.
data1 = {
    'id':['1','2','3','4','5','6'],
    'name':['lex','parse','semanter','w','lex','e'],
    'age':[30,29,28,27,42,12]
}
dataframe_1 = pd.DataFrame(data1,columns=['id','name','age'])

# 'id' holds strings. Aggregating the strings directly is wrong: older pandas
# concatenated them (sum -> '123456', mean -> 20576.0), newer pandas raises
# TypeError for mean(), and max()/min() compare lexicographically. Convert the
# column to numbers first so the statistics are meaningful.
id_numbers = pd.to_numeric(dataframe_1['id'])
print(id_numbers.max())
print(id_numbers.min())
print(id_numbers.mean())
print(id_numbers.sum())
print(id_numbers.count())

# 'age' is already numeric and can be aggregated as-is.
ages = dataframe_1['age']
print(ages.max())
print(ages.min())
print(ages.mean())
print(ages.sum())
print(ages.count())
6
1
20576.0
123456
6
42
12
28.0
168
6
查找唯一值
import pandas as pd
# Demo: list the distinct values of a column.
# NOTE: the pandas class is spelled ``DataFrame``; ``pd.Dataframe`` raises
# AttributeError.
data1 = {
    'id':['1','2','3','4','5','6'],
    'name':['lex','parse','semanter','w','lex','e'],
    'age':[30,29,28,27,42,12]
}
dataframe_1 = pd.DataFrame(data1,columns=['id','name','age'])

# unique() returns the distinct values in order of first appearance.
unique_names = dataframe_1['name'].unique()
print(unique_names)
['lex' 'parse' 'semanter' 'w' 'e']
处理缺失
import pandas as pd
import numpy as np

# Demo: find and fill missing values in a sheet loaded from 'test.xls'.
dataframe = pd.read_excel('test.xls')
print(dataframe.head(10))
print('----------------------')

# Show the rows whose '年龄' cell is missing.
age_is_missing = dataframe['年龄'].isnull()
print(dataframe[age_is_missing].head(3))
print('----------------------')

# Swap every NaN in '年龄' for 7 and store the column back into the frame.
filled_age = dataframe['年龄'].replace(np.nan,7)
dataframe['年龄'] = filled_age
print(dataframe.head(10))
姓名 年龄 电话 住址 忌日
0 100 0.0 20.0 3 NONE
1 99 0.0 21.0 3 YES
2 98 -10.0 23.0 3 NONE
3 96 -100.0 NaN 3 NIL
4 95 NaN 12.0 3 OL
5 94 -10000.0 4.0 33 OK
----------------------
姓名 年龄 电话 住址 忌日
4 95 NaN 12.0 3 OL
----------------------
姓名 年龄 电话 住址 忌日
0 100 0.0 20.0 3 NONE
1 99 0.0 21.0 3 YES
2 98 -10.0 23.0 3 NONE
3 96 -100.0 NaN 3 NIL
4 95 7.0 12.0 3 OL
5 94 -10000.0 4.0 33 OK
test.xls文件
import pandas as pd
# Demo: drop rows by condition and drop a column.
# NOTE: the pandas class is spelled ``DataFrame``; ``pd.Dataframe`` raises
# AttributeError.
data1 = {
    'id':['1','2','3','4','5','6'],
    'name':['lex','parse','semanter','w','lex','e'],
    'age':[30,29,28,27,42,12]
}
dataframe_1 = pd.DataFrame(data1,columns=['id','name','age'])

# "Delete" rows by keeping only those whose name is not 'lex'.
without_lex = dataframe_1[ dataframe_1['name'] != 'lex' ]
print(without_lex.head(10))

# drop(..., axis=1) returns a copy without the 'age' column.
without_age = dataframe_1.drop('age',axis=1)
print(without_age.head(10))
  id      name  age
1  2     parse   29
2  3  semanter   28
3  4         w   27
5  6         e   12
  id      name
0  1       lex
1  2     parse
2  3  semanter
3  4         w
4  5       lex
5  6         e
删除重复行
import pandas as pd
# Demo: remove duplicated rows.
# NOTE: the pandas class is spelled ``DataFrame``; ``pd.Dataframe`` raises
# AttributeError.
data1 = {
    'id':['1','2','3','4','1','6'],
    'name':['lex','parse','semanter','w','lex','e'],
    'age':[30,29,28,27,30,12]
}
dataframe_1 = pd.DataFrame(data1,columns=['id','name','age'])

# Row 4 repeats row 0 in every column, so only the first occurrence is kept.
deduplicated = dataframe_1.drop_duplicates()
print(deduplicated.head(10))
  id      name  age
0  1       lex   30
1  2     parse   29
2  3  semanter   28
3  4         w   27
5  6         e   12
按值对行分组、对所有分组应用一个函数
import pandas as pd
# Demo: group rows by a column value and apply a function to every group.
# NOTE: the pandas class is spelled ``DataFrame``; ``pd.Dataframe`` raises
# AttributeError.
data1 = {
    'id':['1','2','3','4','5','6'],
    'name':['lex','parse','semanter','w','lex','e'],
    'age':[30,29,28,27,42,12]
}
dataframe_1 = pd.DataFrame(data1,columns=['id','name','age'])
print('----------------------------------------')

# Mean per name. 'id' holds strings, so restrict the mean to numeric columns;
# without numeric_only=True recent pandas raises TypeError on the 'id' column.
mean_by_name = dataframe_1.groupby('name').mean(numeric_only=True)
print(mean_by_name)
print('----------------------------------------')


def func(x):
    """Return the per-column count of non-null values in group *x*."""
    return x.count()


# Apply func to each group. The columns are selected explicitly (including the
# grouping column) so apply() does not rely on the deprecated implicit handling
# of grouping columns.
counts_by_name = dataframe_1.groupby('name')[['id','name','age']].apply(func)
print(counts_by_name)
print('----------------------------------------')
----------------------------------------
age
name
e 12.0
lex 36.0
parse 29.0
semanter 28.0
w 27.0
----------------------------------------
id name age
name
e 1 1 1
lex 2 2 2
parse 1 1 1
semanter 1 1 1
w 1 1 1
----------------------------------------
遍历列,对一列应用某个函数
import pandas as pd
# Demo: iterate over part of a column and apply a function to a whole column.
# NOTE: the pandas class is spelled ``DataFrame``; ``pd.Dataframe`` raises
# AttributeError.
data1 = {
    'id':['1','2','3','4','5','6'],
    'name':['lex','parse','semanter','w','lex','e'],
    'age':[30,29,28,27,42,12]
}
dataframe_1 = pd.DataFrame(data1,columns=['id','name','age'])

# Walk the 'name' values at positions 3 and 4 and print them upper-cased.
# The print call must be indented to form the loop body.
for name in dataframe_1['name'][3:5]:
    print(name.upper())


def func(x):
    """Return *x* with the '@demllie' suffix appended."""
    return x+'@demllie'


# apply() calls func once per element of the column.
suffixed_names = dataframe_1['name'].apply(func)
print(suffixed_names[0:10])
W
LEX
0         lex@demllie
1       parse@demllie
2    semanter@demllie
3           w@demllie
4         lex@demllie
5           e@demllie
Name: name, dtype: object



