Numpy基础
numpy的array创建
numpy属性
numpy的基础运算
numpy的索引
numpy的array合并
numpy的array分割
numpy的拷贝与深拷贝
Pandas基础
Dataframe基本介绍
pandas选择数据
pandas设置值
pandas处理丢失数据
pandas导入导出
pandas合并concat
pandas合并merge
# Section: NumPy array creation
# Demonstrates building arrays from Python lists and choosing the element type.
import numpy as np

a = np.array([1, 2, 3])
# [1, 2, 3]

# A nested list produces a 2-D array.
a = np.array([[1, 2], [3, 4]])

# dtype
# Passing dtype forces the element type; here the integers become floats.
a = np.array([1, 2, 3], dtype=float)
# Section: NumPy array attributes
# Demonstrates ndim / shape / size, reshape, arange and linspace.

# ndim shape size
a = np.array([[1, 2, 3], [4, 5, 6]])
print(a.ndim)  # 2
print(a.shape)  # (2, 3)
print(a.size)  # 6

# reshape
b = a.reshape(3, 2)
# [[1 2]
#  [3 4]
#  [5 6]]
a = np.arange(12).reshape((3, 4))
# [[ 0  1  2  3]
#  [ 4  5  6  7]
#  [ 8  9 10 11]]

# linspace
# Five evenly spaced values from 1 to 10, both endpoints included.
a = np.linspace(1, 10, 5)
# [ 1.    3.25  5.5   7.75 10.  ]
# Section: NumPy basic operations
# Demonstrates element-wise arithmetic, dot products, random arrays,
# reductions, statistics, sorting, transposing and clipping.
a = np.array([10, 20, 30])
b = np.arange(3)
c = a - b  # [10 19 28]
c = b ** 2  # [0 1 4]
c = 10 * np.sin(a)  # [-5.44021111  9.12945251 -9.88031624]
print(b < 2)  # [ True  True False]
c = a * b  # element-wise product: [ 0 20 60]

# dot
c = np.dot(a, b)  # dot product; for two 1-D arrays this is the inner product: 80
c = a.dot(b)  # same result through the method form

# random
c = np.random.random((1, 2))  # e.g. [[0.77461324 0.73686157]] (values are random)

# sum min max axis
# The axis examples need a 2-D array: the 1-D array used above has no axis 1,
# so np.min(a, axis=1) on it raises AxisError.
a = np.array([[1, 5, 3], [4, 2, 6]])
print(np.sum(a))  # sum of all elements: 21
print(np.min(a, axis=1))  # axis=1 -> minimum of each row: [1 2]
print(np.max(a, axis=0))  # axis=0 -> maximum of each column: [4 5 6]

# argmin argmax
a = np.arange(2, 14).reshape(3, 4)
# [[ 2  3  4  5]
#  [ 6  7  8  9]
#  [10 11 12 13]]
print(np.argmin(a))  # flat index of the smallest value: 0
print(np.argmax(a))  # flat index of the largest value: 11

# mean median cumsum
print(np.mean(a))  # arithmetic mean (same value as np.average(a)): 7.5
print(a.mean())  # method form of the same computation
print(np.median(a))  # median: 7.5
print(np.cumsum(a))  # running total: [ 2  5  9 14 20 27 35 44 54 65 77 90]

# diff
# Difference between neighbouring elements along the last axis.
print(np.diff(a))
# [[1 1 1]
#  [1 1 1]
#  [1 1 1]]

# nonzero
# Returns one index array per axis for the non-zero elements.
print(np.nonzero(a))
# (array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]), array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]))

# sort
a = np.arange(14, 2, -1).reshape(3, 4)
print(a)
# [[14 13 12 11]
#  [10  9  8  7]
#  [ 6  5  4  3]]
print(np.sort(a))  # sorts each row independently
# [[11 12 13 14]
#  [ 7  8  9 10]
#  [ 3  4  5  6]]

# transpose
print(np.transpose(a))  # transposed matrix
print(a.T)  # attribute form of the same operation

# clip
# Values below 5 become 5 and values above 9 become 9.
print(np.clip(a, 5, 9))
# [[9 9 9 9]
#  [9 9 8 7]
#  [6 5 5 5]]
# Section: NumPy indexing
# Demonstrates element / row / slice access and iteration over an array.
a = np.arange(3, 15).reshape(3, 4)
# [[ 3  4  5  6]
#  [ 7  8  9 10]
#  [11 12 13 14]]
print(a[2])  # [11 12 13 14]
print(a[2][1])  # 12
print(a[2, 1])  # 12 (same element, addressed in one indexing step)
print(a[1, :])  # [ 7  8  9 10]
print(a[1, 1:3])  # [8 9]

for x in a:
    print(x)  # prints one row per iteration
for x in a.T:
    print(x)  # iterating the transpose prints one column per iteration

# flat
for x in a.flat:
    print(x)  # prints one element per iteration
print(a.flatten())  # [ 3  4  5  6  7  8  9 10 11 12 13 14]
# Section: merging NumPy arrays
# Demonstrates np.newaxis, vstack, hstack and concatenate.
a = np.array([1, 1, 1])
b = np.array([2, 2, 2])
print(a.shape)  # (3,)
print(a.T.shape)  # (3,) -- transposing a 1-D array leaves the shape unchanged
print(a[np.newaxis, :].shape)  # (1, 3)
print(a[:, np.newaxis].shape)  # (3, 1)

# vstack
print(np.vstack((a, b)))  # stacks vertically (one array per row)
# [[1 1 1]
#  [2 2 2]]

# hstack
print(np.hstack((a, b)))  # joins horizontally (side by side)
# [1 1 1 2 2 2]

# concatenate
print(np.concatenate((a, b, b), axis=0))
# [1 1 1 2 2 2 2 2 2]
# Section: splitting NumPy arrays
# Demonstrates split, array_split, vsplit and hsplit.

# split
a = np.arange(12).reshape((3, 4))
print(a)
# [[ 0  1  2  3]
#  [ 4  5  6  7]
#  [ 8  9 10 11]]
print(np.split(a, 2, axis=1))  # axis=1: split the columns into 2 equal parts
# [array([[0, 1],
#         [4, 5],
#         [8, 9]]), array([[ 2,  3],
#         [ 6,  7],
#         [10, 11]])]
print(np.split(a, 3, axis=0))  # axis=0: split the rows into 3 equal parts
# [array([[0, 1, 2, 3]]), array([[4, 5, 6, 7]]), array([[ 8,  9, 10, 11]])]

# array_split
# Unlike split, array_split accepts a count that does not divide the axis
# evenly: the 4 columns here become parts of width 2, 1 and 1.
print(np.array_split(a, 3, axis=1))
# [array([[0, 1],
#         [4, 5],
#         [8, 9]]), array([[ 2],
#         [ 6],
#         [10]]), array([[ 3],
#         [ 7],
#         [11]])]
print(np.vsplit(a, 3))  # split the rows into 3 equal parts
print(np.hsplit(a, 2))  # split the columns into 2 equal parts
# Section: NumPy assignment vs. copy
# Plain assignment binds a second name to the same array; .copy() creates an
# independent array.
a = np.arange(4)
print(a)  # [0 1 2 3]

b = a  # no copy: b and a are the same object
a[0] = 5
print(a)  # [5 1 2 3]
print(b)  # [5 1 2 3]
print(b is a)  # True

# Start again from a fresh array so the copy is taken BEFORE the change;
# otherwise b would already contain the 5 written above.
a = np.arange(4)
b = a.copy()
a[0] = 5
print(b)  # [0 1 2 3] -- the copy is unaffected by the change to a
print(b is a)  # False
Pandas基础
Dataframe基本介绍
import numpy as np
import pandas as pd
# Build a Series, a DatetimeIndex and two DataFrames.
s = pd.Series([1, 3, 6, np.nan, 44, 1])  # a one-dimensional labelled sequence
print(s)
# 0     1.0
# 1     3.0
# 2     6.0
# 3     NaN
# 4    44.0
# 5     1.0
# dtype: float64
dates = pd.date_range('20200713', periods=6)
print(dates)
# DatetimeIndex(['2020-07-13', '2020-07-14', '2020-07-15', '2020-07-16',
#                '2020-07-17', '2020-07-18'],
#               dtype='datetime64[ns]', freq='D')

# The class is spelled pd.DataFrame (capital F); pd.Dataframe raises AttributeError.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])
print(df)
# Sample output (the values are random and differ on every run):
#                    a         b         c         d
# 2020-07-13  0.388863 -0.608913  1.749353  0.061047
# 2020-07-14  0.282084  0.489508  1.200999  0.290284
# 2020-07-15  1.056817  1.996520 -0.949814 -1.999452
# 2020-07-16 -0.858179 -0.422738  1.629874 -0.866620
# 2020-07-17 -0.075192 -1.691861  2.089265 -1.997765
# 2020-07-18  0.936046  1.039739 -0.169192 -0.586105

# Without index/columns arguments, rows and columns are numbered from 0.
df2 = pd.DataFrame(np.arange(12).reshape(3, 4))
print(df2)
#    0  1   2   3
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11
# Build a DataFrame from a dict (scalars are repeated to match the longest
# column) and inspect it.
# The class is spelled pd.DataFrame (capital F); pd.Dataframe raises AttributeError.
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20200713'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})
print(df2)
#      A          B    C  D      E    F
# 0  1.0 2020-07-13  1.0  3   test  foo
# 1  1.0 2020-07-13  1.0  3  train  foo
# 2  1.0 2020-07-13  1.0  3   test  foo
# 3  1.0 2020-07-13  1.0  3  train  foo
print(df2.dtypes)
# A           float64
# B    datetime64[ns]
# C           float32
# D             int32
# E          category
# F            object
# dtype: object
# NOTE(review): the exact dtype strings for B and F may differ between pandas
# versions -- confirm against the installed version.
print(df2.index)
# Index([0, 1, 2, 3], dtype='int64')
# (pandas releases before 2.0 print Int64Index([0, 1, 2, 3], dtype='int64'))
print(df2.columns)
# Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
print(df2.values)
# [[1.0 Timestamp('2020-07-13 00:00:00') 1.0 3 'test' 'foo']
#  [1.0 Timestamp('2020-07-13 00:00:00') 1.0 3 'train' 'foo']
#  [1.0 Timestamp('2020-07-13 00:00:00') 1.0 3 'test' 'foo']
#  [1.0 Timestamp('2020-07-13 00:00:00') 1.0 3 'train' 'foo']]
print(df2.describe())  # summary statistics
print(df2.T)  # transpose
print(df2.sort_index(axis=0, ascending=False))  # ascending=False sorts the row labels in descending order
#      A          B    C  D      E    F
# 3  1.0 2020-07-13  1.0  3  train  foo
# 2  1.0 2020-07-13  1.0  3   test  foo
# 1  1.0 2020-07-13  1.0  3  train  foo
# 0  1.0 2020-07-13  1.0  3   test  foo
print(df2.sort_values(by='E'))  # sort the rows by the values in column E
#      A          B    C  D      E    F
# 0  1.0 2020-07-13  1.0  3   test  foo
# 2  1.0 2020-07-13  1.0  3   test  foo
# 1  1.0 2020-07-13  1.0  3  train  foo
# 3  1.0 2020-07-13  1.0  3  train  foo
pandas选择数据
# Select columns, row ranges and cells by label, by position and by condition.
dates = pd.date_range('20200713', periods=6)
# The class is spelled pd.DataFrame (capital F); pd.Dataframe raises AttributeError.
df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
print(df['A'])  # one column, selected by key
print(df.A)  # the same column, selected as an attribute
print(df['20200713':'20200715'])  # a label slice selects rows; both ends are included
#             A  B   C   D
# 2020-07-13  0  1   2   3
# 2020-07-14  4  5   6   7
# 2020-07-15  8  9  10  11
print(df.loc['20200714', ['A', 'B']])  # .loc selects by label
# A    4
# B    5
# Name: 2020-07-14 00:00:00, dtype: int64
print(df.iloc[3:5, 1:3])  # .iloc selects by integer position
#              B   C
# 2020-07-16  13  14
# 2020-07-17  17  18
# Mixed position/label selection: .ix has been removed from pandas, so turn
# the positions into labels and use .loc instead.
print(df.loc[df.index[:3], ['A', 'C']])
#             A   C
# 2020-07-13  0   2
# 2020-07-14  4   6
# 2020-07-15  8  10
print(df[df.A < 8])  # boolean mask: keep the rows where column A is below 8
#             A  B  C  D
# 2020-07-13  0  1  2  3
# 2020-07-14  4  5  6  7
pandas设置值
# Assign values by label, by position and by condition, and add new columns.
dates = pd.date_range('20200713', periods=6)
# The class is spelled pd.DataFrame (capital F); pd.Dataframe raises AttributeError.
df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df.loc['20200713', 'B'] = 2222  # by label
df.iloc[0, 2] = 1111  # by position
# Conditional assignment goes through a single .loc call; the chained form
# df.B[df.A > 4] = 0 may write to a temporary copy instead of df.
df.loc[df.A > 4, 'B'] = 0
df['F'] = np.nan  # new column filled with one scalar
# A Series is aligned on its index when assigned as a new column.
df['E'] = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20200713', periods=6))
print(df)
#              A     B     C   D   F  E
# 2020-07-13   0  2222  1111   3 NaN  1
# 2020-07-14   4     5     6   7 NaN  2
# 2020-07-15   8     0    10  11 NaN  3
# 2020-07-16  12     0    14  15 NaN  4
# 2020-07-17  16     0    18  19 NaN  5
# 2020-07-18  20     0    22  23 NaN  6
pandas处理丢失数据
# Introduce missing values, then drop, fill and detect them.
dates = pd.date_range('20200713', periods=6)
# The class is spelled pd.DataFrame (capital F); pd.Dataframe raises AttributeError.
df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
# NaN is a float, so make the target columns float before writing it; newer
# pandas releases warn about (or reject) writing NaN into an integer column.
df = df.astype({'B': 'float64', 'C': 'float64'})
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan
print(df)
#              A     B     C   D
# 2020-07-13   0   NaN   2.0   3
# 2020-07-14   4   5.0   NaN   7
# 2020-07-15   8   9.0  10.0  11
# 2020-07-16  12  13.0  14.0  15
# 2020-07-17  16  17.0  18.0  19
# 2020-07-18  20  21.0  22.0  23
print(df.dropna(axis=0, how='any'))  # drop rows; how is 'any' or 'all'
#              A     B     C   D
# 2020-07-15   8   9.0  10.0  11
# 2020-07-16  12  13.0  14.0  15
# 2020-07-17  16  17.0  18.0  19
# 2020-07-18  20  21.0  22.0  23
print(df.fillna(value=0))  # replace every missing value with 0
print(df.isnull().values.any())  # True if at least one value is missing
pandas导入导出
常用格式:csv,excel,pickle
# Read a CSV file into a DataFrame, print it, and save it in pickle format.
data=pd.read_csv('student.csv') # read the file
print(data)
# Student IDtnametagetgender
# 0 1100tKellyt22tFemale
# 1 1101tClot21tFemale
# 2 1102tTillyt22tFemale
# 3 1103tTonyt24tMale
# 4 1104tDavidt20tMale
# NOTE(review): the "t" between the fields above looks like a tab ("\t") whose
# backslash was lost, i.e. a tab-separated file parsed as a single column --
# confirm against student.csv.
data.to_pickle('student.pickle') # save; writes the file student.pickle
# Section: combining DataFrames with concat
# Demonstrates ignore_index, join, axis, and appending a single row.

# ignore_index
# The class is spelled pd.DataFrame (capital F); pd.Dataframe raises AttributeError.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])
print(df1)
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
print(df2)
#      a    b    c    d
# 0  1.0  1.0  1.0  1.0
# 1  1.0  1.0  1.0  1.0
# 2  1.0  1.0  1.0  1.0
print(pd.concat([df1, df2, df3], axis=0, ignore_index=True))  # ignore_index=True renumbers the rows
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  1.0  1.0  1.0  1.0
# 4  1.0  1.0  1.0  1.0
# 5  1.0  1.0  1.0  1.0
# 6  2.0  2.0  2.0  2.0
# 7  2.0  2.0  2.0  2.0
# 8  2.0  2.0  2.0  2.0

# join
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
print(pd.concat([df1, df2]))  # default join='outer': keep every column, fill gaps with NaN
#      a    b    c    d    e
# 1  0.0  0.0  0.0  0.0  NaN
# 2  0.0  0.0  0.0  0.0  NaN
# 3  0.0  0.0  0.0  0.0  NaN
# 2  NaN  1.0  1.0  1.0  1.0
# 3  NaN  1.0  1.0  1.0  1.0
# 4  NaN  1.0  1.0  1.0  1.0
print(pd.concat([df1, df2], join='inner'))  # keep only the columns present in both frames
#      b    c    d
# 1  0.0  0.0  0.0
# 2  0.0  0.0  0.0
# 3  0.0  0.0  0.0
# 2  1.0  1.0  1.0
# 3  1.0  1.0  1.0
# 4  1.0  1.0  1.0

# axis
print(pd.concat([df1, df2], axis=1))  # side by side, aligned on the row labels
#      a    b    c    d    b    c    d    e
# 1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
# 2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
# 3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
# 4  NaN  NaN  NaN  NaN  1.0  1.0  1.0  1.0

# append
# DataFrame.append was removed in pandas 2.0; turn the Series into a one-row
# frame and concatenate it instead.
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
print(pd.concat([df1, s1.to_frame().T], ignore_index=True))
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  1.0  2.0  3.0  4.0
pandas合并merge
# on
# Merge two frames on one shared key column.
# The class is spelled pd.DataFrame (capital F); pd.Dataframe raises AttributeError.
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})
print(left)
#   key   A   B
# 0  K0  A0  B0
# 1  K1  A1  B1
# 2  K2  A2  B2
# 3  K3  A3  B3
print(right)
#   key   C   D
# 0  K0  C0  D0
# 1  K1  C1  D1
# 2  K2  C2  D2
# 3  K3  C3  D3
print(pd.merge(left, right, on='key'))
#   key   A   B   C   D
# 0  K0  A0  B0  C0  D0
# 1  K1  A1  B1  C1  D1
# 2  K2  A2  B2  C2  D2
# 3  K3  A3  B3  C3  D3
# how
# Merge on two key columns and compare the four join types.
# The class is spelled pd.DataFrame (capital F); pd.Dataframe raises AttributeError.
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                     'key2': ['K0', 'K1', 'K0', 'K1'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                      'key2': ['K0', 'K0', 'K0', 'K0'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})
print(left)
#   key1 key2   A   B
# 0   K0   K0  A0  B0
# 1   K0   K1  A1  B1
# 2   K1   K0  A2  B2
# 3   K2   K1  A3  B3
print(right)
#   key1 key2   C   D
# 0   K0   K0  C0  D0
# 1   K1   K0  C1  D1
# 2   K1   K0  C2  D2
# 3   K2   K0  C3  D3
print(pd.merge(left, right, on=['key1', 'key2']))  # default how='inner'; how is one of 'left', 'right', 'inner', 'outer'
#   key1 key2   A   B   C   D
# 0   K0   K0  A0  B0  C0  D0
# 1   K1   K0  A2  B2  C1  D1
# 2   K1   K0  A2  B2  C2  D2
print(pd.merge(left, right, on=['key1', 'key2'], how='left'))  # keep every row of left
#   key1 key2   A   B    C    D
# 0   K0   K0  A0  B0   C0   D0
# 1   K0   K1  A1  B1  NaN  NaN
# 2   K1   K0  A2  B2   C1   D1
# 3   K1   K0  A2  B2   C2   D2
# 4   K2   K1  A3  B3  NaN  NaN
print(pd.merge(left, right, on=['key1', 'key2'], how='right'))  # keep every row of right
#   key1 key2    A    B   C   D
# 0   K0   K0   A0   B0  C0  D0
# 1   K1   K0   A2   B2  C1  D1
# 2   K1   K0   A2   B2  C2  D2
# 3   K2   K0  NaN  NaN  C3  D3
print(pd.merge(left, right, on=['key1', 'key2'], how='outer'))  # keep the rows of both frames
#   key1 key2    A    B    C    D
# 0   K0   K0   A0   B0   C0   D0
# 1   K0   K1   A1   B1  NaN  NaN
# 2   K1   K0   A2   B2   C1   D1
# 3   K1   K0   A2   B2   C2   D2
# 4   K2   K1   A3   B3  NaN  NaN
# 5   K2   K0  NaN  NaN   C3   D3
# NOTE(review): the order of the last two rows may differ between pandas
# versions (newer releases sort outer-join keys) -- confirm.
# indicator
# Outer merge with and without the extra column that records each row's origin.
# The class is spelled pd.DataFrame (capital F); pd.Dataframe raises AttributeError.
df1 = pd.DataFrame({'col1': [0, 1], 'col_left': ['a', 'b']})
df2 = pd.DataFrame({'col1': [1, 2, 2], 'col_right': [2, 2, 2]})
print(df1)
#    col1 col_left
# 0     0        a
# 1     1        b
print(df2)
#    col1  col_right
# 0     1          2
# 1     2          2
# 2     2          2
print(pd.merge(df1, df2, on='col1', how='outer', indicator=True))  # indicator=True adds a _merge column naming the source of each row
#    col1 col_left  col_right      _merge
# 0     0        a        NaN   left_only
# 1     1        b        2.0        both
# 2     2      NaN        2.0  right_only
# 3     2      NaN        2.0  right_only
print(pd.merge(df1, df2, on='col1', how='outer', indicator=False))
#    col1 col_left  col_right
# 0     0        a        NaN
# 1     1        b        2.0
# 2     2      NaN        2.0
# 3     2      NaN        2.0
# left_index
# Merge on the row labels of both frames instead of on a key column.
# The class is spelled pd.DataFrame (capital F); pd.Dataframe raises AttributeError.
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                     'B': ['B0', 'B1', 'B2']},
                    index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                      'D': ['D0', 'D2', 'D3']},
                     index=['K0', 'K2', 'K3'])
print(left)
#      A   B
# K0  A0  B0
# K1  A1  B1
# K2  A2  B2
print(right)
#      C   D
# K0  C0  D0
# K2  C2  D2
# K3  C3  D3
print(pd.merge(left, right, left_index=True, right_index=True, how='outer'))
#       A    B    C    D
# K0   A0   B0   C0   D0
# K1   A1   B1  NaN  NaN
# K2   A2   B2   C2   D2
# K3  NaN  NaN   C3   D3
print(pd.merge(left, right, left_index=True, right_index=True, how='inner'))
#      A   B   C   D
# K0  A0  B0  C0  D0
# K2  A2  B2  C2  D2
# suffixes
# Both frames have an 'age' column; suffixes tells merge how to rename the
# two copies in the result.
# The class is spelled pd.DataFrame (capital F); pd.Dataframe raises AttributeError.
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
print(boys)
#     k  age
# 0  K0    1
# 1  K1    2
# 2  K2    3
print(girls)
#     k  age
# 0  K0    4
# 1  K0    5
# 2  K3    6
print(pd.merge(boys, girls, on='k', suffixes=('_boy', '_girl'), how='inner'))
#     k  age_boy  age_girl
# 0  K0        1         4
# 1  K0        1         5


