python_Python

pandas

pandas的常用数据类型
- 1. Series 一维，带标签数组
- - (1). Series的创建
  - (2). Series切片和索引
  - (3). Series的索引和值
  - (4). 读取外部数据
- 2. Dataframe 二维，Series容器
- - (1). Dataframe的基础属性
  - (2). Dataframe的整体情况查询
- 3. loc与iloc
- 4. 布尔索引
- 5. 字符串方法
- 6. 缺失数据的处理
- 7. 数据合并之join
- 8. 分组和聚合
- 9. 索引和复合索引
- - (1). Series的复合索引
  - (2). Dataframe的复合索引
- 10. 代码练习

numpy能够帮助我们处理数值，但是pandas除了处理数值之外(基于numpy)，还能够帮助我们处理其他类型的数据

pandas的常用数据类型 1. Series 一维，带标签数组 (1). Series的创建

import string

import numpy as np
import pandas as pd

# Series holding 0..9, labelled with the first ten capital letters.
first_ten = list(string.ascii_uppercase[:10])
t = pd.Series(np.arange(10), index=first_ten)
print(t)
print(type(t))

# Plain dict mapping each of those letters to its position.
a = dict(zip(string.ascii_uppercase, range(10)))
print(a)
print(pd.Series(a))

# Build a Series from the dict but ask for labels 'F'..'O': labels that are
# missing from the dict ('K'..'O') come back as NaN.
shifted_labels = list(string.ascii_uppercase[5:15])
a = pd.Series(a, index=shifted_labels)
print(a)

运行结果:
A    0
B    1
C    2
D    3
E    4
F    5
G    6
H    7
I    8
J    9
dtype: int32
<class 'pandas.core.series.Series'>
{'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8, 'J': 9}
A    0
B    1
C    2
D    3
E    4
F    5
G    6
H    7
I    8
J    9
dtype: int64
F    5.0
G    6.0
H    7.0
I    8.0
J    9.0
K    NaN
L    NaN
M    NaN
N    NaN
O    NaN
dtype: float64

(2). Series切片和索引

简单的索引操作：
获取index：df.index
指定index：df.index = ['x', 'y']
重新设置index：df.reindex(list("abcedf"))
指定某一列作为index：df.set_index("Country", drop=False)
返回index的唯一值：df.set_index("Country").index.unique()

#coding=utf-8                                                                         
import string                                                                         
                                                                                      
import numpy as np                                                                    
import pandas as pd                                                                   
                                                                                      
# Series holding 0..9, labelled 'A'..'J'.
t = pd.Series(np.arange(10), index=list(string.ascii_uppercase[:10]))

# Positional slice: every second element from position 2 up to (excluding) 10.
every_other = t.iloc[2:10:2]
print(every_other)

# A list of positions selects the 3rd, 4th and 7th rows. `.iloc` is used
# because the labels are strings: plain `t[[2, 3, 6]]` depends on the
# deprecated "treat integers as positions" fallback of Series.__getitem__.
picked = t.iloc[[2, 3, 6]]
print(picked)

# Boolean mask: keep only the entries whose value is greater than 4.
print(t[t > 4])

# Label lookup: the value stored under 'F'.
print(t['F'])

运行结果:
C    2
E    4
G    6
I    8
dtype: int32
C    2
D    3
G    6
dtype: int32
F    5
G    6
H    7
I    8
J    9
dtype: int32
5

(3). Series的索引和值

import string           
                        
import numpy as np      
import pandas as pd     
                        
# Series holding 0..9, labelled 'A'..'J'.
labels = list(string.ascii_uppercase[:10])
t = pd.Series(np.arange(10), index=labels)

# Show the label object and the underlying value array, each followed by
# its Python type.
for part in (t.index, t.values):
    print(part)
    print(type(part))

运行结果:
Index(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'], dtype='object')
<class 'pandas.core.indexes.base.Index'>
[0 1 2 3 4 5 6 7 8 9]
<class 'numpy.ndarray'>
(4). 读取外部数据

import pandas as pd
from pymongo import MongoClient
# Placeholder values: replace with the real CSV path and the header of the
# column to sort by.
csv_path = '文件路径'
sort_column = '对应文件路径下，某一列的标题'

# ascending=True sorts smallest-first, ascending=False largest-first.
df = pd.read_csv(csv_path).sort_values(by=sort_column, ascending=False)

2. Dataframe 二维，Series容器 (1). Dataframe的基础属性

import string

import numpy as np
import pandas as pd

# 3x4 frame holding 0..11. Rows are labelled 'A'..'C'; the columns take the
# last four capital letters ('W'..'Z') via string.ascii_uppercase[-4:].
# The class is spelled `DataFrame`; `pd.Dataframe` raises AttributeError.
t = pd.DataFrame(
    np.arange(12).reshape((3, 4)),
    index=list(string.ascii_uppercase[:3]),
    columns=list(string.ascii_uppercase[-4:]),
)
print(t)
print(t.shape)    # (number of rows, number of columns)
print(t.dtypes)   # dtype of each column
print(t.ndim)     # number of dimensions
print(t.index)    # row labels
print(t.columns)  # column labels
print(t.values)   # the data as a two-dimensional ndarray

运行结果:
   W  X   Y   Z
A  0  1   2   3
B  4  5   6   7
C  8  9  10  11
(3, 4)
W    int32
X    int32
Y    int32
Z    int32
dtype: object
2
Index(['A', 'B', 'C'], dtype='object')
Index(['W', 'X', 'Y', 'Z'], dtype='object')
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]

(2). Dataframe的整体情况查询

import string

import numpy as np
import pandas as pd

# 3x4 frame holding 0..11. Rows are labelled 'A'..'C'; the columns take the
# last four capital letters ('W'..'Z') via string.ascii_uppercase[-4:].
# The class is spelled `DataFrame`; `pd.Dataframe` raises AttributeError.
t = pd.DataFrame(
    np.arange(12).reshape((3, 4)),
    index=list(string.ascii_uppercase[:3]),
    columns=list(string.ascii_uppercase[-4:]),
)
print(t)
print(t.head(2))      # first rows (5 when no count is given)
print(t.tail(2))      # last rows (5 when no count is given)
# info() writes its overview (row count, columns, non-null counts, dtypes)
# itself and returns None, which is why wrapping it in print() emits an
# extra "None" line after the report.
print(t.info())
print(t.describe())   # quick summary statistics: count, mean, std, quartiles...

运行结果:
W  X   Y   Z
A  0  1   2   3
B  4  5   6   7
C  8  9  10  11
   W  X  Y  Z
A  0  1  2  3
B  4  5  6  7
   W  X   Y   Z
B  4  5   6   7
C  8  9  10  11

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, A to C
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   W       3 non-null      int32
 1   X       3 non-null      int32
 2   Y       3 non-null      int32
 3   Z       3 non-null      int32
dtypes: int32(4)
memory usage: 72.0+ bytes
None
         W    X     Y     Z
count  3.0  3.0   3.0   3.0
mean   4.0  5.0   6.0   7.0
std    4.0  4.0   4.0   4.0
min    0.0  1.0   2.0   3.0
25%    2.0  3.0   4.0   5.0
50%    4.0  5.0   6.0   7.0
75%    6.0  7.0   8.0   9.0
max    8.0  9.0  10.0  11.0

3. loc与iloc

df.loc 通过标签索引行数据
df.iloc 通过位置获取行数据

loc要写行或者列的名字

import string

import numpy as np
import pandas as pd

# 3x4 frame holding 0..11. Rows are labelled 'A'..'C'; the columns take the
# last four capital letters ('W'..'Z') via string.ascii_uppercase[-4:].
# The class is spelled `DataFrame`; `pd.Dataframe` raises AttributeError.
t = pd.DataFrame(
    np.arange(12).reshape((3, 4)),
    index=list(string.ascii_uppercase[:3]),
    columns=list(string.ascii_uppercase[-4:]),
)
print(t)

# .loc selects by label: [row label(s), column label(s)].
print(t.loc['A', 'W'])                  # single cell
print(t.loc['A', ['W', 'X']])           # one row, two columns
print(type(t.loc['A', ['W', 'X']]))
print(t.loc[['A', 'B'], ['W', 'Z']])    # two rows, two columns
# Unlike positional slices, a label slice includes both end points,
# so 'A':'C' returns rows A, B and C.
print(t.loc['A':'C', ['W', 'X']])

# Assigning through .loc overwrites the single cell at row 'A', column 'W'.
t.loc['A', 'W'] = 10
print(t)

运行结果:
   W  X   Y   Z
A  0  1   2   3
B  4  5   6   7
C  8  9  10  11
0
W    0
X    1
Name: A, dtype: int32
<class 'pandas.core.series.Series'>

   W  Z
A  0  3
B  4  7
   W  X
A  0  1
B  4  5
C  8  9
    W  X   Y   Z
A  10  1   2   3
B   4  5   6   7
C   8  9  10  11

iloc写数字

# .iloc selects by integer position: rows 1 and 2 (the end of a positional
# slice is excluded) and the columns at positions 2 and 3.
lower_right = t.iloc[1:3, [2, 3]]
print(lower_right)

# Write 0 into the cell at row position 0, column position 0.
t.iloc[0, 0] = 0
print(t)
运行结果:
    Y   Z
B   6   7
C  10  11
   W  X   Y   Z
A  0  1   2   3
B  4  5   6   7
C  8  9  10  11

4. 布尔索引

import string

import numpy as np
import pandas as pd
from pymongo import MongoClient
# Load the dog-name counts and order them from most to least frequent.
df = pd.read_csv('./dogNames2.csv').sort_values(by='Count_AnimalName', ascending=False)

# Build one boolean mask per condition and combine them with `&`:
# used more than 700 times AND name longer than 4 characters.
is_popular = df['Count_AnimalName'] > 700
is_long_name = df['Row_Labels'].str.len() > 4
print(df[is_popular & is_long_name])

运行结果:
      Row_Labels  Count_AnimalName
1156       BELLA              1195
2660     CHARLIE               856
12368      ROCKY               823
8552       LUCKY               723

5. 字符串方法

6. 缺失数据的处理

import string

import numpy as np
import pandas as pd

# 3x4 frame holding 0..11. Rows are labelled 'A'..'C'; the columns take the
# last four capital letters ('W'..'Z') via string.ascii_uppercase[-4:].
# The class is spelled `DataFrame`; `pd.Dataframe` raises AttributeError.
t = pd.DataFrame(
    np.arange(12).reshape((3, 4)),
    index=list(string.ascii_uppercase[:3]),
    columns=list(string.ascii_uppercase[-4:]),
)
print(t)

# NaN cannot live in an integer column. Cast 'W' to float explicitly before
# writing NaN instead of relying on a silent upcast during assignment, which
# recent pandas versions warn about.
t['W'] = t['W'].astype('float64')
t.loc['A':'B', 'W'] = np.nan
print(pd.isnull(t))   # True wherever a cell is missing

# To drop missing data instead:
#     t.dropna(axis=0, how='any', inplace=True)
# axis picks rows (0) or columns (1); how='any' drops when at least one
# value is NaN, how='all' only when every value is NaN; inplace=True
# modifies t itself instead of returning a new frame.
print(t)

# Fill every NaN with the mean of its own column. fillna() also accepts a
# plain number to use as the replacement everywhere.
print(t.fillna(t.mean()))

# Fill a single column only (here 'X') with that column's mean.
print(t['X'].fillna(t['X'].mean()))

运行结果:
   W  X   Y   Z
A  0  1   2   3
B  4  5   6   7
C  8  9  10  11
    Y   Z
B   6   7
C  10  11
       W      X      Y      Z
A   True  False  False  False
B   True  False  False  False
C  False  False  False  False
     W  X   Y   Z
A  NaN  1   2   3
B  NaN  5   6   7
C  8.0  9  10  11
     W  X   Y   Z
A  8.0  1   2   3
B  8.0  5   6   7
C  8.0  9  10  11
A    1
B    5
C    9
Name: X, dtype: int32

7. 数据合并之join

#coding=utf-8
import numpy as np
import pandas as pd
import string
# Three small frames used to compare join() and merge().
# The class is spelled `DataFrame`; `pd.Dataframe` raises AttributeError.
df1 = pd.DataFrame(np.ones((2, 4)), index=['A', 'B'], columns=list('abcd'))
df2 = pd.DataFrame(np.zeros((3, 3)), index=['A', 'B', 'C'], columns=list('xyz'))
df3 = pd.DataFrame(np.arange(9).reshape(3, 3), columns=list('fax'))
print(df1)
print(df2)
print(df3)

# join() lines frames up by row label and keeps the caller's rows:
# df1.join(df2) has df1's rows A, B; df2.join(df1) has df2's rows A, B, C,
# with NaN where df1 has no matching row.
print(df1.join(df2))
print(df2.join(df1))

# merge() lines frames up by the values of a column (here 'a').
# how='inner' (the default) keeps only rows whose 'a' value occurs in both
# frames, appending df3's other columns to the matching df1 rows.
print(df1.merge(df3, on='a', how='inner'))
# how='outer' is the union: every row from both sides, NaN where one side
# has nothing to contribute.
print(df1.merge(df3, on='a', how='outer'))
# how='left' keeps every row of df1; how='right' keeps every row of df3.
print(df1.merge(df3, on='a', how='left'))
print(df1.merge(df3, on='a', how='right'))
#默认how是inner，outer就是并集，给输出的都输出，没有的用nan表示
#left就是以df1为准，right就是以df3为准

运行结果:
     a    b    c    d
A  1.0  1.0  1.0  1.0
B  1.0  1.0  1.0  1.0
     x    y    z
A  0.0  0.0  0.0
B  0.0  0.0  0.0
C  0.0  0.0  0.0
   f  a  x
0  0  1  2
1  3  4  5
2  6  7  8
     a    b    c    d    x    y    z
A  1.0  1.0  1.0  1.0  0.0  0.0  0.0
B  1.0  1.0  1.0  1.0  0.0  0.0  0.0
     x    y    z    a    b    c    d
A  0.0  0.0  0.0  1.0  1.0  1.0  1.0
B  0.0  0.0  0.0  1.0  1.0  1.0  1.0
C  0.0  0.0  0.0  NaN  NaN  NaN  NaN
     a    b    c    d  f  x
0  1.0  1.0  1.0  1.0  0  2
1  1.0  1.0  1.0  1.0  0  2
     a    b    c    d  f  x
0  1.0  1.0  1.0  1.0  0  2
1  1.0  1.0  1.0  1.0  0  2
2  4.0  NaN  NaN  NaN  3  5
3  7.0  NaN  NaN  NaN  6  8
     a    b    c    d  f  x
0  1.0  1.0  1.0  1.0  0  2
1  1.0  1.0  1.0  1.0  0  2
     a    b    c    d  f  x
0  1.0  1.0  1.0  1.0  0  2
1  1.0  1.0  1.0  1.0  0  2
2  4.0  NaN  NaN  NaN  3  5
3  7.0  NaN  NaN  NaN  6  8

# Lift pandas' display limits so printed frames show every column and
# every row instead of being truncated.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)

问题:现在我们有一组关于全球星巴克店铺的统计数据，如果我想知道美国的星巴克数量和中国的哪个多，或者我想知道中国每个省份星巴克的数量的情况，那么应该怎么办？

'''
现在我们有一组关于全球星巴克店铺的统计数据，如果我想知道美国的星巴克数量和中国的哪个多，
或者我想知道中国每个省份星巴克的数量的情况，那么应该怎么办？
'''
#美国的星巴克数量和中国的哪个多，
import pandas as pd
from matplotlib import pyplot as plt
file_path = './starbucks_store_worldwide.csv'
df = pd.read_csv(file_path)
pd.set_option('display.max_columns', None)
# info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
df.info()

# Split the rows into one group per value of the Country column.
grouped = df.groupby(by='Country')
print(grouped)   # a DataFrameGroupBy object, not the data itself

# Iterating over a groupby yields (group key, sub-frame) pairs.
for country, stores in grouped:
    print(country)   # the country code
    print('*' * 100)
    print(stores)    # every row belonging to that country
    print('-' * 100)

# Aggregate once and reuse the result: number of stores per country.
country_count = grouped['Brand'].count()
print(country_count)
print(country_count['US'])
print(country_count['CN'])

# Number of stores in each Chinese province. Selecting 'Brand' before
# count() avoids counting every other column only to discard the result.
China_data = df[df['Country'] == 'CN']
group = China_data.groupby(by='State/Province')['Brand'].count()
print(group)

# Grouping a single column (a Series) by several keys returns a Series.
group = df['Brand'].groupby(by=[df['Country'], df['State/Province']]).count()
print(group, type(group))

# Selecting with a list (df[['Brand']]) keeps a DataFrame, so the grouped
# result is a DataFrame as well.
group = df[['Brand']].groupby(by=[df['Country'], df['State/Province']]).count()
print(group, type(group))

# Grouping by two keys produces a two-level index on the result.
print(group.index)

运行结果:
文件在pythonproject04中，结果太长了这儿不写了

8. 分组和聚合

在pandas中类似的分组的操作我们有很简单的方式来完成
df.groupby(by="columns_name")
如果我们需要对国家和省份进行分组统计，应该怎么操作呢？

grouped = df.groupby(by=[df["Country"], df["State/Province"]])

很多时候我们只希望对获取分组之后的某一部分数据，或者说我们只希望对某几列数据进行分组，这个时候我们应该怎么办呢？

获取分组之后的某一部分数据：

df.groupby(by=["Country", "State/Province"])["Country"].count()

对某几列数据进行分组：

df["Country"].groupby(by=[df["Country"], df["State/Province"]]).count()

上述三个结果一样

9. 索引和复合索引 (1). Series的复合索引

import numpy as np
import pandas as pd
import string

# 3x4 frame holding 0..11 with rows 'a'..'c' and columns 'w'..'z'.
# The class is spelled `DataFrame`; `pd.Dataframe` raises AttributeError.
t = pd.DataFrame(np.arange(12).reshape(3, 4), index=list('abc'), columns=list('wxyz'))
print(t)

# Replace the row labels; the new list needs exactly one label per row.
t.index = ['A', 'B', 'C']
print(t.index)

# reindex() picks rows by label; a label that does not exist ('F') yields
# a row of NaN.
print(t.reindex(['A', 'F']))

# set_index() promotes a column to be the row index. By default (drop=True)
# the column is removed from the data; drop=False keeps it as a column too.
print(t.set_index('w').index)
print(t.set_index('w', drop=False))

# unique() returns each distinct value once.
print(t['y'].unique())
print(len(t.set_index('w').index.unique()))
print(len(t.set_index('w').index))

# Passing several columns builds a multi-level (composite) index.
print(t.set_index(['w', 'x', 'y']))
print(t.set_index(['w', 'x', 'y'], drop=False).index)

运行结果:
   w  x   y   z
a  0  1   2   3
b  4  5   6   7
c  8  9  10  11
Index(['A', 'B', 'C'], dtype='object')
     w    x    y    z
A  0.0  1.0  2.0  3.0
F  NaN  NaN  NaN  NaN
Int64Index([0, 4, 8], dtype='int64', name='w')
   w  x   y   z
w              
0  0  1   2   3
4  4  5   6   7
8  8  9  10  11
[ 2  6 10]
3
3
         z
w x y     
0 1 2    3
4 5 6    7
8 9 10  11
MultiIndex([(0, 1,  2),
            (4, 5,  6),
            (8, 9, 10)],
           names=['w', 'x', 'y'])

(2). Dataframe的复合索引

import pandas as pd
import string
# Seven-row frame: 'a' counts up, 'b' counts down, 'c' and 'd' are labels.
# The class is spelled `DataFrame`; `pd.Dataframe` raises AttributeError.
a = pd.DataFrame({
    'a': range(7),
    'b': range(7, 0, -1),
    'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
    'd': list('hjklmno'),
})
print(a)

# Use columns 'c' and 'd' together as a two-level row index.
b = a.set_index(['c', 'd'])
print(b)

# Selecting column 'a' gives a Series that keeps the two-level index.
c = b['a']
print(c)
print(c['two']['m'])   # value at outer label 'two', inner label 'm'
print(c['two'])        # every value under outer label 'two'

# The order of the list decides the level order: 'd' becomes the outer
# level here and 'c' the inner one.
d = a.set_index(['d', 'c'])['a']
print(d)

运行结果:
   a  b    c  d
0  0  7  one  h
1  1  6  one  j
2  2  5  one  k
3  3  4  two  l
4  4  3  two  m
5  5  2  two  n
6  6  1  two  o
       a  b
c   d      
one h  0  7
    j  1  6
    k  2  5
two l  3  4
    m  4  3
    n  5  2
    o  6  1
c    d
one  h    0
     j    1
     k    2
two  l    3
     m    4
     n    5
     o    6
Name: a, dtype: int64
4
d
l    3
m    4
n    5
o    6
Name: a, dtype: int64
d  c  
h  one    0
j  one    1
k  one    2
l  two    3
m  two    4
n  two    5
o  two    6
Name: a, dtype: int64

10. 代码练习

'''
使用matplotlib呈现出店铺总数排名前10的国家
'''
import pandas as pd
from matplotlib import pyplot as plt
file_path = './starbucks_store_worldwide.csv'
df = pd.read_csv(file_path)
pd.set_option('display.max_columns', None)

# Count rows per country, take the Brand counts, order them largest-first
# and keep the first ten entries.
brand_counts = df.groupby(by='Country').count()['Brand']
ranked = brand_counts.sort_values(ascending=False)
data1 = ranked[:10]
for view in (data1, data1.index, data1.values):
    print(view)

# Country codes go on the x axis, store counts on the y axis.
_x = data1.index
_y = data1.values
positions = range(len(_x))

# Vertical bar chart with one bar per country.
plt.figure(figsize=(20, 8), dpi=80)
plt.bar(positions, _y, width=0.3, color='orange')
plt.xticks(positions, _x)
plt.show()

运行结果:

'''
使用matplotlib呈现出中国每个城市的店铺数量
'''
import pandas as pd
from matplotlib import pyplot as plt
file_path = './starbucks_store_worldwide.csv'
df = pd.read_csv(file_path)
pd.set_option('display.max_columns', None)

# Keep only the stores located in China.
in_china = df['Country'] == 'CN'
df = df[in_china]

# Count rows per city, take the Brand counts and order them largest-first
# (sort_values: ascending=False is descending, ascending=True is ascending),
# then keep the first 25 entries.
city_counts = df.groupby(by='City').count()['Brand']
data1 = city_counts.sort_values(ascending=False)[:25]

# City names label the bars, store counts give the bar lengths.
_x = data1.index
_y = data1.values
slots = range(len(_x))

# Horizontal bar chart with one bar per city.
plt.figure(figsize=(20, 12), dpi=80)
plt.barh(slots, _y, height=0.3, color='orange')
plt.yticks(slots, _x)
plt.show()

运行结果:

'''
现在我们有全球排名靠前的10000本书的数据，那么请统计一下下面几个问题：
1.不同年份书的数量
'''
import pandas as pd
from matplotlib import pyplot as plt

file_path = './books.csv'
df = pd.read_csv(file_path)
pd.set_option('display.max_columns', None)

print(df.info())

# Keep only the rows that have a publication year; the info() report is
# what reveals that some years are missing.
year_present = pd.notnull(df['original_publication_year'])
data1 = df[year_present]

# Number of titles for each publication year.
per_year = data1.groupby(by='original_publication_year').count()
grouped = per_year['title']
print(grouped)

运行结果:
-1750.0      1
-762.0       1
-750.0       2
-720.0       1
-560.0       1
          ... 
 2013.0    518
 2014.0    437
 2015.0    306
 2016.0    198
 2017.0     11

'''
2.不同年份书的平均评分情况
'''
import pandas as pd
from matplotlib import pyplot as plt

file_path = './books.csv'
df = pd.read_csv(file_path)
pd.set_option('display.max_columns', None)

# Keep only the rows that have a publication year.
year_present = pd.notnull(df['original_publication_year'])
data1 = df[year_present]

# Mean rating of the books published in each year.
ratings = data1['average_rating']
grouped = ratings.groupby(by=data1['original_publication_year']).mean()

print(grouped)

_x = grouped.index
_y = grouped.values
positions = range(len(_x))

# Line chart of the mean rating, one point per year in index order.
plt.figure(figsize=(20, 8), dpi=80)
plt.plot(positions, _y)

# Label every tenth point with its year (shown as an integer), rotated so
# neighbouring labels do not overlap.
tick_positions = list(positions)[::10]
tick_labels = _x[::10].astype(int)
plt.xticks(tick_positions, tick_labels, rotation=45)
plt.show()

运行结果:

python

Python相关栏目本月热门文章