[Python数据分析] pandas库基础

pandas库是一个基于numpy、专门为数据分析而设计的库,该库不仅提供了大量模块及一些标准的数据模型,而且提供了高效操作数据集的数据结构,被广泛地应用到众多领域中.

1.Series

# Series类对象的结构类似一维数组,主要由数据和索引组成,数据可以是任意类型
# pandas.Series(data(传入数据)=None, index(传入索引)=None, dtype(数据类型)=None, name(Series类对象名称)=None, copy(复制数据)=False)

import pandas as pd

# Build a Series from a plain list; no index is given, so pandas assigns 0..n-1.
ser_obj = pd.Series(['Python', 'Java', 'PHP'])
print(ser_obj)
'''输出结果
0    Python
1      Java
2       PHP
dtype: object
'''

# Build a Series and supply explicit labels through ``index``.
ser_obj_01 = pd.Series(['Python', 'Java', 'PHP'], index=['one', 'two', 'three'])
print(ser_obj_01)
'''输出结果
one      Python
two        Java
three       PHP
dtype: object
'''

# Build a Series from a dict: the keys become the index labels.
data = {'one': 'Python', 'two': 'Java', 'three': 'PHP'}
ser_obj_02 = pd.Series(data)
print(ser_obj_02)
'''输出结果
one      Python
two        Java
three       PHP
dtype: object
'''

# 时间序列(动态序列)是指将同一统计指标的数值按其发生的时间先后顺序排列成的数列
# Series类对象或DataFrame类对象可以指定索引为时间索引,生成一个时间序列,代码如下:

import datetime

# Parse the date strings into a DatetimeIndex.
date_index = pd.to_datetime(['20210820', '20210828', '20210908'])
print(date_index)
'''输出结果
DatetimeIndex(['2021-08-20', '2021-08-28', '2021-09-08'], dtype='datetime64[ns]', freq=None)
'''
# Build a Series whose index is the DatetimeIndex, i.e. a time series.
date_ser = pd.Series([11, 22, 33], index=date_index)
print(date_ser)
'''输出结果
2021-08-20    11
2021-08-28    22
2021-09-08    33
dtype: int64
'''

2.DataFrame

# DataFrame类对象结构类似二维数组或表格,DataFrame也由索引和数据组成,但该对象有行索引和列索引
# pd.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False)

import pandas as pd
import numpy as np

# Create a 2x3 two-dimensional array to use as the frame's data.
demo_arr = np.array([['a', 'b', 'c'], ['d', 'e', 'f']])
# Build a DataFrame from the array; row and column labels default to 0..n-1.
# (The class is spelled ``DataFrame``; ``pd.Dataframe`` raises AttributeError.)
df_obj = pd.DataFrame(demo_arr)
print(df_obj)
'''输出结果
   0  1  2
0  a  b  c
1  d  e  f
'''

# Build a DataFrame and supply both the row labels and the column labels.
df_obj_01 = pd.DataFrame(demo_arr, index=['row_01', 'row_02'], columns=['col_01', 'col_02', 'col_03'])
print(df_obj_01)
'''输出结果
       col_01 col_02 col_03
row_01      a      b      c
row_02      d      e      f
'''

# MultiIndex constructors:
# from_tuples()  - build a hierarchical index from a list of tuples
# from_arrays()  - build a hierarchical index from a list of arrays
# from_product() - build a hierarchical index from the Cartesian product of iterables
# from_frame()   - build a hierarchical index from a DataFrame

# Example with from_tuples(): a DataFrame with two-level row and column indexes.
tuple_clo = [('a', 0), ('a', 1), ('b', 2), ('b', 2)]
tuple_row = [('a', 0), ('a', 1), ('b', 2), ('b', 2)]
multi_index_col = pd.MultiIndex.from_tuples(tuples=tuple_clo)
multi_index_row = pd.MultiIndex.from_tuples(tuples=tuple_row)
data = [['A', 'B', 'C', 'D'], ['E', 'F', 'G', 'H'], ['I', 'J', 'K', 'L'], ['M', 'N', 'O', 'P']]
# The row index goes to ``index`` and the column index to ``columns``. Both are
# built from the same tuples here, so the printed frame is the same either way.
df = pd.DataFrame(data, index=multi_index_row, columns=multi_index_col)
print(df)
'''输出结果
     a     b   
     0  1  2  2
a 0  A  B  C  D
  1  E  F  G  H
b 2  I  J  K  L
  2  M  N  O  P
'''

3.使用单层索引访问数据

import pandas as pd

'''使用单层索引访问数据'''
# (1) Access data with []
# Form: variable[index]
ser = pd.Series(['A', 'B', 'C', 'D'],
                index=['one', 'two', 'three', 'four'])  # create a Series
print(ser)
'''输出结果
one      A
two      B
three    C
four     D
dtype: object
'''
# Access the element labelled 'one'.
print(ser['one'])
'''输出结果
A
'''

# Create a DataFrame with integer row labels 4..6 and column labels 'A'..'C'.
# (The class is spelled ``DataFrame``; ``pd.Dataframe`` raises AttributeError.)
df = pd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]],
                  index=[4, 5, 6], columns=['A', 'B', 'C'])
print(df)
'''输出结果
    A   B   C
4   0   2   3
5   0   4   1
6  10  20  30
'''
# On a DataFrame, [] selects by column label: here the column 'A'.
print(df['A'])
'''输出结果
4     0
5     0
6    10
Name: A, dtype: int64
'''

# (2) Access data with loc and iloc
# Forms:
# variable.loc[label]      selects by index label
# variable.iloc[position]  selects by zero-based integer position
# On a DataFrame both treat a single key as a row selector and return that row.
print(ser.loc['two'])  # element labelled 'two'
print(ser.iloc[2])  # element at integer position 2
'''输出结果
B
C
'''
print(df.loc[4])  # row labelled 4
'''输出结果
A    0
B    2
C    3
Name: 4, dtype: int64
'''
print(df.iloc[1])  # row at integer position 1 (its label is 5)
'''输出结果
A    0
B    4
C    1
Name: 5, dtype: int64
'''

# (3) Access data with at and iat
# These return a single cell of a DataFrame:
# variable.at[row label, column label]         label based
# variable.iat[row position, column position]  integer-position based
print(df.at[5, 'B'])  # cell at row label 5, column label 'B'
print(df.iat[1, 1])  # cell at row position 1, column position 1
'''输出结果
4
4
'''

4.使用分层索引访问数据

import pandas as pd
import numpy as np

'''使用分层索引访问数据'''
# (1) Access data with []
# Forms:
# variable[first-level label]                      -> the nested second-level labels and their data
# variable[first-level label][second-level label]  -> the data stored under that second-level label
outer_labels = ['计算机专业'] * 4 + ['体育专业'] * 3
inner_labels = ['物联网专业', '软件工程', '网络安全', '信息安全',
                '体育专业', '休闲体育', '运动康复']
scores = [95, 103, 80, 80, 90, 91, 91]
# A list of two equally long label lists yields a two-level index.
mult_series = pd.Series(scores, index=[outer_labels, inner_labels])
print(mult_series)
'''输出结果
计算机专业  物联网专业     95
          软件工程     103
          网络安全      80
          信息安全      80
体育专业   体育专业      90
          休闲体育      91
          运动康复      91
dtype: int64
'''
# Select everything under the first-level label '计算机专业'.
computer_majors = mult_series['计算机专业']
print(computer_majors)
'''输出结果
物联网专业     95
软件工程     103
网络安全      80
信息安全      80
dtype: int64
'''
# Within that selection, pick the second-level label '软件工程'.
print(computer_majors['软件工程'])
'''输出结果
103
'''

# Two parallel label arrays that become the two levels of the row index.
arrays = ['a', 'a', 'b', 'b'], [1, 2, 1, 2]
# 4x3 frame of 0..11 with a two-level row index and a two-level column index.
# (The class is spelled ``DataFrame``; ``pd.Dataframe`` raises AttributeError.)
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
                     index=pd.MultiIndex.from_arrays(arrays),
                     columns=[['A', 'A', 'b'],
                              ['Green', 'Red', 'Green']])
print(frame)
'''输出结果
        A         b
    Green Red Green
a 1     0   1     2
  2     3   4     5
b 1     6   7     8
  2     9  10    11
'''
print(frame['A'])  # columns under the first-level column label 'A'
'''输出结果
     Green  Red
a 1      0    1
  2      3    4
b 1      6    7
  2      9   10
'''
print(frame['A']['Green'])  # the 'Green' column nested under 'A'
'''输出结果
a  1    0
   2    3
b  1    6
   2    9
Name: Green, dtype: int32
'''

# (2) Access data with loc and iloc
# Forms:
# variable.loc[first-level label]                      -> data under that first-level label
# variable.loc[first-level label][second-level label]  -> data under that second-level label
# variable.iloc[integer position]                      -> data at that integer position
print(frame.loc['a'])  # rows under the first-level ROW label 'a'
'''输出结果
      A         b
  Green Red Green
1     0   1     2
2     3   4     5
'''
print(frame.loc['a', 'A'])  # rows under row label 'a', restricted to first-level column 'A'
'''输出结果
   Green  Red
1      0    1
2      3    4
'''
print(frame.iloc[2])  # the row at integer position 2, i.e. row ('b', 1)
'''输出结果
A  Green    6
   Red      7
b  Green    8
Name: (b, 1), dtype: int32
'''

5.重新索引

import pandas as pd

# reindex()方法格式 reindex(labels=None, index=None, columns=None, axis=None,
#         method=None, copy=True, level=None, fill_value=nan, limit=None,
#         tolerance=None)

index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror']
# One row per browser, labelled by the names in ``index``.
# (The class is spelled ``DataFrame``; ``pd.Dataframe`` raises AttributeError.)
df = pd.DataFrame({'http_status': [200, 200, 404, 404, 301],
                   'responese_time': [0.04, 0.02, 0.07, 0.08, 1.0]},
                  index=index)
print(df)
'''输出结果
           http_status  responese_time
Firefox            200            0.04
Chrome             200            0.02
Safari             404            0.07
IE10               404            0.08
Konqueror          301            1.00
'''
# Reindex the rows: labels that exist in ``df`` keep their data,
# labels that do not exist get NaN.
new_index = ['Safari', 'Icewease1', 'Comodo Dragon', 'IE10', 'Chrome']
new_df = df.reindex(new_index)
print(new_df)
'''输出结果
               http_status  responese_time
Safari               404.0            0.07
Icewease1              NaN             NaN
Comodo Dragon          NaN             NaN
IE10                 404.0            0.08
Chrome               200.0            0.02
'''

# As shown above, some of the new labels have no source data.
# The ``fill_value`` argument fills those missing entries with a chosen value.
new_df = df.reindex(new_index, fill_value='missing')
print(new_df)
'''输出结果
              http_status responese_time
Safari                404           0.07
Icewease1         missing        missing
Comodo Dragon     missing        missing
IE10                  404           0.08
Chrome                200           0.02
'''

# reindex() can also realign the column labels, not just the row labels.
col_df = df.reindex(columns=['http_status', 'user_agent'])
print(col_df)
'''输出结果
           http_status  user_agent
Firefox            200         NaN
Chrome             200         NaN
Safari             404         NaN
IE10               404         NaN
Konqueror          301         NaN
'''

6.数据排序

# 按索引排序

# sort_index(axis=0, level=None, ascending=True, inplace=False,
#            kind='quicksort', na_position='last', sort_remaining=True,
#            ignore_index: bool = False)
# axis: 轴编号(排序的方向), 0表示按行排序, 1表示按列排序
# level: 表示按哪个索引层级排序
# ascending: 表示是否以升序方式排列
# kind: 表示排序算法
import numpy as np
import pandas as pd

# Create a 3x3 DataFrame whose row and column labels are deliberately unsorted.
# (The class is spelled ``DataFrame``; ``pd.Dataframe`` raises AttributeError.)
df = pd.DataFrame(np.arange(9).reshape((3, 3)),
                  columns=['c', 'a', 'b'], index=['B', 'C', 'A'])
print(df)
'''输出结果
   c  a  b
B  0  1  2
C  3  4  5
A  6  7  8
'''
row_sort = df.sort_index()  # sort the rows by row label, ascending
print(row_sort)
'''输出结果
   c  a  b
A  6  7  8
B  0  1  2
C  3  4  5
'''
col_sort = df.sort_index(axis=1)  # sort the columns by column label, ascending
print(col_sort)
'''输出结果
   a  b  c
B  1  2  0
C  4  5  3
A  7  8  6
'''

# 按值排序

# sort_values(by, axis=0, ascending=True, inplace=False,
#             kind='quicksort', na_position='last', ignore_index=False)
# by: 指定列索引名或行索引名进行排序
# na_position: 表示缺失值显示位置,可以取first首位或last末位

# Create a DataFrame that contains a missing value (NaN) in 'col_B'.
# (The class is spelled ``DataFrame``; ``pd.Dataframe`` raises AttributeError.)
df = pd.DataFrame({'col_A': [1, 1, 4, 6],
                   'col_B': [4, np.nan, 4, 2],
                   'col_C': [6, 3, 8, 0]})
print(df)
'''输出结果
   col_A  col_B  col_C
0      1    4.0      6
1      1    NaN      3
2      4    4.0      8
3      6    2.0      0
'''
new_df = df.sort_values(by='col_B')  # sort rows by the values in 'col_B'; the NaN row ends up last
print(new_df)
'''输出结果
   col_A  col_B  col_C
3      6    2.0      0
0      1    4.0      6
2      4    4.0      8
1      1    NaN      3
'''
print(df.sort_values(by='col_B', na_position='first'))  # put the NaN row first instead
'''输出结果
   col_A  col_B  col_C
1      1    NaN      3
3      6    2.0      0
0      1    4.0      6
2      4    4.0      8
'''

7.统计计算与统计描述

# 统计计算
# 统计计算主要是对一组数据应用一些统计方法,主要有和,平均值,最大小值,方差等
# sum() 计算和
# mean() 计算平均值
# max(), min() 计算最大小值
# idxmax(), idxmin() 返回最大值、最小值所对应的索引
# count() 计算非NaN值的个数
# var() 计算样本方差
# std() 计算样本标准差
# cumsum(), cumprod() 计算样本值的累计和或累计积

import numpy as np
import pandas as pd

# Sample frame: three numeric columns, rows labelled 'A'..'D'.
# (The class is spelled ``DataFrame``; ``pd.Dataframe`` raises AttributeError.)
df = pd.DataFrame({'col_A': [2, 34, 25, 4],
                   'col_B': [0, 3, 45, 9],
                   'col_C': [7, 5, 5, 3]},
                  index=['A', 'B', 'C', 'D'])
print(df)
'''输出结果
   col_A  col_B  col_C
A      2      0      7
B     34      3      5
C     25     45      5
D      4      9      3
'''
print(df.max())  # maximum of each column
'''输出结果
col_A    34
col_B    45
col_C     7
dtype: int64
'''
print(df.idxmax())  # row label at which each column reaches its maximum
'''输出结果
col_A    B
col_B    C
col_C    A
dtype: object
'''

# 统计描述
# 使用describe()方法实现获取多个统计量
# describe(percentiles=None, include=None, exclude=None)
# percentiles: 表示结果包含百分数
# include: 表示结果中包含数据类型的白名单
# exclude: 表示结果中忽略数据类型的黑名单

# Frame with one string column, one numeric column and one categorical column.
# (The class is spelled ``DataFrame``; ``pd.Dataframe`` raises AttributeError.)
df_obj = pd.DataFrame({'object': ['a', 'b', 'c', 'c'],
                       'number': [-1, 7, 50, 36],
                       'category': pd.Categorical(['apple',
                                                   'banana', 'orange', 'peach'])})
print(df_obj)
'''输出结果
  object  number category
0      a      -1    apple
1      b       7   banana
2      c      50   orange
3      c      36    peach
'''
print(df_obj.describe())  # summary statistics; only the numeric column is reported
'''输出结果
          number
count   4.000000
mean   23.000000
std    24.013885
min    -1.000000
25%     5.000000
50%    21.500000
75%    39.500000
max    50.000000
'''
# count is 4, mean is 23.0, standard deviation (std, not the variance) is 24.013885,
# minimum is -1.0, 25% quantile is 5.0, 50% quantile is 21.5, 75% quantile is 39.5,
# maximum is 50.0

8.绘制图表

# plot(x=None, y=None, kind='line', ax=None, subplots=False,
# sharex=None, sharey=False, layout=None, figsize=None,
# use_index=True, title=None, grid=None, legend=True, style=None,
# logx=False, logy=False, loglog=False,
# xlabel=None, ylabel=None, xlim=None, ylim=None, rot=None,
# xerr=None, secondary_y=False, sort_columns=False, **kwargs)

# x,y:表示x轴和y轴的数据
# kind :表示绘图的类型,该参数的取值可以为'line'(折线图,默认)、
# 'bar'(柱形图)、'barh'(条形图)、'hist'(直方图)、'box'(箱形图)、
# 'kde'(密度图)、'pie'(饼图)等
# figsize :表示图表尺寸的大小(单位为英寸),该参数接收一个元组类型的数据,
# 元组中需包含两个元素,这两个元素分别代表图表的宽度和高度
# title:表示图表的标题
# grid:表示是否显示网格线，若值为True，则显示网格线
# xlabel:表示x轴的标签
# ylabel :表示y轴的标签
# rot :表示轴标签旋转的角度

import pandas as pd

# Sales figures for three products, one row per quarter.
# (The class is spelled ``DataFrame``; ``pd.Dataframe`` raises AttributeError.)
df = pd.DataFrame({'商品A': [2, 34, 25, 4],
                   '商品B': [1, 3, 45, 9],
                   '商品C': [7, 5, 5, 3]},
                  index=['第一季度', '第二季度', '第三季度', '第四季度'])
print(df)
'''输出结果
      商品A  商品B  商品C
第一季度    2    1    7
第二季度   34    3    5
第三季度   25   45    5
第四季度    4    9    3
'''
import matplotlib.pyplot as plt  # used to set a font for the Chinese labels and to show the figures

# Select the SimHei font so that the Chinese labels can be displayed.
plt.rcParams.update({'font.sans-serif': ['SimHei']})

# Bar chart of the frame, with axis titles and unrotated tick labels.
bar_options = dict(xlabel='季度', ylabel='销售额(万元)', rot=0)
df.plot.bar(**bar_options)
plt.show()

# Box plot of the same frame.
df.plot.box(ylabel='销售额(万元)')
plt.show()

输出结果图:

以上讲解了pandas库的基础知识,包括数据结构、索引操作、数据排序、统计计算与统计描述及绘制图表等.

[Python数据分析] pandas库基础

Python相关栏目本月热门文章