数据分析第六天

DataFrame 对象的方法
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = 'SimHei'
plt.rcParams['axes.unicode_minus'] = False

%config InlineBackend.figure_format = 'svg'

# Load the CSV into a DataFrame, using the 'shot_id' column as the row index.
# NOTE(review): '*.csv' is presumably a placeholder path -- substitute the
# real file name before running.
kobe_df = pd.read_csv('*.csv', index_col='shot_id')
# Print a summary of the DataFrame.
kobe_df.info()
# Remove the limit on the number of displayed columns. The fully qualified
# option name is required: the bare 'max_columns' pattern is ambiguous on
# pandas versions that also define 'styler.render.max_columns', where it
# raises OptionError.
pd.set_option('display.max_columns', None)
# Show the first 15 rows.
kobe_df.head(15)
# Show the last 3 rows.
kobe_df.tail(3)

import pymysql

# Open a connection to the MySQL server on localhost.
# NOTE(review): the '$' values look like redacted placeholders -- confirm the
# real user, password and database before running.
conn = pymysql.connect(
    host='127.0.0.1',
    port=3306,
    user='$',
    password='$',
    database='$',
    charset='utf8mb4',
)
conn

# Read every row of tb_emp into a DataFrame indexed by the 'eno' column, then
# print its summary.
emp_df = pd.read_sql(sql='select * from tb_emp', con=conn, index_col='eno')
emp_df.info()

# Flag missing values (isnull and isna are aliases of each other).
emp_df.isnull()
emp_df.isna()
# Flag non-missing values (notnull and notna are aliases of each other).
emp_df.notnull()
emp_df.notna()
# Count the missing values in each column.
emp_df.isnull().sum()
# Drop missing values (by default along axis 0, i.e. row-wise). The result is
# not assigned here, so emp_df itself is left unchanged.
emp_df.dropna()
# Fill the missing values of the comm column by interpolation.
emp_df.comm.interpolate()
# De-duplication.
# NOTE(review): dept_df is not defined anywhere in the visible code, and
# '列名' ("column name") reads like a placeholder -- confirm both. Also note
# that duplicated() only flags repeated rows; presumably drop_duplicates() is
# the call that actually removes them.
dept_df.duplicated('列名')

# Worked example: box plot of simulated heights with two planted extremes.

# Draw 50 samples from a normal distribution (mean 110, std 5), rounded up.
heights = np.ceil(np.random.normal(loc=110, scale=5, size=50))
# Overwrite the first and last samples with extreme values.
heights[[0, -1]] = [80, 195]
plt.boxplot(heights, whis=3)
plt.show()


def detect_outliers_iqr(data, whis=1.5):
    """Return the values of *data* that fall outside the IQR fences.

    The fences are ``Q1 - whis * IQR`` and ``Q3 + whis * IQR``, where Q1 and
    Q3 are the 25% and 75% quantiles and ``IQR = Q3 - Q1``.

    Args:
        data: One-dimensional numeric data. NumPy arrays and pandas Series
            are used as-is; other array-likes (lists, tuples) are converted
            to a NumPy array first.
        whis: Multiplier applied to the IQR to position the fences.

    Returns:
        The elements strictly below the lower fence or strictly above the
        upper fence, selected by boolean mask (same container type as the
        input for arrays and Series, a NumPy array otherwise). Empty input
        is returned empty.
    """
    # Objects exposing ``dtype`` already support boolean-mask indexing; plain
    # Python sequences do not, so convert those.
    values = data if hasattr(data, 'dtype') else np.asarray(data)
    if np.size(values) == 0:
        # Quantiles of an empty sample are undefined; nothing can be an outlier.
        return values
    q1, q3 = np.quantile(values, [0.25, 0.75])
    iqr = q3 - q1
    lower, upper = q1 - whis * iqr, q3 + whis * iqr
    return values[(values < lower) | (values > upper)]

# List the IQR outliers of heights, using the same whisker factor (3) that
# was passed to the box plot of heights.
detect_outliers_iqr(heights, whis=3)


def detect_outliers_zscore(data, threshold=3):
    """Return the values of *data* whose absolute z-score exceeds *threshold*.

    The z-score of a value is ``(value - mean) / std``, with the mean and
    standard deviation computed over *data* by ``np.mean`` and ``np.std``.

    Args:
        data: One-dimensional numeric data. NumPy arrays and pandas Series
            are used as-is; other array-likes (lists, tuples) are converted
            to a NumPy array first.
        threshold: Absolute z-score above which a value counts as an outlier.

    Returns:
        The elements whose absolute z-score is strictly greater than
        *threshold*, selected by boolean mask. Empty input, or input whose
        standard deviation is zero, yields an empty result.
    """
    # Objects exposing ``dtype`` already support boolean-mask indexing; plain
    # Python sequences do not, so convert those.
    values = data if hasattr(data, 'dtype') else np.asarray(data)
    if np.size(values) == 0:
        return values
    avg_value = np.mean(values)
    std_value = np.std(values)
    if std_value == 0:
        # All values are equal: avoid the 0/0 division (NaN z-scores and a
        # RuntimeWarning) and report no outliers.
        return values[:0]
    z_score = np.abs((values - avg_value) / std_value)
    return values[z_score > threshold]

# List the samples of heights whose z-score exceeds the default threshold.
detect_outliers_zscore(heights)

# Replace the values 80 and 195 with 110, then draw a box plot of the result.
new_heights = pd.Series(heights).replace(to_replace=[80, 195], value=110)
new_heights.plot.box()

# Box plot of heights without its first and last samples.
plt.boxplot(heights[1:-1])
plt.show()
数据分析第六天

Python相关栏目本月热门文章