DataFrame对象的方法
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = 'SimHei'
plt.rcParams['axes.unicode_minus'] = False
%config InlineBackend.figure_format = 'svg'
# Load the shot records, using the 'shot_id' column as the row index.
# NOTE(review): '*.csv' looks like a placeholder file name - read_csv does not
# expand glob patterns, so substitute the real path before running.
kobe_df = pd.read_csv('*.csv', index_col='shot_id')
# Print a summary of the DataFrame.
kobe_df.info()
# Remove the limit on the number of columns shown when a frame is displayed.
# The fully qualified option name is used because the abbreviated pattern
# 'max_columns' can match more than one option in newer pandas releases
# (e.g. 'styler.render.max_columns'), which makes set_option raise OptionError.
pd.set_option('display.max_columns', None)
# Show the first 15 rows.
kobe_df.head(15)
# Show the last 3 rows.
kobe_df.tail(3)
import pymysql
# Open a MySQL connection. The '$' values for user, password and database
# appear to be placeholders - replace them with real credentials.
conn = pymysql.connect(
    host='127.0.0.1',
    port=3306,
    user='$',
    password='$',
    database='$',
    charset='utf8mb4',
)
conn
# Read the whole tb_emp table into a DataFrame indexed by the 'eno' column,
# then print a summary of it.
emp_df = pd.read_sql(sql='select * from tb_emp', con=conn, index_col='eno')
emp_df.info()
# Element-wise test for missing values (isna() is an alias of isnull()).
emp_df.isnull()
emp_df.isna()
# Element-wise test for non-missing values (notna() is an alias of notnull()).
emp_df.notnull()
emp_df.notna()
# Count the missing values in each column.
emp_df.isnull().sum()
# Drop missing values (by default along axis 0, i.e. rows containing a null).
# The result is returned as a new object and is not assigned here, so emp_df
# itself is left unchanged.
emp_df.dropna()
# Fill the missing values of the 'comm' column by interpolation (again the
# result is only returned, not stored).
emp_df.comm.interpolate()
# Duplicate detection (the original comment read "deduplicate"): duplicated()
# only returns a boolean mask marking repeated rows, it does not remove them.
# NOTE(review): dept_df is not defined in the visible part of this file, and
# '列名' ("column name") looks like a placeholder for a real column - confirm.
dept_df.duplicated('列名')
# Example: 50 values drawn from a normal distribution (mean 110, std 5) and
# rounded up, with the first and last entries overwritten by two extreme values.
heights = np.ceil(np.random.normal(loc=110, scale=5, size=50))
heights[0], heights[-1] = 80, 195
# Box plot with whiskers reaching 3 * IQR beyond the quartiles.
plt.boxplot(heights, whis=3)
plt.show()
def detect_outliers_iqr(data, whis=1.5):
    """Return the elements of *data* that fall outside the IQR fences.

    The fences are ``q1 - whis * iqr`` and ``q3 + whis * iqr``, where ``q1``
    and ``q3`` are the 25% and 75% quantiles and ``iqr = q3 - q1``.

    Args:
        data: One-dimensional numeric values. A NumPy array or any object
            supporting element-wise comparison and boolean-mask indexing is
            used as is; a plain list or tuple is converted to an array.
        whis: Multiplier applied to the IQR to position the fences.

    Returns:
        The subset of *data* strictly below the lower fence or strictly above
        the upper fence. Empty input is returned unchanged (as an empty array
        when a list or tuple was given).
    """
    # Plain Python sequences support neither `data < lower` nor mask indexing.
    if isinstance(data, (list, tuple)):
        data = np.asarray(data)
    # Quantiles are undefined for empty input; there is nothing to flag.
    if len(data) == 0:
        return data
    q1, q3 = np.quantile(data, [0.25, 0.75])
    iqr = q3 - q1
    lower, upper = q1 - whis * iqr, q3 + whis * iqr
    return data[(data < lower) | (data > upper)]
# Apply the IQR rule to the sample with whis=3, the same whisker factor that
# was passed to plt.boxplot for this data.
detect_outliers_iqr(heights, whis=3)
def detect_outliers_zscore(data, threshold=3):
    """Return the elements of *data* whose absolute z-score exceeds *threshold*.

    The z-score of a value is ``(value - mean) / std``, with the mean and
    standard deviation computed by ``np.mean`` and ``np.std`` over *data*.

    Args:
        data: One-dimensional numeric values. A NumPy array or any object
            supporting element-wise arithmetic and boolean-mask indexing is
            used as is; a plain list or tuple is converted to an array.
        threshold: Absolute z-score above which a value counts as an outlier.

    Returns:
        The subset of *data* with ``|z| > threshold``. The result is empty
        when *data* is empty or when all values are identical.
    """
    # Plain Python sequences support neither arithmetic nor mask indexing.
    if isinstance(data, (list, tuple)):
        data = np.asarray(data)
    # Mean and standard deviation are undefined for empty input.
    if len(data) == 0:
        return data
    avg_value = np.mean(data)
    std_value = np.std(data)
    # With zero spread no value deviates from the mean; return an empty
    # selection instead of dividing by zero.
    if std_value == 0:
        return data[:0]
    z_score = np.abs((data - avg_value) / std_value)
    return data[z_score > threshold]
# Apply the z-score rule to the sample with the default threshold.
detect_outliers_zscore(heights)
# Build a Series in which the two planted extreme values (80 and 195) are
# replaced by 110, and draw its box plot.
new_heights = pd.Series(heights).replace(to_replace=[80, 195], value=110)
new_heights.plot.box()
# Box plot of the sample without its first and last elements (the positions
# that were overwritten with the extreme values).
plt.boxplot(heights[1:-1])
plt.show()