此篇文章整理了我最近所学的逻辑回归模型,有需要学习的同学可以参考。
代码使用PyCharm编写,
代码超级详细,注释仅为个人理解,如果有不对的地方还请指出。
想要一起学习的同学可以加本人QQ好友:920133676,免费提供完整的学习视频和数据集。
代码保持持续更新
"""
-*-Code-*-
作者:LIANGQISE
日期:2021年10月08日
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# --- Load the raw data ----------------------------------------------------
# The path is relative, so the CSV must sit in the current working directory.
data = pd.read_csv('creditcard.csv')

# --- Standardize the Amount column ------------------------------------------
from sklearn.preprocessing import StandardScaler

# Selecting with a list keeps the column 2-D (n_rows, 1), which is the
# layout StandardScaler expects for a single feature.
amount_2d = data[['Amount']].to_numpy()
amount_scaler = StandardScaler()
data['narmAmount'] = amount_scaler.fit_transform(amount_2d)

# Drop Time and the raw Amount; the scaled narmAmount column stays.
data = data.drop(columns=['Time', 'Amount'])
### Build the datasets
# Feature frame: every column except the Class label.
X = data.loc[:, data.columns != 'Class']
# Label frame: only the Class column, kept as a single-column DataFrame.
y = data.loc[:, data.columns == 'Class']

# Index labels of the rows with Class == 1 and Class == 0 respectively.
fraud_index = np.array(data[data.Class == 1].index)
normal_index = np.array(data[data.Class == 0].index)

# Number of Class == 1 rows; this is also the size of the Class == 0 sample.
number_records_fraud = len(fraud_index)

# Undersample the Class == 0 rows: draw (without replacement) exactly as many
# of them as there are Class == 1 rows. np.random.choice already returns an
# ndarray, so no extra conversion is needed.
# NOTE(review): no seed is set in this section, so the drawn rows differ
# between runs unless the caller seeds NumPy's global RNG beforehand.
random_normal_indices = np.random.choice(normal_index, number_records_fraud, replace=False)

# All Class == 1 labels followed by the sampled Class == 0 labels.
under_sample_indices = np.concatenate([fraud_index, random_normal_indices])

# The values above are index *labels* (taken from .index), so they must be
# looked up with the label-based .loc. The positional .iloc only gives the
# same rows when the index happens to be the default 0..n-1 range.
under_sample_data = data.loc[under_sample_indices, :]

# Same feature/label separation as X and y, restricted to the sampled rows.
X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'Class']
# Split both the full data and the undersampled data 70 % train / 30 % test.
from sklearn.model_selection import train_test_split

# One shared set of options so both splits use the same proportions and the
# same fixed random_state.
_split_options = {'test_size': 0.3, 'train_size': 0.7, 'random_state': 0}

# Full dataset.
X_train, X_test, y_train, y_test = train_test_split(X, y, **_split_options)

# Undersampled dataset.
(X_train_undersample,
 X_test_undersample,
 y_train_undersample,
 y_test_undersample) = train_test_split(X_undersample, y_undersample, **_split_options)
### 绘制混淆矩阵函数
import itertools
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated image using pyplot.

    Args:
        cm: 2-D array; ``cm[i, j]`` is written as text into the cell at
            row ``i``, column ``j``.
        classes: Sequence of class labels, used as tick labels on both axes.
        title: Title shown above the plot.
        cmap: Matplotlib colormap used to shade the cells.

    The function only draws through ``plt``; it never calls ``plt.show()``.
    """
    plt.title(title)
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=0)
    plt.yticks(positions, classes)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.colorbar()

    # Cells whose value exceeds half of the maximum are labelled in white,
    # all other cells in black.
    half_max = cm.max() / 2.
    for row, col in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        value = cm[row, col]
        if value > half_max:
            text_color = "white"
        else:
            text_color = "black"
        # pyplot text coordinates are (x, y), i.e. (column, row).
        plt.text(col, row, value,
                 horizontalalignment="center",
                 color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
# ### 逻辑回归阈值对结果的影响
from sklearn.model_selection import KFold,cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score,confusion_matrix
# L1-penalized logistic regression. In scikit-learn C is the inverse of the
# regularization strength, so C=0.01 is a strong penalty.
lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')

# Fit on the undersampled training split; ravel() turns the single-column
# label frame into the 1-D array that fit() expects.
lr.fit(X_train_undersample, y_train_undersample.values.ravel())

# Predicted class probabilities for the undersampled test split. The DataFrame
# is passed as-is (not .values) so the input carries the same column names the
# model was fitted with.
y_pred_undersample_proba = lr.predict_proba(X_test_undersample)

# Decision thresholds to compare, one subplot per threshold.
thresholed = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

# Loop-invariant setup: tick labels for the plots and NumPy print precision
# (two digits after the decimal point).
class_name = [0, 1]
np.set_printoptions(precision=2)

# One 10x10 inch figure holding a 3x3 grid of confusion matrices.
plt.figure(figsize=(10, 10))
for position, threshold in enumerate(thresholed, start=1):
    # Predict the positive class when the second probability column is
    # strictly above the threshold.
    y_test_predictions_high_recall = y_pred_undersample_proba[:, 1] > threshold
    plt.subplot(3, 3, position)

    # Confusion matrix for this threshold.
    cnf_matrix = confusion_matrix(y_test_undersample, y_test_predictions_high_recall)

    # Recall = TP / (FN + TP), read from the second row of the matrix.
    print('Recall metric in the testing dataset:',
          cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))

    # The title uses '>' to match the strict comparison applied above.
    plot_confusion_matrix(cnf_matrix,
                          classes=class_name,
                          title='Threshold > %s' % threshold)
plt.show()



