阈值对逻辑回归的影响，以及混淆矩阵的制作-信用卡数据预测

这篇文章整理了我最近所学的逻辑回归模型，有需要学习的同学可以参考。

代码使用 PyCharm 编写，

代码超级详细，注释仅为个人理解，如果有不对的地方还请指出。

想要一起学习的可以加好友本人QQ：920133676，并免费提供完整学习视频和数据集。

代码保持持续更新

"""
-*-Code-*-
作者：LIANGQISE
日期：2021年10月08日
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Load the data
# Read the credit-card transactions from the CSV file in the working directory.
data = pd.read_csv('creditcard.csv')

### Standardize Amount into a new 'narmAmount' column, then drop Time and Amount
from sklearn.preprocessing import StandardScaler

# The Amount values are reshaped into a single column (n, 1) before being
# handed to the scaler.
amount_as_column = data['Amount'].values.reshape(-1, 1)
data['narmAmount'] = StandardScaler().fit_transform(amount_as_column)

# Drop the raw Time and Amount columns; only the scaled amount is kept.
data = data.drop(columns=['Time', 'Amount'])

### Build the full and the under-sampled data sets
# X holds every column except the 'Class' label; y holds only the label
# (kept as a single-column DataFrame).
X = data.loc[:, data.columns != 'Class']
y = data.loc[:, data.columns == 'Class']

# Index labels of the fraud (Class == 1) and normal (Class == 0) rows.
fraud_index = np.array(data[data.Class == 1].index)
normal_index = np.array(data[data.Class == 0].index)
# Number of fraud rows; this is also the size of the normal sample below.
number_records_fraud = len(fraud_index)

# Draw as many normal rows as there are fraud rows, without replacement,
# so both classes are equally represented. A seeded generator keeps the
# sample identical between runs, in line with the fixed random_state used
# for the train/test splits.
random_normal_indices = np.random.default_rng(0).choice(
    normal_index, number_records_fraud, replace=False)

# Combine both sets of labels and select those rows. The values come from
# data.index, i.e. they are labels, so .loc is the matching accessor;
# .iloc would only pick the same rows while the index is exactly 0..n-1.
under_sample_indices = np.concatenate([fraud_index, random_normal_indices])
under_sample_data = data.loc[under_sample_indices, :]

# Features and label of the under-sampled set, split the same way as X and y.
X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'Class']

# Split the complete data set into train and test parts (0.7 : 0.3)
from sklearn.model_selection import train_test_split

# Both splits below share the same proportions and the same random seed.
split_options = dict(test_size=0.3, train_size=0.7, random_state=0)

X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Split the under-sampled data set with the same 0.7 : 0.3 proportions
(X_train_undersample,
 X_test_undersample,
 y_train_undersample,
 y_test_undersample) = train_test_split(
    X_undersample, y_undersample, **split_options)
### 绘制混淆矩阵函数
import itertools
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map with pyplot.

    Args:
        cm: 2-D array of values, read as ``cm[row, column]``.
        classes: Sequence of class labels used as tick labels on both axes.
        title: Text shown above the plot.
        cmap: Matplotlib colormap passed to ``imshow``.
    """
    plt.title(title)
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=0)
    plt.yticks(positions, classes)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.colorbar()

    # Cells whose value exceeds half of the largest entry are labelled in
    # white, all others in black, so the number contrasts with the cell
    # color (with the default Blues map, larger values are darker).
    midpoint = cm.max() / 2.
    n_rows, n_cols = cm.shape[0], cm.shape[1]
    for row in range(n_rows):
        for col in range(n_cols):
            value = cm[row, col]
            text_color = "white" if value > midpoint else "black"
            # x is the column and y is the row of the cell being labelled.
            plt.text(col, row, value,
                     horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# ### 逻辑回归阈值对结果的影响
from sklearn.model_selection import KFold,cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score,confusion_matrix
# Build the classifier: L1-penalised logistic regression with C = 0.01
lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
# Train on the under-sampled training split
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
# Predict class probabilities for the under-sampled test split. The
# DataFrame is passed as-is because the model was fitted on a DataFrame;
# handing over a bare array instead makes recent scikit-learn versions warn
# about missing feature names.
y_pred_undersample_proba = lr.predict_proba(X_test_undersample)

# Candidate decision thresholds
thresholed = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

# Print arrays with two decimals; set once because it is a global setting
np.set_printoptions(precision=2)

# Labels of the two classes, shared by every subplot
class_name = [0, 1]

# One figure holding a 3x3 grid, one subplot per threshold
plt.figure(figsize=(10, 10))

for subplot_position, threshold in enumerate(thresholed, start=1):
    # A sample is predicted as class 1 when its class-1 probability
    # (second column) is strictly above the threshold.
    y_test_predictions_high_recall = y_pred_undersample_proba[:, 1] > threshold
    plt.subplot(3, 3, subplot_position)

    # Confusion matrix of true labels versus thresholded predictions
    cnf_matrix = confusion_matrix(y_test_undersample, y_test_predictions_high_recall)

    # Recall = TP / (FN + TP), read from the row of true class 1
    print('Recall metric in the testing dataset:',
          cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))

    # The title states the comparison that is actually applied above (">").
    plot_confusion_matrix(cnf_matrix,
                          classes=class_name,
                          title='Threshold > %s' % threshold)
plt.show()

阈值对逻辑回归的影响，以及混淆矩阵的制作-信用卡数据预测

Python相关栏目本月热门文章