This post walks through the logistic regression model I have been studying recently; feel free to use it as a reference if you are learning the same material.
The code was written in PyCharm.
The code is commented in great detail; the comments reflect my personal understanding, so please point out anything that is wrong.
If you would like to study together, add me on QQ: 920133676, and I will share the full tutorial videos and dataset for free.
The code will be kept up to date.
"""
-*-Code-*-
作者:LIANGQISE
日期:2021年10月06日
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the credit card fraud dataset
data = pd.read_csv('creditcard.csv')
# Count how many samples fall into each class (normal vs. fraud) in the label column and plot the counts
count_classes = data['Class'].value_counts(sort=True)
count_classes.plot(kind='bar')
plt.title('Fraud class histogram')
plt.xlabel('Class')
plt.ylabel('Frequency')
plt.show()
# Standardize the Amount column
from sklearn.preprocessing import StandardScaler
data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))  # standardize Amount and add it to data as a new column
data = data.drop(['Time','Amount'], axis = 1)
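# A quick sanity check of the standardization (my addition, not in the original
# post): StandardScaler rescales a column to zero mean and unit variance,
# i.e. (x - mean) / std, so normAmount should now have mean ~0 and std ~1.
# print('normAmount mean:', data['normAmount'].mean(), 'std:', data['normAmount'].std())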
### Undersampling
X = data.loc[:, data.columns != 'Class']  # feature matrix X: every column except Class
# print(X)
y = data.loc[:, data.columns == 'Class']  # labels y: only the Class column
number_records_fraud = len(data[data.Class == 1])  # number of fraud samples, i.e. rows where Class == 1
# print(number_records_fraud)
fraud_indices = np.array(data[data.Class == 1].index)  # indices of the rows where Class == 1
# print(fraud_indices)
normal_indices = np.array(data[data.Class == 0].index)  # indices of the rows where Class == 0
# Randomly choose normal-row indices, as many as there are fraud rows (number_records_fraud)
random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace=False)
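# Note (my addition): np.random.choice is not seeded here, so the undersampled
# set differs from run to run; calling np.random.seed(0) beforehand would make
# the selection reproducible.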
# print(random_normal_indices)
# random_normal_indices = np.array(random_normal_indices)
# print(random_normal_indices)
# Concatenate the fraud indices with the randomly chosen normal indices
under_sample_indices = np.concatenate([fraud_indices,random_normal_indices])
# Take the rows whose index is in under_sample_indices out of the original data
under_sample_data = data.iloc[under_sample_indices,:]
# print(under_sample_data)
X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'Class']
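# Optional balance check (my addition, not in the original script): after
# undersampling, the two classes should be roughly 50/50.
print('Fraction of normal transactions:', len(under_sample_data[under_sample_data.Class == 0]) / len(under_sample_data))
print('Fraction of fraud transactions:', len(under_sample_data[under_sample_data.Class == 1]) / len(under_sample_data))
print('Total transactions in undersampled data:', len(under_sample_data))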
### Split into training and test sets
# Split the dataset with a random shuffle
from sklearn.model_selection import train_test_split
# Split the full dataset 7:3 into train and test
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3,train_size=0.7,random_state=0)
# print('Number transactions train dataset:',len(X_train))
# print('Number transactions test dataset:',len(X_test))
# print('Total number of transactions',len(X_train) + len(X_test))
# Split the undersampled data 7:3 into train and test
X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(
    X_undersample, y_undersample, test_size=0.3, train_size=0.7, random_state=0)
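# Note (my addition, not in the original): with data this imbalanced,
# train_test_split's stratify argument keeps the fraud/normal ratio identical
# across the splits, e.g.:
# train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)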
### Build the model: logistic regression from sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix, recall_score, classification_report
### Use K-fold cross-validation to find the best regularization parameter C for logistic regression
def printing_Kfold_scores(x_train_data, y_train_data):
    # Cross-validation: split the training data into 5 folds
    fold = KFold(5, shuffle=False)
    # Candidate values for the regularization parameter C of logistic regression
    c_param_range = [0.01, 0.1, 1, 10, 100]
    # Table for collecting and displaying the results
    results_table = pd.DataFrame(index=range(len(c_param_range)), columns=['C_parameter', 'Mean recall score'])
    results_table['C_parameter'] = c_param_range
    j = 0
    for c_param in c_param_range:  # loop over every candidate C value
        print('---------------------------')
        print('C parameter', c_param)
        print('---------------------------')
        print('')
        recall_accs = []
        for iteration, indices in enumerate(fold.split(y_train_data), start=1):  # loop over the cross-validation folds
            # Build a logistic regression model
            # with an L1 penalty whose strength is controlled by c_param
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')
            # Train on the training folds
            lr.fit(x_train_data.iloc[indices[0], :], y_train_data.iloc[indices[0], :].values.ravel())
            # Predict on the validation fold
            y_pred_undersample = lr.predict(x_train_data.iloc[indices[1], :])
            # Compute the recall and collect it so it can be reported later
            recall_acc = recall_score(y_train_data.iloc[indices[1], :].values.ravel(), y_pred_undersample)
            recall_accs.append(recall_acc)
            print('Iteration', iteration, 'recall score = ', recall_acc)
        # Record the mean recall for this C
        results_table.loc[j, 'Mean recall score'] = np.mean(recall_accs)
        j += 1
        print('')
        print('Mean recall score', np.mean(recall_accs))
        print('')
    best_c = results_table.iloc[results_table['Mean recall score'].astype(float).idxmax()]['C_parameter']
    print('***********************************************************************')
    print('Best model to choose from cross validation is with C parameter = ', best_c)
    print('***********************************************************************')
    return best_c
best_c = printing_Kfold_scores(X_train_undersample,y_train_undersample)
print(best_c)
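# A minimal follow-up sketch (my addition, not part of the original script):
# refit logistic regression with the selected C on the undersampled training
# split and check the recall on the undersampled test split, using the
# confusion_matrix that was imported above but never used.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
y_pred_undersample = lr.predict(X_test_undersample)
cnf_matrix = confusion_matrix(y_test_undersample.values.ravel(), y_pred_undersample)
print('Confusion matrix:\n', cnf_matrix)
# Recall = TP / (TP + FN); row 1 of the matrix is the true fraud class
print('Recall on the undersampled test set:', cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))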



