栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 软件开发 > 后端开发 > Python

7.集成学习

Python 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

7.集成学习

什么是集成学习

机器学习的两个核心任务

集成学习中boosting和Bagging

Bagging 集成原理

实现流程




随机森林构造过程


面试题

包外估计(Out-of-Bag Estimate)

定义


用途

随机森林API



bagging集成优点

随机森林案例(以泰坦尼克号乘客生存预测为例)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor,export_graphviz
# Titanic survival prediction: a single decision tree baseline followed by a
# grid-searched random forest, both trained on pclass / age / sex.

# Load the data
titan = pd.read_csv("titanic.csv")
# Basic preprocessing
# Select the feature columns and the target column.
# .copy() gives an independent frame, so the fill below is guaranteed to land in `x`.
x = titan[["pclass","age","sex"]].copy()
y = titan["survived"]
# Fill missing ages with the mean age (assign the result back instead of a
# chained in-place call, which may operate on a temporary object)
x["age"] = x["age"].fillna(titan["age"].mean())
# Train/test split
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=22,test_size=0.2)
# Feature engineering: dictionary feature extraction
x_train = x_train.to_dict(orient="records")
x_test = x_test.to_dict(orient="records")
transfer = DictVectorizer()
x_train = transfer.fit_transform(x_train)
# Only transform the test set: it must reuse the feature mapping learned on the training set
x_test = transfer.transform(x_test)
# Model: decision tree (a classifier, because `survived` is a class label)
estimator = DecisionTreeClassifier(max_depth=5)
estimator.fit(x_train,y_train)
# Evaluation
print("得分:\n",estimator.score(x_test,y_test))

rf = RandomForestClassifier()
# Hyper-parameter search over forest size and tree depth with 3-fold cross-validation
param = {"n_estimators":[100,120,300],"max_depth":[3,7,11]}
gc = GridSearchCV(rf,param_grid=param,cv=3)
gc.fit(x_train,y_train)
print("随机森林预测结果是:\n",gc.score(x_test,y_test))

Otto案例-Otto Group Product Classification Challenge


链接

数据集介绍

评分标准

导入依赖
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
# The class is spelled `OneHotEncoder`; it is used under that name in the scoring step
from sklearn.preprocessing import OneHotEncoder
数据获取
# Load the training set from the relative path "train.csv"
data = pd.read_csv("train.csv")
# Preview the first rows
data.head()

数据基本处理
## Class imbalance: plot how many samples each target class has
# Name the axis explicitly; newer seaborn releases no longer accept the
# column as a bare positional argument for `x`.
sns.countplot(x=data.target)
plt.show()

# Random under-sampling
## Separate the features from the target
y = data["target"]
x = data.drop(["id","target"],axis=1)
x.head(),y.head()

## Under-sample, then plot the class counts of the resampled target
rus = RandomUnderSampler(random_state=0)
X_resampled,Y_resampled = rus.fit_resample(x,y)
sns.countplot(x=Y_resampled)
plt.show()

# Convert the label values to integers
le = LabelEncoder()
Y_resampled = le.fit_transform(Y_resampled)
# Split into training and test sets
x_train,x_test,y_train,y_test = train_test_split(X_resampled,Y_resampled,test_size=0.2,random_state=22)
x_train.shape,y_train.shape,x_test.shape,y_test.shape
模型训练
## Baseline forest with out-of-bag (OOB) estimation enabled
rf = RandomForestClassifier(oob_score=True)
rf.fit(x_train,y_train)
# Hard class predictions and accuracy on the held-out test set
y_pre = rf.predict(x_test)
score = rf.score(x_test,y_test)
rf.oob_score_ #0.7587845622119815
# Show the test accuracy under the name it was assigned to above
score #0.7840483731644111
评分
# Score the hard predictions with log loss, feeding it one-hot encoded labels.
# The encoder's default (sparse) output is converted with .toarray(), which
# avoids the version-dependent `sparse` / `sparse_output` keyword.
one_hot = OneHotEncoder()
# Fit once on the classes the forest was trained on, so the true and predicted
# labels are encoded with the same columns even if a class is never predicted.
one_hot.fit(rf.classes_.reshape(-1,1))
y_test1 = one_hot.transform(y_test.reshape(-1,1)).toarray()
y_pre1 = one_hot.transform(y_pre.reshape(-1,1)).toarray()


# NOTE(review): `eps` appears to be deprecated/removed in recent scikit-learn
# releases — confirm against the installed version before running.
log_loss(y_test1,y_pre1,eps=1e-15,normalize=True)

7.4587049513916055

改变预测值的输出模式,让模型输出各类别的预测概率(而不是硬标签),以降低logloss值

# Predict per-class probabilities instead of hard labels
y_pre_probae = rf.predict_proba(x_test)
y_pre_probae

rf.oob_score_ #0.7587845622119815
# Log loss of the predicted probabilities against the one-hot encoded test labels
log_loss(y_test1,y_pre_probae,eps=1e-15,normalize=True)

模型调优 确定最优的n_estimators
# Sweep n_estimators while the other hyper-parameters stay fixed
tuned_parameters = range(10,200,10)
# OOB accuracy recorded for each candidate value
accuracy_t = np.zeros(len(tuned_parameters))
# Test-set log loss recorded for each candidate value
error_t = np.zeros(len(tuned_parameters))
# Fit one forest per candidate
for j,one_parameter in enumerate(tuned_parameters):
    rf2 = RandomForestClassifier(
        n_estimators=one_parameter,
        max_depth=10,
        max_features=10,
        min_samples_leaf=10,
        oob_score=True,
        n_jobs=-1)
    rf2.fit(x_train,y_train)
    # Record the OOB accuracy
    accuracy_t[j] = rf2.oob_score_
    # Record the log loss of the predicted probabilities
    y_pre_proba = rf2.predict_proba(x_test)
    error_t[j] = log_loss(y_test,y_pre_proba,eps=1e-15,normalize=True)

# Plot both metrics against the swept parameter
fig,axes = plt.subplots(nrows=1,ncols=2,figsize=(20,4),dpi=100)

axes[0].plot(tuned_parameters,error_t)
axes[1].plot(tuned_parameters,accuracy_t)

axes[0].set_xlabel("n_estimators")
axes[0].set_ylabel("error_t")
axes[1].set_xlabel("n_estimators")
axes[1].set_ylabel("accuracy_t")

axes[0].grid(True)
axes[1].grid(True)

plt.show()


由图像可知,确定n_estimators=170的时候,表现效果不错

确定最优的max_features
# Sweep max_features with n_estimators fixed at 170
tuned_parameters = range(5,40,5)
# OOB accuracy recorded for each candidate value
accuracy_t = np.zeros(len(tuned_parameters))
# Test-set log loss recorded for each candidate value
error_t = np.zeros(len(tuned_parameters))
# Fit one forest per candidate
for j,one_parameter in enumerate(tuned_parameters):
    rf2 = RandomForestClassifier(
        n_estimators=170,
        max_depth=10,
        max_features=one_parameter,
        min_samples_leaf=10,
        oob_score=True,
        n_jobs=-1)
    rf2.fit(x_train,y_train)
    # Record the OOB accuracy
    accuracy_t[j] = rf2.oob_score_
    # Record the log loss of the predicted probabilities
    y_pre_proba = rf2.predict_proba(x_test)
    error_t[j] = log_loss(y_test,y_pre_proba,eps=1e-15,normalize=True)

# Plot both metrics against the swept parameter
fig,axes = plt.subplots(nrows=1,ncols=2,figsize=(20,4),dpi=100)

axes[0].plot(tuned_parameters,error_t)
axes[1].plot(tuned_parameters,accuracy_t)

axes[0].set_xlabel("max_features")
axes[0].set_ylabel("error_t")
axes[1].set_xlabel("max_features")
axes[1].set_ylabel("accuracy_t")

axes[0].grid(True)
axes[1].grid(True)

plt.show()


由图像可知,确定max_features=15的时候,表现效果不错

确定最优的max_depth
# Sweep max_depth with n_estimators=170 and max_features=15 fixed
tuned_parameters = range(10,100,10)
# OOB accuracy recorded for each candidate value
accuracy_t = np.zeros(len(tuned_parameters))
# Test-set log loss recorded for each candidate value
error_t = np.zeros(len(tuned_parameters))
# Fit one forest per candidate
for j,one_parameter in enumerate(tuned_parameters):
    rf2 = RandomForestClassifier(
        n_estimators=170,
        max_depth=one_parameter,
        max_features=15,
        min_samples_leaf=10,
        oob_score=True,
        n_jobs=-1)
    rf2.fit(x_train,y_train)
    # Record the OOB accuracy
    accuracy_t[j] = rf2.oob_score_
    # Record the log loss of the predicted probabilities
    y_pre_proba = rf2.predict_proba(x_test)
    error_t[j] = log_loss(y_test,y_pre_proba,eps=1e-15,normalize=True)

# Plot both metrics against the swept parameter
fig,axes = plt.subplots(nrows=1,ncols=2,figsize=(20,4),dpi=100)

axes[0].plot(tuned_parameters,error_t)
axes[1].plot(tuned_parameters,accuracy_t)

axes[0].set_xlabel("max_depth")
axes[0].set_ylabel("error_t")
axes[1].set_xlabel("max_depth")
axes[1].set_ylabel("accuracy_t")

axes[0].grid(True)
axes[1].grid(True)

plt.show()


由图像可知,确定max_depth=30的时候,表现效果不错

确定最优的min_samples_leaf
# Sweep min_samples_leaf with n_estimators=170, max_depth=30, max_features=15 fixed
tuned_parameters = range(1,10,2)
# OOB accuracy recorded for each candidate value
accuracy_t = np.zeros(len(tuned_parameters))
# Test-set log loss recorded for each candidate value
error_t = np.zeros(len(tuned_parameters))
# Fit one forest per candidate
for j,one_parameter in enumerate(tuned_parameters):
    rf2 = RandomForestClassifier(
        n_estimators=170,
        max_depth=30,
        max_features=15,
        min_samples_leaf=one_parameter,
        oob_score=True,
        n_jobs=-1)
    rf2.fit(x_train,y_train)
    # Record the OOB accuracy
    accuracy_t[j] = rf2.oob_score_
    # Record the log loss of the predicted probabilities
    y_pre_proba = rf2.predict_proba(x_test)
    error_t[j] = log_loss(y_test,y_pre_proba,eps=1e-15,normalize=True)

# Plot both metrics against the swept parameter
fig,axes = plt.subplots(nrows=1,ncols=2,figsize=(20,4),dpi=100)

axes[0].plot(tuned_parameters,error_t)
axes[1].plot(tuned_parameters,accuracy_t)

axes[0].set_xlabel("min_samples_leaf")
axes[0].set_ylabel("error_t")
axes[1].set_xlabel("min_samples_leaf")
axes[1].set_ylabel("accuracy_t")

axes[0].grid(True)
axes[1].grid(True)

plt.show()


由图像可知,确定min_samples_leaf=1的时候,表现效果不错

确定最优模型
# Final forest: fixed hyper-parameters and a fixed seed, with OOB scoring enabled.
# fit() returns the estimator itself, so construction and training are chained.
rf3 = RandomForestClassifier(
    oob_score=True,
    random_state=40,
    n_jobs=-1,
    n_estimators=170,
    max_features=15,
    max_depth=30,
    min_samples_leaf=1,
).fit(x_train, y_train)

# Accuracy on the held-out test set, then the out-of-bag accuracy
rf3.score(x_test, y_test) #0.788367405701123
rf3.oob_score_ #0.7647609447004609
# Log loss of the predicted class probabilities on the test set
y_pre_probal = rf3.predict_proba(x_test)
log_loss(y_test, y_pre_probal) #0.6964344507957512
生成提交数据
# Build the submission file: per-class probabilities for every row of test.csv
test_data = pd.read_csv("test.csv")
test_data.head()

# The id column is not a feature; drop it before predicting
test_data_drop_id = test_data.drop(["id"],axis=1)
test_data_drop_id.head()

y_pre_test = rf3.predict_proba(test_data_drop_id)
y_pre_test

# pandas spells the class `DataFrame`; one column per class, Class_1 .. Class_9
result_data = pd.DataFrame(y_pre_test,columns=["Class_"+str(i) for i in range(1,10)])
result_data.head()

# Put the id column back as the first column
result_data.insert(loc=0,column="id",value=test_data.id)
result_data.head()

result_data.to_csv("submissson.csv",index=False)
Boosting

实现过程






Bagging集成与Boosting集成的区别

区别一:数据方面

Bagging:对数据进行采样训练;Boosting:根据前一轮学习结果调整数据的重要性。

区别二:投票方面

Bagging:所有学习器平权投票;Boosting:对学习器进行加权投票。

区别三:学习顺序

Bagging的学习是并行的,每个学习器没有依赖关系;Boosting学习是串行,学习有先后顺序。

区别四:主要作用

Bagging主要用于提高泛化性能(解决过拟合,也可以说降低方差);Boosting主要用于提高训练精度(解决欠拟合,也可以说降低偏差)。

AdaBoost(了解)

构造过程


案例





API

GBDT(了解)

Decision Tree: CART回归树

Gradient Boosting: 拟合负梯度


原理

转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/739934.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号