import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz
# Titanic survival example: fit a single decision tree, then a grid-searched
# random forest, on the pclass / age / sex features.

# Load the data.
titan = pd.read_csv("titanic.csv")

# Basic preparation: choose the feature columns and the target column.
# .copy() gives an independent frame, so the imputation below does not write
# into a slice of `titan` (chained assignment).
x = titan[["pclass", "age", "sex"]].copy()
y = titan["survived"]

# Missing values: fill missing ages with the mean age.
x["age"] = x["age"].fillna(titan["age"].mean())

# Train / test split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=22, test_size=0.2
)

# Feature engineering: dictionary feature extraction.
x_train = x_train.to_dict(orient="records")
x_test = x_test.to_dict(orient="records")
transfer = DictVectorizer()
x_train = transfer.fit_transform(x_train)
# Reuse the vocabulary learned on the training set; refitting on the test set
# could yield a different column layout than the model was trained on.
x_test = transfer.transform(x_test)

# Model: decision tree. `survived` is a class label, so a classifier is used
# and `score` reports accuracy.
estimator = DecisionTreeClassifier(max_depth=5)
estimator.fit(x_train, y_train)

# Evaluation.
print("得分:\n", estimator.score(x_test, y_test))

# Random forest with hyperparameter search over tree count and depth.
rf = RandomForestClassifier()
param = {"n_estimators": [100, 120, 300], "max_depth": [3, 7, 11]}
gc = GridSearchCV(rf, param_grid=param, cv=3)
gc.fit(x_train, y_train)
print("随机森林预测结果是:\n", gc.score(x_test, y_test))
Otto案例 - Otto Group Product Classification Challenge
链接
# Imports for the Otto product-classification example.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.preprocessing import OneHotEncoder

# Data acquisition
data = pd.read_csv("train.csv")
data.head()
数据基本处理
# Class imbalance: plot how many rows each target class has.
# The column is passed by keyword, the documented form for countplot.
sns.countplot(x=data["target"])
plt.show()

# Random under-sampling.
# Separate the feature columns from the target column.
y = data["target"]
x = data.drop(["id", "target"], axis=1)
x.head(), y.head()

# Under-sample, then plot the class counts of the resampled target.
rus = RandomUnderSampler(random_state=0)
X_resampled, Y_resampled = rus.fit_resample(x, y)
sns.countplot(x=Y_resampled)
plt.show()

# Convert the label values to integers.
le = LabelEncoder()
Y_resampled = le.fit_transform(Y_resampled)

# Train / test split.
x_train, x_test, y_train, y_test = train_test_split(
    X_resampled, Y_resampled, test_size=0.2, random_state=22
)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

# Model training
# Baseline random forest with out-of-bag estimation enabled.
rf = RandomForestClassifier(oob_score=True)
rf.fit(x_train, y_train)
y_pre = rf.predict(x_test)
score = rf.score(x_test, y_test)
rf.oob_score_  # 0.7587845622119815
score  # 0.7840483731644111

# Scoring
# Log loss on hard predictions: both the true labels and the predicted labels
# are one-hot encoded before being passed to log_loss.
one_hot = OneHotEncoder()
# .toarray() densifies the encoder output without relying on the constructor's
# sparse/sparse_output flag, whose name differs between scikit-learn releases.
y_test1 = one_hot.fit_transform(y_test.reshape(-1, 1)).toarray()
# Encode predictions with the categories learned from y_test so both matrices
# share the same column layout.
y_pre1 = one_hot.transform(y_pre.reshape(-1, 1)).toarray()
# `eps` / `normalize` are left at their defaults; recent scikit-learn releases
# no longer accept `eps`.
log_loss(y_test1, y_pre1)
7.4587049513916055
改变预测值输出模式,让输出结果为百分占比,减低logloss值
# Predict class probabilities instead of hard labels and score them.
y_pre_probae = rf.predict_proba(x_test)
y_pre_probae

rf.oob_score_  # 0.7587845622119815
# `eps` / `normalize` are left at their defaults; recent scikit-learn releases
# no longer accept `eps`.
log_loss(y_test1, y_pre_probae)

# Model tuning: determine the best n_estimators
# Sweep n_estimators while the other hyperparameters stay fixed.
tuned_parameters = range(10, 200, 10)
# Array collecting the out-of-bag accuracy for each candidate.
accuracy_t = np.zeros(len(tuned_parameters))
# Array collecting the test-set log loss for each candidate.
error_t = np.zeros(len(tuned_parameters))

# Tuning loop: fit one forest per candidate value.
for j, one_parameter in enumerate(tuned_parameters):
    rf2 = RandomForestClassifier(
        n_estimators=one_parameter,
        max_depth=10,
        max_features=10,
        min_samples_leaf=10,
        oob_score=True,
        n_jobs=-1,
    )
    rf2.fit(x_train, y_train)
    # Record the accuracy.
    accuracy_t[j] = rf2.oob_score_
    # Record the log loss (`eps` / `normalize` left at their defaults).
    y_pre_proba = rf2.predict_proba(x_test)
    error_t[j] = log_loss(y_test, y_pre_proba)

# Visualise the sweep: log loss on the left, accuracy on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 4), dpi=100)
axes[0].plot(tuned_parameters, error_t)
axes[1].plot(tuned_parameters, accuracy_t)
axes[0].set_xlabel("n_estimators")
axes[0].set_ylabel("error_t")
axes[1].set_xlabel("n_estimators")
axes[1].set_ylabel("accuracy_t")
axes[0].grid(True)
axes[1].grid(True)
plt.show()
由图像可知,确定n_estimators=170的时候,表现效果不错
# Sweep max_features with n_estimators fixed at 170.
tuned_parameters = range(5, 40, 5)
# Array collecting the out-of-bag accuracy for each candidate.
accuracy_t = np.zeros(len(tuned_parameters))
# Array collecting the test-set log loss for each candidate.
error_t = np.zeros(len(tuned_parameters))

# Tuning loop: fit one forest per candidate value.
for j, one_parameter in enumerate(tuned_parameters):
    rf2 = RandomForestClassifier(
        n_estimators=170,
        max_depth=10,
        max_features=one_parameter,
        min_samples_leaf=10,
        oob_score=True,
        n_jobs=-1,
    )
    rf2.fit(x_train, y_train)
    # Record the accuracy.
    accuracy_t[j] = rf2.oob_score_
    # Record the log loss (`eps` / `normalize` left at their defaults).
    y_pre_proba = rf2.predict_proba(x_test)
    error_t[j] = log_loss(y_test, y_pre_proba)

# Visualise the sweep: log loss on the left, accuracy on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 4), dpi=100)
axes[0].plot(tuned_parameters, error_t)
axes[1].plot(tuned_parameters, accuracy_t)
axes[0].set_xlabel("max_features")
axes[0].set_ylabel("error_t")
axes[1].set_xlabel("max_features")
axes[1].set_ylabel("accuracy_t")
axes[0].grid(True)
axes[1].grid(True)
plt.show()
由图像可知,确定max_features=15的时候,表现效果不错
# Sweep max_depth with n_estimators=170 and max_features=15 fixed.
tuned_parameters = range(10, 100, 10)
# Array collecting the out-of-bag accuracy for each candidate.
accuracy_t = np.zeros(len(tuned_parameters))
# Array collecting the test-set log loss for each candidate.
error_t = np.zeros(len(tuned_parameters))

# Tuning loop: fit one forest per candidate value.
for j, one_parameter in enumerate(tuned_parameters):
    rf2 = RandomForestClassifier(
        n_estimators=170,
        max_depth=one_parameter,
        max_features=15,
        min_samples_leaf=10,
        oob_score=True,
        n_jobs=-1,
    )
    rf2.fit(x_train, y_train)
    # Record the accuracy.
    accuracy_t[j] = rf2.oob_score_
    # Record the log loss (`eps` / `normalize` left at their defaults).
    y_pre_proba = rf2.predict_proba(x_test)
    error_t[j] = log_loss(y_test, y_pre_proba)

# Visualise the sweep: log loss on the left, accuracy on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 4), dpi=100)
axes[0].plot(tuned_parameters, error_t)
axes[1].plot(tuned_parameters, accuracy_t)
axes[0].set_xlabel("max_depth")
axes[0].set_ylabel("error_t")
axes[1].set_xlabel("max_depth")
axes[1].set_ylabel("accuracy_t")
axes[0].grid(True)
axes[1].grid(True)
plt.show()
由图像可知,确定max_depth=30的时候,表现效果不错
# Sweep min_samples_leaf with n_estimators=170, max_depth=30 and
# max_features=15 fixed.
tuned_parameters = range(1, 10, 2)
# Array collecting the out-of-bag accuracy for each candidate.
accuracy_t = np.zeros(len(tuned_parameters))
# Array collecting the test-set log loss for each candidate.
error_t = np.zeros(len(tuned_parameters))

# Tuning loop: fit one forest per candidate value.
for j, one_parameter in enumerate(tuned_parameters):
    rf2 = RandomForestClassifier(
        n_estimators=170,
        max_depth=30,
        max_features=15,
        min_samples_leaf=one_parameter,
        oob_score=True,
        n_jobs=-1,
    )
    rf2.fit(x_train, y_train)
    # Record the accuracy.
    accuracy_t[j] = rf2.oob_score_
    # Record the log loss (`eps` / `normalize` left at their defaults).
    y_pre_proba = rf2.predict_proba(x_test)
    error_t[j] = log_loss(y_test, y_pre_proba)

# Visualise the sweep: log loss on the left, accuracy on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 4), dpi=100)
axes[0].plot(tuned_parameters, error_t)
axes[1].plot(tuned_parameters, accuracy_t)
axes[0].set_xlabel("min_samples_leaf")
axes[0].set_ylabel("error_t")
axes[1].set_xlabel("min_samples_leaf")
axes[1].set_ylabel("accuracy_t")
axes[0].grid(True)
axes[1].grid(True)
plt.show()
由图像可知,确定min_samples_leaf=1的时候,表现效果不错
# Final model: one forest built with the hyperparameter values recorded in the
# tuning notes (170 trees, depth 30, 15 features per split, leaf size 1),
# with a fixed random_state.
rf3 = RandomForestClassifier(
    random_state=40,
    n_jobs=-1,
    oob_score=True,
    n_estimators=170,
    max_depth=30,
    max_features=15,
    min_samples_leaf=1,
).fit(x_train, y_train)  # fit() returns the estimator itself

# Accuracy on the held-out split, then the out-of-bag accuracy.
rf3.score(x_test, y_test)  # 0.788367405701123
rf3.oob_score_  # 0.7647609447004609

# Log loss computed from the predicted class probabilities.
y_pre_probal = rf3.predict_proba(x_test)
log_loss(y_test, y_pre_probal)  # 0.6964344507957512
生成提交数据
# Build the submission file from the final model's class probabilities.
test_data = pd.read_csv("test.csv")
test_data.head()

# The id column is not a feature; drop it before predicting.
test_data_drop_id = test_data.drop(["id"], axis=1)
test_data_drop_id.head()

y_pre_test = rf3.predict_proba(test_data_drop_id)
y_pre_test

# One probability column per class, named Class_1 .. Class_9.
# NOTE(review): assumes the probability columns come out in Class_1..Class_9
# order — confirm against the fitted label encoding.
result_data = pd.DataFrame(
    y_pre_test, columns=["Class_" + str(i) for i in range(1, 10)]
)
result_data.head()

# Put the id column back as the first column.
result_data.insert(loc=0, column="id", value=test_data.id)
result_data.head()

result_data.to_csv("submissson.csv", index=False)
Boosting
实现过程
区别一:数据方面
Bagging:对数据进行采样训练;Boosting:根据前一轮学习结果调整数据的重要性。
区别二:投票方面
Bagging:所有学习器平权投票;Boosting:对学习器进行加权投票。
区别三:学习顺序
Bagging的学习是并行的,每个学习器没有依赖关系;Boosting学习是串行,学习有先后顺序。
区别四:主要作用
Bagging主要用于提高泛化性能(解决过拟合,也可以说降低方差);Boosting主要用于提高训练精度(解决欠拟合,也可以说降低偏差)。
AdaBoost(了解)
构造过程



