非原创,代码来源葁sir
# Third-party imports: numerical stack plus the scikit-learn pieces used below.
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
from sklearn.ensemble import AdaBoostRegressor,GradientBoostingRegressor  # AdaBoost and GBDT regressors
# NOTE(review): load_boston is reported as removed in scikit-learn 1.2+ — confirm the installed version still ships it.
from sklearn.datasets import load_boston  # Boston house-price dataset
from sklearn.neighbors import KNeighborsRegressor

# Load the dataset and unpack the feature matrix, targets and column names.
boston = load_boston()
data = boston.data
target = boston.target
feature_names = boston.feature_names

# Plain KNN regressor used as the baseline for comparison.
# (In the extracted text this assignment had been swallowed by the comment
# above it, leaving `knn` undefined when it is fitted later.)
knn = KNeighborsRegressor()

from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the seed is fixed so the split is repeatable.
X_train,X_test,y_train,y_test = train_test_split(data, target, test_size=0.2, random_state=1)

# Wrap the training features in a DataFrame so the columns carry their names.
X_train = DataFrame(data=X_train,columns=feature_names)
X_train
|  | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.14150 | 0.0 | 6.91 | 0.0 | 0.448 | 6.169 | 6.6 | 5.7209 | 3.0 | 233.0 | 17.9 | 383.37 | 5.81 |
| 1 | 0.15445 | 25.0 | 5.13 | 0.0 | 0.453 | 6.145 | 29.2 | 7.8148 | 8.0 | 284.0 | 19.7 | 390.68 | 6.86 |
| 2 | 16.81180 | 0.0 | 18.10 | 0.0 | 0.700 | 5.277 | 98.1 | 1.4261 | 24.0 | 666.0 | 20.2 | 396.90 | 30.81 |
| 3 | 0.05646 | 0.0 | 12.83 | 0.0 | 0.437 | 6.232 | 53.7 | 5.0141 | 5.0 | 398.0 | 18.7 | 386.40 | 12.34 |
| 4 | 8.79212 | 0.0 | 18.10 | 0.0 | 0.584 | 5.565 | 70.6 | 2.0635 | 24.0 | 666.0 | 20.2 | 3.65 | 17.16 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 399 | 0.03548 | 80.0 | 3.64 | 0.0 | 0.392 | 5.876 | 19.1 | 9.2203 | 1.0 | 315.0 | 16.4 | 395.18 | 9.25 |
| 400 | 0.09164 | 0.0 | 10.81 | 0.0 | 0.413 | 6.065 | 7.8 | 5.2873 | 4.0 | 305.0 | 19.2 | 390.91 | 5.52 |
| 401 | 5.87205 | 0.0 | 18.10 | 0.0 | 0.693 | 6.405 | 96.0 | 1.6768 | 24.0 | 666.0 | 20.2 | 396.90 | 19.37 |
| 402 | 0.33045 | 0.0 | 6.20 | 0.0 | 0.507 | 6.086 | 61.5 | 3.6519 | 8.0 | 307.0 | 17.4 | 376.75 | 10.88 |
| 403 | 0.08014 | 0.0 | 5.96 | 0.0 | 0.499 | 5.850 | 41.5 | 3.9342 | 5.0 | 279.0 | 19.2 | 396.90 | 8.77 |
404 rows × 13 columns
X_train.describe().T
|  | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| CRIM | 404.0 | 3.697455 | 9.146743 | 0.00632 | 0.082598 | 0.234405 | 3.594927 | 88.9762 |
| ZN | 404.0 | 11.527228 | 23.288284 | 0.00000 | 0.000000 | 0.000000 | 20.000000 | 100.0000 |
| INDUS | 404.0 | 11.077500 | 6.848412 | 0.46000 | 5.190000 | 9.125000 | 18.100000 | 27.7400 |
| CHAS | 404.0 | 0.079208 | 0.270398 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 1.0000 |
| NOX | 404.0 | 0.553026 | 0.116895 | 0.38500 | 0.448000 | 0.535000 | 0.624000 | 0.8710 |
| RM | 404.0 | 6.268792 | 0.689229 | 3.56100 | 5.876750 | 6.179000 | 6.626500 | 8.7800 |
| AGE | 404.0 | 67.935644 | 28.563186 | 2.90000 | 43.250000 | 76.800000 | 93.825000 | 100.0000 |
| DIS | 404.0 | 3.826111 | 2.120999 | 1.12960 | 2.105350 | 3.298600 | 5.141475 | 12.1265 |
| RAD | 404.0 | 9.470297 | 8.680237 | 1.00000 | 4.000000 | 5.000000 | 24.000000 | 24.0000 |
| TAX | 404.0 | 403.257426 | 169.030480 | 187.00000 | 277.000000 | 329.000000 | 666.000000 | 711.0000 |
| PTRATIO | 404.0 | 18.438614 | 2.169469 | 12.60000 | 17.225000 | 19.000000 | 20.200000 | 22.0000 |
| B | 404.0 | 357.153688 | 91.541647 | 0.32000 | 376.092500 | 391.575000 | 396.157500 | 396.9000 |
| LSTAT | 404.0 | 12.778540 | 7.216403 | 1.73000 | 7.092500 | 11.465000 | 17.102500 | 37.9700 |
X_train.min() # every feature minimum is non-negative; min-max scaling is applied next to map each feature into [0, 1]
CRIM         0.00632
ZN           0.00000
INDUS        0.46000
CHAS         0.00000
NOX          0.38500
RM           3.56100
AGE          2.90000
DIS          1.12960
RAD          1.00000
TAX        187.00000
PTRATIO     12.60000
B            0.32000
LSTAT        1.73000
dtype: float64
from sklearn.preprocessing import MinMaxScaler  # min-max scaling: maps each feature into [0, 1]

# Fit the scaler on the training features only.
# Note that `data` is rebound here: it previously held the full feature
# matrix and now holds the scaled training array.
mms = MinMaxScaler()
data = mms.fit_transform(X_train)

# Apply the SAME fitted transform to the held-out features. Previously X_test
# was left on its raw scale, so models trained on [0, 1] features were being
# evaluated on unscaled inputs. The test-set MSE values recorded further down
# in this file were produced before this fix and will differ after a re-run.
X_test = pd.DataFrame(data=mms.transform(X_test),columns=feature_names)
feature_names
array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')
X_train = pd.DataFrame(data=data,columns=feature_names)
X_train
|  | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.001519 | 0.00 | 0.236437 | 0.0 | 0.129630 | 0.499713 | 0.038105 | 0.417509 | 0.086957 | 0.087786 | 0.563830 | 0.965883 | 0.112583 |
| 1 | 0.001665 | 0.25 | 0.171188 | 0.0 | 0.139918 | 0.495114 | 0.270855 | 0.607917 | 0.304348 | 0.185115 | 0.755319 | 0.984316 | 0.141556 |
| 2 | 0.188890 | 0.00 | 0.646628 | 0.0 | 0.648148 | 0.328799 | 0.980433 | 0.026962 | 1.000000 | 0.914122 | 0.808511 | 1.000000 | 0.802428 |
| 3 | 0.000564 | 0.00 | 0.453446 | 0.0 | 0.106996 | 0.511784 | 0.523172 | 0.353236 | 0.173913 | 0.402672 | 0.648936 | 0.973524 | 0.292770 |
| 4 | 0.098750 | 0.00 | 0.646628 | 0.0 | 0.409465 | 0.383982 | 0.697219 | 0.084924 | 1.000000 | 0.914122 | 0.808511 | 0.008397 | 0.425773 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 399 | 0.000328 | 0.80 | 0.116569 | 0.0 | 0.014403 | 0.443572 | 0.166838 | 0.735726 | 0.000000 | 0.244275 | 0.404255 | 0.995663 | 0.207506 |
| 400 | 0.000959 | 0.00 | 0.379399 | 0.0 | 0.057613 | 0.479785 | 0.050463 | 0.378079 | 0.130435 | 0.225191 | 0.702128 | 0.984896 | 0.104581 |
| 401 | 0.065929 | 0.00 | 0.646628 | 0.0 | 0.633745 | 0.544932 | 0.958805 | 0.049759 | 1.000000 | 0.914122 | 0.808511 | 1.000000 | 0.486755 |
| 402 | 0.003643 | 0.00 | 0.210411 | 0.0 | 0.251029 | 0.483809 | 0.603502 | 0.229365 | 0.304348 | 0.229008 | 0.510638 | 0.949191 | 0.252483 |
| 403 | 0.000830 | 0.00 | 0.201613 | 0.0 | 0.234568 | 0.438590 | 0.397528 | 0.255036 | 0.173913 | 0.175573 | 0.702128 | 1.000000 | 0.194260 |
404 rows × 13 columns
knn.fit(X_train,y_train)
KNeighborsRegressor()
# 评判回归问题 用什么指标:score? mae mse
from sklearn.metrics import mean_squared_error
mean_squared_error(y_train,knn.predict(X_train))
15.028524752475246
mean_squared_error(y_test,knn.predict(X_test))
204.1549686274509
实验:adaboosting
# AdaBoost regression using a k-nearest-neighbours regressor as the base learner, with n_estimators=100.
# NOTE(review): later scikit-learn releases are reported to rename `base_estimator` to `estimator` — confirm against the installed version.
aba = AdaBoostRegressor(base_estimator=KNeighborsRegressor(),n_estimators=100)
aba.fit(X_train,y_train)
AdaBoostRegressor(base_estimator=KNeighborsRegressor(), n_estimators=100)
mean_squared_error(y_train,aba.predict(X_train))
4.755455445544554
mean_squared_error(y_test,aba.predict(X_test))
127.99668627450978
# Track how the test-set MSE evolves as boosting proceeds.
# staged_predict yields the prediction of the ensemble built so far after each
# boosting iteration (not the prediction of each individual base learner), so
# entry i of err_list is the error of the ensemble after stage i.
err_list = []
for i,y_ in enumerate(aba.staged_predict(X_test)):
    # The loop body must be indented; the extracted text had flattened it.
    err = mean_squared_error(y_test,y_)
    err_list.append(err)
    print('C{}:ERROR:{}'.format(i,err))
C0:ERROR:199.32693725490194
C1:ERROR:199.32693725490194
C2:ERROR:220.0897333333333
C3:ERROR:199.7595607843137
C4:ERROR:207.813894117647
C5:ERROR:207.82781176470584
C6:ERROR:192.8428588235294
C7:ERROR:190.94257647058825
C8:ERROR:192.6760862745098
C9:ERROR:189.0082431372549
C10:ERROR:188.7143254901961
C11:ERROR:184.35898823529416
C12:ERROR:179.3270117647059
C13:ERROR:178.614568627451
C14:ERROR:166.1954274509804
C15:ERROR:169.03231372549018
C16:ERROR:158.3203725490196
C17:ERROR:158.4866470588235
C18:ERROR:142.9087137254902
C19:ERROR:158.1416
C20:ERROR:142.4846666666667
C21:ERROR:142.4846666666667
C22:ERROR:157.30826666666667
C23:ERROR:157.38923529411767
C24:ERROR:157.35952941176473
C25:ERROR:142.61410196078435
C26:ERROR:135.35090980392155
C27:ERROR:142.68588627450984
C28:ERROR:135.77389411764707
C29:ERROR:142.68588627450984
C30:ERROR:142.04365098039216
C31:ERROR:141.93987843137253
C32:ERROR:135.87796470588236
C33:ERROR:141.93987843137253
C34:ERROR:141.82033725490197
C35:ERROR:141.95123529411765
C36:ERROR:135.73551764705883
C37:ERROR:141.9422549019608
C38:ERROR:135.90520784313725
C39:ERROR:141.4849843137255
C40:ERROR:135.90342352941178
C41:ERROR:142.09143921568628
C42:ERROR:135.91702352941178
C43:ERROR:136.00622745098042
C44:ERROR:135.89461960784314
C45:ERROR:142.01684705882352
C46:ERROR:135.91702352941178
C47:ERROR:136.18957647058824
C48:ERROR:136.2971137254902
C49:ERROR:136.19823137254903
C50:ERROR:134.7982470588235
C51:ERROR:136.41440000000003
C52:ERROR:136.29789803921568
C53:ERROR:136.4323411764706
C54:ERROR:134.9813450980392
C55:ERROR:136.3194980392157
C56:ERROR:136.40856078431375
C57:ERROR:136.41440000000003
C58:ERROR:142.32514901960786
C59:ERROR:136.48449803921568
C60:ERROR:142.32336470588237
C61:ERROR:142.43412941176473
C62:ERROR:142.40421176470588
C63:ERROR:136.38720784313728
C64:ERROR:142.4454392156863
C65:ERROR:136.4293843137255
C66:ERROR:142.46783137254903
C67:ERROR:136.48109019607844
C68:ERROR:142.5336549019608
C69:ERROR:144.18005490196077
C70:ERROR:142.50667450980393
C71:ERROR:136.4272549019608
C72:ERROR:142.50667450980393
C73:ERROR:142.5022117647059
C74:ERROR:142.5354392156863
C75:ERROR:136.4629137254902
C76:ERROR:136.31138823529412
C77:ERROR:135.0016
C78:ERROR:136.49930196078432
C79:ERROR:135.03171764705883
C80:ERROR:135.01974901960781
C81:ERROR:128.98411372549018
C82:ERROR:135.078631372549
C83:ERROR:129.0248196078431
C84:ERROR:135.078631372549
C85:ERROR:128.98411372549018
C86:ERROR:135.078631372549
C87:ERROR:128.9571333333333
C88:ERROR:128.96369803921567
C89:ERROR:128.6254862745098
C90:ERROR:129.0248196078431
C91:ERROR:128.6512274509804
C92:ERROR:128.39080392156865
C93:ERROR:128.39080392156865
C94:ERROR:128.40443529411766
C95:ERROR:128.68032549019608
C96:ERROR:128.39080392156865
C97:ERROR:128.2014862745098
C98:ERROR:128.39080392156865
C99:ERROR:127.99668627450978
# 展示误差的变化
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set()
plt.plot(err_list)
[]
# 获取每一个基学习器的错误率
plt.plot(aba.estimator_errors_)
[]
# 每一个基学习器的权重
plt.plot(aba.estimator_weights_)
[]
GBDT观察表现
from sklearn.ensemble import RandomForestRegressor
# max_depth=None means the trees are grown fully (original author's note).
# The instance below is not assigned to anything; presumably it is created only so the notebook displays its parameters.
RandomForestRegressor()
# Try GBDT with 100 estimators. Original author's note: depth is limited to 3 (max_depth=3) because boosting wants weak learners.
# NOTE(review): max_depth is not passed here, so this relies on the library default — confirm it is 3.
gbdt = GradientBoostingRegressor(n_estimators=100)
gbdt.fit(X_train,y_train)
GradientBoostingRegressor()
mean_squared_error(y_train,gbdt.predict(X_train))
1.7840841714565248
mean_squared_error(y_test,gbdt.predict(X_test))
159.79621357980093
特征评估结果
# aba.feature_importances_ # KNN 作为基学习器时没有这个属性(见下方报错);逻辑斯蒂回归同样没有 feature_importances_,应换成带有该属性的基学习器(例如决策树回归 DecisionTreeRegressor)
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
D:\software\anaconda\lib\site-packages\sklearn\ensemble\_weight_boosting.py in feature_importances_(self)
253 norm = self.estimator_weights_.sum()
--> 254 return (sum(weight * clf.feature_importances_ for weight, clf
255 in zip(self.estimator_weights_, self.estimators_))
D:\software\anaconda\lib\site-packages\sklearn\ensemble\_weight_boosting.py in <genexpr>(.0)
253 norm = self.estimator_weights_.sum()
--> 254 return (sum(weight * clf.feature_importances_ for weight, clf
255 in zip(self.estimator_weights_, self.estimators_))
AttributeError: 'KNeighborsRegressor' object has no attribute 'feature_importances_'
The above exception was the direct cause of the following exception:
AttributeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_12168/2668898732.py in <module>
----> 1 aba.feature_importances_
D:\software\anaconda\lib\site-packages\sklearn\ensemble\_weight_boosting.py in feature_importances_(self)
257
258 except AttributeError as e:
--> 259 raise AttributeError(
260 "Unable to compute feature importances "
261 "since base_estimator does not have a "
AttributeError: Unable to compute feature importances since base_estimator does not have a feature_importances_ attribute
pd.Series(data=gbdt.feature_importances_,index=feature_names).plot(kind='bar')



