import pandas as pd import numpy as np import seaborn as sns import matplotlib.pyplot as plt from IPython.display import Image import os %matplotlib inline
notes:
- 使用 seaborn 进行图形化展示
# Matplotlib global configuration (the original comment said "PIL", but these
# settings belong to matplotlib's rcParams, not the PIL imaging library).
plt.rcParams['font.sans-serif'] = ['SimHei']  # SimHei so CJK labels render correctly
plt.rcParams['axes.unicode_minus'] = False    # SimHei lacks U+2212; use ASCII '-' for negatives
plt.rcParams['figure.figsize'] = (10, 6)      # default figure size in inches
# Locate ../datasets/train.csv relative to the current working directory.
file_path = os.path.join(os.getcwd(), '..', 'datasets')
file_name = 'train.csv'
file_url = os.path.join(file_path, file_name)

# Load the Titanic training set and preview the first rows.
data = pd.read_csv(file_url)
data.head()
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# Impute the missing values, then confirm that no NaNs remain.
# - Cabin / Embarked are categorical: fill with a sentinel / the most common port.
# - Age is continuous: fill with the column mean.
age_mean = data['Age'].mean()
data = data.fillna({'Cabin': 'NA', 'Embarked': 'S', 'Age': age_mean})
# Per-column null counts, largest first (expected: all zeros).
data.isnull().sum().sort_values(ascending=False)
PassengerId 0 Survived 0 Pclass 0 Name 0 Sex 0 Age 0 SibSp 0 Parch 0 Ticket 0 Fare 0 Cabin 0 Embarked 0 dtype: int64
notes:
- Series.fillna(value)
  使用此函数填补 Series 中的 NaN 值
# Select the model features and one-hot encode the categorical columns.
feature_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
data_feature_values = data[feature_cols]
print(data_feature_values.head())
# get_dummies expands the object-dtype columns (Sex, Embarked) into
# 0/1 indicator columns; numeric columns pass through unchanged.
data_feature_values = pd.get_dummies(data_feature_values)
print(data_feature_values.head())
Pclass Sex Age SibSp Parch Fare Embarked 0 3 male 22.0 1 0 7.2500 S 1 1 female 38.0 1 0 71.2833 C 2 3 female 26.0 0 0 7.9250 S 3 1 female 35.0 1 0 53.1000 S 4 3 male 35.0 0 0 8.0500 S Pclass Age SibSp Parch Fare Sex_female Sex_male Embarked_C 0 3 22.0 1 0 7.2500 0 1 0 1 1 38.0 1 0 71.2833 1 0 1 2 3 26.0 0 0 7.9250 1 0 0 3 1 35.0 1 0 53.1000 1 0 0 4 3 35.0 0 0 8.0500 0 1 0 Embarked_Q Embarked_S 0 0 1 1 0 0 2 0 1 3 0 1 4 0 1
notes:
- pd.get_dummies(df)
使用其可以将分类(离散型)的列转换为 0/1 指示(独热编码)列
from sklearn.model_selection import train_test_split

# Features / target, then a stratified split (default test_size=0.25):
# stratify=y keeps the survived/died ratio equal in both partitions.
X = data_feature_values
y = data['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

# Sample count, positive count, and positive rate of the target.
print(y.count(), y.sum(), y.sum() / y.count())
print(X_train.shape, X_test.shape)
891 342 0.3838383838383838 0.250280583613917 (668, 10) (223, 10)
notes:
- 使用 train_test_split 函数对数据集进行分割
- 可以使用 train_size=所在比例 进行划分也可以使用 test_size=所占比例 进行划分stratify=y, 将按照 y 中标签的比例进行划分,即,训练集和测试集中的标签占比结果和整个标签列标签占比相同
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

print('======== X_train, y_train ========')
print(X_train.head())
print('-------------')
print(y_train.head())
print('========= end ==========')
# Fit a logistic-regression baseline.
# max_iter is raised from the default 100: on this unscaled data the lbfgs
# solver previously hit the iteration limit and emitted a ConvergenceWarning
# ("Increase the number of iterations (max_iter) or scale the data").
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
# For a classifier, .score returns mean accuracy on the given split.
print('Training set score: ', lr.score(X_train, y_train))
print('Testing set score: ', lr.score(X_test, y_test))
======== X_train, y_train ========
Pclass Age SibSp Parch Fare Sex_female Sex_male Embarked_C
671 1 31.0 1 0 52.000 0 1 0
417 2 18.0 0 2 13.000 1 0 0
634 3 9.0 3 2 27.900 1 0 0
323 2 22.0 1 1 29.000 1 0 0
379 3 19.0 0 0 7.775 0 1 0
Embarked_Q Embarked_S
671 0 1
417 0 1
634 0 1
323 0 1
379 0 1
-------------
671 0
417 1
634 0
323 1
379 0
Name: Survived, dtype: int64
========= end ==========
Training set score: 0.8023952095808383
Testing set score: 0.7847533632286996
C:UsersWuDiXDesktoptemp12-winter-vacation12-01-dataWhaleEnv-datawhale-01libsite-packagessklearnlinear_model_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
- 对分类器而言,lr.score 返回的是平均准确率(mean accuracy);决定系数 R² 只用于回归模型的 score
lr.fit()
- 函数中第一个参数是一个 df, 第二个 Series
# Hard 0/1 class predictions; show the first ten test rows.
pred = lr.predict(X_test)
pred[:10]
array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0], dtype=int64)
# Predict class-membership probabilities: one row per sample,
# column 0 = P(class 0), column 1 = P(class 1), rows sum to 1.
pred_proba = lr.predict_proba(X_test)
print(X_test.head())
pred_proba[:10]
Pclass Age SibSp Parch Fare Sex_female Sex_male Embarked_C
288 2 42.0 0 0 13.0000 0 1 0
869 3 4.0 1 1 11.1333 0 1 0
182 3 9.0 4 2 31.3875 0 1 0
684 2 60.0 1 1 39.0000 0 1 0
599 1 49.0 1 0 56.9292 0 1 1
Embarked_Q Embarked_S
288 0 1
869 0 1
182 0 1
684 0 1
599 0 0
array([[0.84995574, 0.15004426],
[0.84233073, 0.15766927],
[0.94909621, 0.05090379],
[0.94252973, 0.05747027],
[0.70411225, 0.29588775],
[0.50580607, 0.49419393],
[0.40320661, 0.59679339],
[0.733531 , 0.266469 ],
[0.87814836, 0.12185164],
[0.87546022, 0.12453978]])
# 10-fold cross-validation of a weakly regularized model (C=100).
from sklearn.model_selection import cross_val_score

# max_iter raised from the default 100 for the same reason as the earlier fit:
# lbfgs does not converge on this unscaled data within 100 iterations.
lr = LogisticRegression(C=100, max_iter=1000)
scores = cross_val_score(lr, X_train, y_train, cv=10)
print(scores)
[0.82089552 0.7761194 0.82089552 0.79104478 0.85074627 0.86567164 0.73134328 0.86567164 0.75757576 0.6969697 ]



