# Machine learning by example: the Iris dataset (机器学习之鸢尾花实例)

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

# Short column names used throughout the script (the feature names shipped
# with the dataset carry a unit suffix, per the note in the original script).
FEATURE_COLUMNS = ['sepal length', 'sepal width', 'petal length', 'petal width']


def build_iris_frame(iris):
    """Return the iris samples as a DataFrame with short names and a 'label' column.

    Args:
        iris: Object exposing ``data`` (samples x 4 features),
            ``feature_names`` and ``target`` (as returned by ``load_iris``).

    Returns:
        DataFrame whose columns are ``FEATURE_COLUMNS`` followed by ``'label'``.
    """
    df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
    df['label'] = iris.target
    df.columns = FEATURE_COLUMNS + ['label']
    return df


def prediction_report(y_pred, y_test):
    """Pair predictions with true labels and flag the correct ones.

    Args:
        y_pred: Iterable of predicted labels.
        y_test: Iterable of true labels, same length and order as ``y_pred``.

    Returns:
        DataFrame with columns ``'predicted'``, ``'actual'`` and ``'correct'``
        (1 where the two agree, 0 otherwise).
    """
    report = pd.DataFrame(list(zip(y_pred, y_test)), columns=['predicted', 'actual'])
    # Vectorized comparison instead of a per-row apply; also safe on empty input.
    report['correct'] = (report['predicted'] == report['actual']).astype(int)
    return report


def _plot_petal_width_hist(df, features):
    """Draw, save and show a histogram of petal width."""
    plt.figure(figsize=(10, 4))
    # The histogram is of petal width, so that is the x axis; y is a bin count.
    plt.xlabel(features[3])
    plt.ylabel('Count')
    plt.title('Iris Data Hist')
    plt.hist(df['petal width'])
    plt.savefig('Iris Data hist.png', dpi=200)
    plt.show()


def _plot_petal_scatter(iris):
    """Draw, save and show petal length vs petal width, one marker style per class."""
    X = iris.data  # sample features only
    y = iris.target  # class index of each sample: 0, 1 or 2
    features = iris.feature_names  # names of the 4 features
    targets = iris.target_names  # class names, indexed by the values in y

    plt.figure(figsize=(10, 4))
    for cls, marker in enumerate(('bs', 'kx', 'ro')):
        mask = y == cls
        plt.plot(X[mask, 2], X[mask, 3], marker, label=targets[cls])
    plt.xlabel(features[2])
    plt.ylabel(features[3])
    plt.title('Iris Data Set')
    plt.legend()
    plt.savefig('Iris Data Set.png', dpi=200)
    plt.show()


def _plot_pairplot(df):
    """Draw, save and show a seaborn pair plot coloured by label."""
    sns.pairplot(df, hue="label", diag_kind='hist')
    plt.savefig("pairplot.png")
    plt.show()


def _plot_violins(df):
    """Show a 2x2 grid of violin plots, one per feature, grouped by label."""
    # Set the style before creating the axes, otherwise it does not apply
    # to this figure.
    sns.set(style='white', palette='muted')
    fig, ax = plt.subplots(2, 2, figsize=(7, 7))
    for axis, column in zip(ax.flat, FEATURE_COLUMNS):
        sns.violinplot(x=df['label'], y=df[column], ax=axis)
        plt.setp(axis.get_xticklabels(), rotation=-90)
    fig.tight_layout()
    # plt.show() rather than fig.show(): the latter does not run an event
    # loop, so in a plain script the window may never be displayed.
    plt.show()


def _label_sepal_axes(ax):
    """Apply the shared axis labels and title of the two sepal plots."""
    ax.set_ylabel('Sepal length')
    ax.set_xlabel('Sepal width')
    ax.set_title('Setosa sepal width vs sepal length', fontsize=14, y=1.02)


def _regress_setosa_sepals(df):
    """Plot sepal width vs length for the first 50 rows and fit an OLS line.

    The first 50 rows are the ones the plot titles treat as the Setosa
    samples. The raw scatter and the scatter with fitted line are each
    saved to a PNG, and the OLS summary is printed.
    """
    y = df['sepal length'][:50]
    x = df['sepal width'][:50]

    fig, ax = plt.subplots(figsize=(7, 7))
    ax.scatter(x, y)
    _label_sepal_axes(ax)
    plt.savefig("Setosa sepal width vs sepal length.png")
    plt.show()  # the original omitted the call parentheses, a silent no-op

    # Ordinary least squares with an intercept term.
    X = sm.add_constant(x)
    results = sm.OLS(y, X).fit()
    print(results.summary())

    fig, ax = plt.subplots(figsize=(7, 7))
    ax.plot(x, results.fittedvalues, label='regression line')
    ax.scatter(x, y, label='data point', color='r')
    _label_sepal_axes(ax)
    ax.legend(loc=2)
    plt.savefig("Setosa.png")
    plt.show()


def _fit_and_report(clf, X, y):
    """Fit ``clf`` on a 70/30 split, print the per-sample report and accuracy.

    Returns:
        The fitted classifier.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
    clf.fit(X_train, y_train)
    report = prediction_report(clf.predict(X_test), y_test)
    print(report)
    print(report['correct'].mean())  # fraction of correct predictions
    return clf


def _plot_feature_importances(forest, names):
    """Draw, save and show a bar chart of feature importances, largest first.

    Error bars are the standard deviation of each feature's importance
    across the individual trees of the forest.
    """
    importances = forest.feature_importances_
    spread = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
    ranked = sorted(zip(importances, names, spread), key=lambda item: item[0], reverse=True)
    heights, labels, errors = zip(*ranked)
    positions = range(len(importances))

    # Start a fresh figure so the bars are never drawn onto the previous plot
    # on backends where plt.show() leaves figures open.
    plt.figure()
    plt.bar(positions, heights, color='r', yerr=errors, align='center')
    plt.xticks(positions, labels)
    plt.savefig("f_importances.png")
    plt.show()


def main():
    """Run the full Iris walkthrough: plots, OLS regression, random forest, SVM."""
    # Load the iris dataset straight into a pandas DataFrame.
    iris = load_iris()
    df = build_iris_frame(iris)

    plt.style.use('ggplot')

    _plot_petal_width_hist(df, iris.feature_names)
    _plot_petal_scatter(iris)
    _plot_pairplot(df)
    _plot_violins(df)

    # Linear regression
    _regress_setosa_sepals(df)

    # scikit-learn: random forest, then its feature importances
    forest = _fit_and_report(
        RandomForestClassifier(max_depth=5, n_estimators=10),
        df.iloc[:, :4],
        df.iloc[:, 4],
    )
    _plot_feature_importances(forest, df.columns[:4])

    # SVM: one-vs-rest linear SVC, trained on string labels
    _fit_and_report(
        OneVsRestClassifier(SVC(kernel='linear')),
        df.iloc[:, :4],
        np.array(df.iloc[:, 4]).astype(str),
    )


if __name__ == "__main__":
    main()
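# Tags: Pandas, Matlab, Seaborn, Statsmodels, Scikit-learn
# 机器学习之鸢尾花实例 (Machine learning by example: the Iris dataset)

# Python相关栏目本月热门文章 (Most popular articles this month in the Python section)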