import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
def load_iris_frame():
    """Load the iris dataset and mirror it into a pandas DataFrame.

    Returns:
        A ``(iris, df)`` pair: ``iris`` is the object returned by
        ``load_iris()``; ``df`` holds the four feature columns plus an
        integer ``label`` column taken from ``iris.target``.
    """
    iris = load_iris()
    df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
    df['label'] = iris.target
    # Replace the dataset's "... (cm)" column names with shorter ones that are
    # easier to index by.
    df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
    return iris, df


def plot_petal_width_hist(iris, df):
    """Draw, save and show a histogram of petal width.

    Args:
        iris: The loaded iris dataset (used for the axis label text).
        df: DataFrame with a ``'petal width'`` column.
    """
    plt.figure(figsize=(10, 4))
    # The histogram is of petal width, so that is the x-axis quantity; the
    # y-axis is the number of samples per bin.
    plt.xlabel(iris.feature_names[3])
    plt.ylabel('Frequency')
    plt.title('Iris Data Hist')
    plt.hist(df['petal width'])
    plt.savefig('Iris Data hist.png', dpi=200)
    plt.show()


def plot_petal_scatter(iris):
    """Draw, save and show petal length vs. petal width, one marker per class.

    Args:
        iris: The loaded iris dataset; ``data`` columns 2 and 3 are plotted
            and ``target`` selects the class of each sample.
    """
    samples = iris.data           # feature matrix only
    classes = iris.target         # class id of each sample
    features = iris.feature_names
    targets = iris.target_names   # class names, indexed by class id

    plt.figure(figsize=(10, 4))
    for class_id, marker in enumerate(('bs', 'kx', 'ro')):
        mask = classes == class_id
        plt.plot(samples[mask, 2], samples[mask, 3], marker, label=targets[class_id])
    plt.xlabel(features[2])
    plt.ylabel(features[3])
    plt.title('Iris Data Set')
    plt.legend()
    plt.savefig('Iris Data Set.png', dpi=200)
    plt.show()


def plot_pairplot(df):
    """Draw, save and show a seaborn pair plot coloured by ``label``."""
    sns.pairplot(df, hue="label", diag_kind='hist')
    plt.savefig("pairplot.png")
    plt.show()


def plot_violins(df):
    """Show a 2x2 grid of violin plots, one per feature, grouped by label.

    The figure is only displayed, not written to disk.
    """
    fig, axes = plt.subplots(2, 2, figsize=(7, 7))
    # NOTE(review): sns.set() changes global plotting defaults and is called
    # after the axes already exist; the call order is kept as it was so the
    # rendered output does not change.
    sns.set(style='white', palette='muted')
    columns = ['sepal length', 'sepal width', 'petal length', 'petal width']
    for axis, column in zip(axes.flat, columns):
        sns.violinplot(x=df['label'], y=df[column], ax=axis)
        plt.setp(axis.get_xticklabels(), rotation=-90)
    fig.tight_layout()
    # plt.show() blocks like every other figure in this script; Figure.show()
    # returns immediately when run as a script.
    plt.show()


def fit_setosa_regression(df):
    """Regress sepal length on sepal width for the first 50 rows.

    Plots the raw points, fits an ordinary-least-squares model with an
    intercept, prints the statsmodels summary, then plots the fitted values
    over the data. Both figures are saved and shown.

    Args:
        df: DataFrame with ``'sepal width'`` and ``'sepal length'`` columns.

    Returns:
        The fitted statsmodels results object.
    """
    # The plot titles treat the first 50 rows as the setosa samples.
    subset = df.iloc[:50]
    x = subset['sepal width']
    y = subset['sepal length']

    fig, ax = plt.subplots(figsize=(7, 7))
    ax.scatter(x, y)
    ax.set_ylabel('Sepal length')
    ax.set_xlabel('Sepal width')
    ax.set_title('Setosa sepal width vs sepal length', fontsize=14, y=1.02)
    plt.savefig("Setosa sepal width vs sepal length.png")
    plt.show()

    # add_constant() adds the intercept column to the design matrix.
    design = sm.add_constant(x)
    results = sm.OLS(y, design).fit()
    print(results.summary())

    fig, ax = plt.subplots(figsize=(7, 7))
    ax.plot(x, results.fittedvalues, label='regression line')
    ax.scatter(x, y, label='data point', color='r')
    ax.set_ylabel('Sepal length')
    ax.set_xlabel('Sepal width')
    ax.set_title('Setosa sepal width vs sepal length', fontsize=14, y=1.02)
    ax.legend(loc=2)
    plt.savefig("Setosa.png")
    plt.show()
    return results


def score_predictions(y_pred, y_true):
    """Compare predictions with true labels.

    Args:
        y_pred: Iterable of predicted labels.
        y_true: Iterable of true labels, in the same order as ``y_pred``.

    Returns:
        A ``(report, accuracy)`` pair. ``report`` is a DataFrame with columns
        ``predicted``, ``actual`` and ``correct`` (1 where they match, else
        0); ``accuracy`` is the mean of ``correct`` as a float.
    """
    # zip() pairs the values positionally, so a shuffled pandas index on
    # y_true cannot misalign the rows.
    report = pd.DataFrame(list(zip(y_pred, y_true)), columns=['predicted', 'actual'])
    report['correct'] = (report['predicted'] == report['actual']).astype(int)
    return report, float(report['correct'].mean())


def run_random_forest(df):
    """Train a random forest on a 70/30 split and print its test accuracy.

    Args:
        df: DataFrame whose first four columns are features and whose fifth
            column is the class label.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    clf = RandomForestClassifier(max_depth=5, n_estimators=10)
    features = df.iloc[:, :4]
    labels = df.iloc[:, 4]
    # No random_state is passed, so the split (and the printed accuracy)
    # differs from run to run.
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=.3)
    clf.fit(X_train, y_train)
    report, accuracy = score_predictions(clf.predict(X_test), y_test)
    print(report)
    print(accuracy)
    return clf


def plot_feature_importances(clf, feature_names):
    """Bar-plot feature importances, highest first, with per-tree std bars.

    Args:
        clf: A fitted ensemble exposing ``feature_importances_`` and
            ``estimators_``.
        feature_names: Names matching the order of the importances.
    """
    importances = clf.feature_importances_
    spread = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)
    ranked = sorted(zip(importances, feature_names, spread),
                    key=lambda item: item[0], reverse=True)
    heights, labels, errors = zip(*ranked)
    positions = range(len(importances))
    plt.bar(positions, heights, color='r', yerr=errors, align='center')
    plt.xticks(positions, labels)
    plt.savefig("f_importances.png")
    plt.show()


def run_linear_svm(df):
    """Train a one-vs-rest linear SVM on a 70/30 split and print its accuracy.

    Args:
        df: DataFrame whose first four columns are features and whose fifth
            column is the class label.

    Returns:
        The fitted ``OneVsRestClassifier``.
    """
    clf = OneVsRestClassifier(SVC(kernel='linear'))
    features = df.iloc[:, :4]
    # Labels are converted to strings before training, so the predictions are
    # strings too and compare equal to y_test.
    labels = np.array(df.iloc[:, 4]).astype(str)
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=.3)
    clf.fit(X_train, y_train)
    report, accuracy = score_predictions(clf.predict(X_test), y_test)
    print(report)
    print(accuracy)
    return clf


def main():
    """Run the whole iris walkthrough: plots, OLS, random forest, SVM."""
    iris, df = load_iris_frame()
    plt.style.use('ggplot')

    # Exploratory plots.
    plot_petal_width_hist(iris, df)
    plot_petal_scatter(iris)
    plot_pairplot(df)
    plot_violins(df)

    # Linear regression with statsmodels.
    fit_setosa_regression(df)

    # Classification with scikit-learn.
    forest = run_random_forest(df)
    plot_feature_importances(forest, df.columns[:4])
    run_linear_svm(df)


if __name__ == "__main__":
    main()
# Tags: Pandas, Matlab, Seaborn, Statsmodels, Scikit-learn



