from sklearn.datasets import load_iris # load built-in datasets
from sklearn.model_selection import train_test_split # split data into train/test sets
from sklearn.feature_extraction import DictVectorizer # extract features from dicts
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer # extract features from text
from sklearn.preprocessing import MinMaxScaler, StandardScaler # normalize / standardize data
from sklearn.feature_selection import VarianceThreshold # drop low-variance features
from sklearn.decomposition import PCA # reduce the number of feature columns while preserving information
from scipy.stats import pearsonr
import jieba
import pandas as pd
def data_test():
    """Load the iris dataset, inspect it, and split it into train/test sets.

    :return: None (prints dataset details and the training features)
    """
    iris = load_iris()
    # Fixed: the original print strings contained a literal "n" where "\n" was intended.
    print("Dataset:\n", iris)
    print("Feature names:\n", iris.feature_names)
    print("Feature values:\n", iris.data, iris.data.shape)
    # test_size=0.2 reserves 20% of the samples for the test set
    x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2)
    print(x_train, x_train.shape)
def dict_dome():
    """Dict feature extraction with DictVectorizer.

    One-hot encodes the categorical 'city' key and passes the numeric
    'temperature' key through unchanged.

    :return: None (prints the transformed matrix and feature names)
    """
    data = [{'city': '北京', 'temperature': 100}, {'city': '上海', 'temperature': 60}, {'city': '深圳', 'temperature': 30}]
    # 1. Instantiate a transformer.
    # sparse=True returns a sparse matrix: only non-zero entries are stored,
    # which saves memory compared to a dense array.
    transfer = DictVectorizer(sparse=True)
    # 2. Call fit_transform
    data_new = transfer.fit_transform(data)
    print("data_new:", data_new)
    print("data_new.toarray:", data_new.toarray())
    print("feature names:", transfer.feature_names_)
    # Fixed: get_feature_names() was deprecated in sklearn 1.0 and removed in 1.2;
    # get_feature_names_out() is the supported replacement.
    print(transfer.get_feature_names_out())
def count_dome():
    """Text feature extraction (bag-of-words counts) with CountVectorizer.

    :return: None (prints the raw data, count matrix, and vocabulary)
    """
    data = ["life is short,i like like python", "life is too long,i dislike python"]
    # Instantiate a transformer
    transfer = CountVectorizer()
    # Call fit_transform
    data_new = transfer.fit_transform(data)
    print(data)
    # Fixed: the original print label contained a literal "n" where "\n" was intended.
    print("data_new\n", data_new)
    print("data_new_toarray", data_new.toarray())
    # Fixed: get_feature_names() was removed in sklearn 1.2; use get_feature_names_out().
    print("feature_name", transfer.get_feature_names_out())
def count_chinese_dome():
    """Feature extraction for Chinese text.

    Chinese has no whitespace word boundaries, so each sentence is first
    segmented with jieba (via the module-level cut() helper) before being
    fed to CountVectorizer.

    :return: None (prints the count matrix and vocabulary)
    """
    data = ["一种还是一种今天很残酷,明天更残酷,后天很美好,但绝对大部分是死在明天晚上,所以每个人不要放弃今天。",
            "我们看到的从很远星系来的光是在几百万年之前发出的,这样当我们看到宇宙时,我们是在看它的过去。",
            "如果只用一种方式了解某样事物,你就不会真正了解它。了解事物真正含义的秘密取决于如何将其与我们所了解的事物相联系。"]
    # Pre-segment each sentence into space-separated tokens
    data = [cut(x) for x in data]
    # Instantiate the transformer; stop_words excludes the listed words from counting
    transfer = CountVectorizer(stop_words=["一种"])
    # Call fit_transform
    data_new = transfer.fit_transform(data)
    print(data_new)
    print(data_new.toarray())
    # Fixed: get_feature_names() was removed in sklearn 1.2; use get_feature_names_out().
    print(transfer.get_feature_names_out())
def cut(text):
    """Segment a Chinese sentence with jieba and join the tokens with spaces."""
    tokens = jieba.cut(text)
    return " ".join(tokens)
def tf_idf_dome():
    """TF-IDF text feature extraction.

    Like count_chinese_dome(), but weights each term by TF-IDF so that
    words important to one document (rather than common everywhere) score
    higher.

    :return: None (prints the TF-IDF matrix and vocabulary)
    """
    data = ["一种还是一种今天很残酷,明天更残酷,后天很美好,但绝对大部分是死在明天晚上,所以每个人不要放弃今天。",
            "我们看到的从很远星系来的光是在几百万年之前发出的,这样当我们看到宇宙时,我们是在看它的过去。",
            "如果只用一种方式了解某样事物,你就不会真正了解它。了解事物真正含义的秘密取决于如何将其与我们所了解的事物相联系。"]
    # Pre-segment each sentence into space-separated tokens
    data = [cut(x) for x in data]
    # Instantiate the transformer; stop_words excludes the listed words
    transfer = TfidfVectorizer(stop_words=["一种"])
    # Call fit_transform
    data_new = transfer.fit_transform(data)
    print(data_new)
    print(data_new.toarray())
    # Fixed: get_feature_names() was removed in sklearn 1.2; use get_feature_names_out().
    print(transfer.get_feature_names_out())
def minmax_dome(path="dating.txt"):
    """Normalize data to the [0, 1] range with MinMaxScaler.

    :param path: CSV file to read (default "dating.txt" keeps the old behavior);
                 only its first three columns are scaled.
    :return: None (prints the normalized array)
    """
    # Load the data and keep only the first three columns
    data = pd.read_csv(path)
    data = data.iloc[:, :3]
    # Instantiate the transformer
    transfer = MinMaxScaler()
    # Call fit_transform
    data_new = transfer.fit_transform(data)
    print(data_new)
def standard_demo(path="dating.txt"):
    """Standardize data (zero mean, unit variance) with StandardScaler.

    :param path: CSV file to read (default "dating.txt" keeps the old behavior);
                 only its first three columns are scaled.
    :return: None (prints the standardized array)
    """
    # Load the data and keep only the first three columns
    data = pd.read_csv(path)
    data = data.iloc[:, :3]
    # Instantiate the transformer
    transfer = StandardScaler()
    # Call fit_transform
    data_new = transfer.fit_transform(data)
    print(data_new)
def variance_demo(path='factor_returns.csv'):
    """Feature selection by variance threshold, plus a correlation check.

    Note: VarianceThreshold removes LOW-variance features (features whose
    variance is below the threshold), i.e. columns that barely change — the
    original comment describing it as removing highly-correlated features
    was inaccurate. Correlation between two specific columns is then
    measured separately with pearsonr.

    :param path: CSV file to read (default 'factor_returns.csv' keeps the
                 old behavior); must contain 'pe_ratio' and 'pb_ratio' columns.
    :return: None (prints shapes, the filtered array, and the correlation)
    """
    # Load the data; drop the first column and the last two
    data = pd.read_csv(path)
    data = data.iloc[:, 1:-2]
    print(data.shape)
    # Instantiate the transformer; threshold is the minimum variance a
    # feature must have to be kept
    transfer = VarianceThreshold(threshold=10)
    # Call fit_transform
    data_new = transfer.fit_transform(data)
    print(data_new)
    print(data_new.shape)
    # Pearson correlation coefficient between two features
    r = pearsonr(data["pe_ratio"], data["pb_ratio"])
    print(r)
def pca_demo():
    """Principal component analysis (PCA) for dimensionality reduction.

    :return: None (prints the reduced data)
    """
    # Sample data: 3 rows, 4 features
    data = [[2, 3, 4, 5], [3, 6, 7, 8], [5, 6, 8, 9]]
    # Instantiate the transformer.
    # n_components as a float in (0, 1) keeps enough components to explain
    # that fraction of the variance; an integer keeps that many components.
    transfer = PCA(n_components=0.95)
    # Call fit_transform
    data_new = transfer.fit_transform(data)
    # Fixed: the original print label contained a literal "n" where "\n" was intended.
    print("data_new:\n", data_new)
if __name__ == '__main__':
    # Each demo is independent — uncomment the one you want to run.
    # data_test()           # load the iris dataset and split it
    # dict_dome()           # dict feature extraction
    # count_dome()          # English text feature extraction
    # count_chinese_dome()  # Chinese text feature extraction
    # tf_idf_dome()         # tf-idf extraction of the more important words
    # minmax_dome()         # feature normalization
    # standard_demo()       # feature standardization
    # variance_demo()       # variance-based feature selection
    pca_demo()              # dimensionality reduction while preserving information