7.6 沪深300指数走势预测
import os
# Set the working directory. A raw string is required here: in a normal string
# literal "\U" begins a unicode escape and the path fails to parse.
os.chdir(r"C:\Users\Administrator\Desktop")
import pandas as pd
import numpy as np
7.6.1 读取数据
td=pd.read_excel('index300.xlsx') # load the index data from the Excel workbook into a DataFrame
td.head(6) # preview the first 6 rows
|   | Indexcd | Idxtrd01 | Idxtrd02 | Idxtrd03 | Idxtrd04 | Idxtrd05 | Idxtrd06 | Idxtrd07 | Idxtrd08 |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 300 | 2014-01-02 | 2323.43 | 2325.99 | 2310.65 | 2321.98 | 451942.91 | 4901221.11 | -0.3454 |
| 1 | 300 | 2014-01-03 | 2311.97 | 2314.84 | 2280.89 | 2290.78 | 597826.45 | 5773970.99 | -1.3436 |
| 2 | 300 | 2014-01-06 | 2286.37 | 2286.37 | 2229.33 | 2238.64 | 663004.03 | 5997936.01 | -2.2762 |
| 3 | 300 | 2014-01-07 | 2222.31 | 2246.79 | 2218.65 | 2238.00 | 437531.03 | 4256564.81 | -0.0284 |
| 4 | 300 | 2014-01-08 | 2240.64 | 2262.58 | 2228.42 | 2241.91 | 513488.54 | 5069148.89 | 0.1747 |
| 5 | 300 | 2014-01-09 | 2236.97 | 2258.89 | 2220.80 | 2222.22 | 559870.41 | 5439949.13 | -0.8783 |
7.6.2 计算各种指标
# Technical indicators derived from the raw index columns. Column meanings
# follow the original inline notes: Idxtrd02 open, Idxtrd03 high, Idxtrd04 low,
# Idxtrd05 close, Idxtrd06 volume, Idxtrd08 return.
close_px = td['Idxtrd05'].values
ma10_close = td['Idxtrd05'].rolling(10).mean()    # 10-row moving average of the close
ma10_volume = td['Idxtrd06'].rolling(10).mean()   # 10-row moving average of the volume

A1 = close_px / ma10_close                           # close relative to its moving average
A2 = td['Idxtrd06'].values / ma10_volume             # volume relative to its moving average
A3 = td['Idxtrd08'].values                           # return
A4 = td['Idxtrd03'].values / ma10_close              # high relative to the moving-average close
A5 = td['Idxtrd04'].values / ma10_close              # low relative to the moving-average close
A6 = td['Idxtrd03'].values - td['Idxtrd04'].values   # range: high minus low
A7 = close_px - td['Idxtrd02'].values                # intraday move: close minus open

# The first 9 entries of the ratio indicators are NaN because the 10-row
# window is not yet full.
print(A1)
print(A5)
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
...
240 1.008022
241 1.033503
242 1.029035
243 1.024952
244 1.042181
Name: Idxtrd05, Length: 245, dtype: float64
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
...
240 0.975050
241 1.000262
242 1.014258
243 1.014464
244 1.018230
Name: Idxtrd05, Length: 245, dtype: float64
# Assemble the seven indicators into a single DataFrame, one column per indicator.
X = pd.DataFrame(dict(A1=A1, A2=A2, A3=A3, A4=A4, A5=A5, A6=A6, A7=A7))
X.head(6)  # preview the first 6 rows
|   | A1 | A2 | A3 | A4 | A5 | A6 | A7 |
|---|---|---|---|---|---|---|---|
| 0 | NaN | NaN | -0.3454 | NaN | NaN | 15.34 | -1.45 |
| 1 | NaN | NaN | -1.3436 | NaN | NaN | 33.95 | -21.19 |
| 2 | NaN | NaN | -2.2762 | NaN | NaN | 57.04 | -47.73 |
| 3 | NaN | NaN | -0.0284 | NaN | NaN | 28.14 | 15.69 |
| 4 | NaN | NaN | 0.1747 | NaN | NaN | 34.16 | 1.27 |
| 5 | NaN | NaN | -0.8783 | NaN | NaN | 38.09 | -14.75 |
X = X.iloc[9:-1,] # keep rows from position 9 (the 10th row) up to, but not including, the last row
计算决策变量Y
# Label each step by the sign of the close-to-close change (next close minus
# current close): +1 for a rise, -1 for a fall or no change.
close_series = td['Idxtrd05'].values
Y = (close_series[1:] - close_series[:-1])[9:]  # skip the first 9 differences (mind the row offset from differencing)

# Both masks are taken before any value is overwritten.
rose = Y > 0
fell_or_flat = Y <= 0
Y[rose] = 1
Y[fell_or_flat] = -1

Y = Y.reshape(-1, 1)  # column vector, one label per row
print(Y)
[[ 1.]
[-1.]
[-1.]
[ 1.]
[ 1.]
[-1.]
[ 1.]
[-1.]
[ 1.]
[ 1.]
[-1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[-1.]
[ 1.]
[ 1.]
[-1.]
[ 1.]
[-1.]
[-1.]
[-1.]
[-1.]
[ 1.]
[-1.]
[ 1.]
[ 1.]
[-1.]
[-1.]
[ 1.]
[-1.]
[-1.]
[ 1.]
[ 1.]
[ 1.]
[-1.]
[ 1.]
[-1.]
[-1.]
[-1.]
[ 1.]
[ 1.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[ 1.]
[ 1.]
[-1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[-1.]
[-1.]
[-1.]
[ 1.]
[-1.]
[-1.]
[-1.]
[ 1.]
[-1.]
[-1.]
[-1.]
[-1.]
[ 1.]
[ 1.]
[-1.]
[ 1.]
[-1.]
[-1.]
[-1.]
[ 1.]
[-1.]
[-1.]
[-1.]
[ 1.]
[-1.]
[ 1.]
[ 1.]
[-1.]
[ 1.]
[ 1.]
[-1.]
[ 1.]
[-1.]
[ 1.]
[-1.]
[-1.]
[ 1.]
[-1.]
[-1.]
[ 1.]
[-1.]
[-1.]
[ 1.]
[ 1.]
[-1.]
[-1.]
[-1.]
[ 1.]
[-1.]
[ 1.]
[-1.]
[ 1.]
[ 1.]
[ 1.]
[-1.]
[ 1.]
[ 1.]
[-1.]
[-1.]
[ 1.]
[-1.]
[-1.]
[ 1.]
[ 1.]
[ 1.]
[-1.]
[-1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[-1.]
[ 1.]
[-1.]
[ 1.]
[-1.]
[-1.]
[-1.]
[ 1.]
[ 1.]
[-1.]
[ 1.]
[-1.]
[ 1.]
[ 1.]
[ 1.]
[-1.]
[-1.]
[ 1.]
[-1.]
[-1.]
[ 1.]
[-1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[-1.]
[-1.]
[-1.]
[ 1.]
[-1.]
[-1.]
[ 1.]
[ 1.]
[ 1.]
[-1.]
[ 1.]
[ 1.]
[-1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[-1.]
[-1.]
[-1.]
[ 1.]
[-1.]
[-1.]
[ 1.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[-1.]
[ 1.]
[-1.]
[ 1.]
[-1.]
[ 1.]
[-1.]
[ 1.]
[-1.]
[-1.]
[-1.]
[-1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[-1.]
[ 1.]
[-1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[-1.]
[ 1.]
[ 1.]
[-1.]
[-1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]]
7.6.3 模型求解
# 1. Split into training and test sets: everything before the final 30 rows is
#    used for training, the final 30 rows are held out for testing.
x_cut = len(X) - 30
y_cut = len(Y) - 30
x_train, x_test = X.iloc[:x_cut, :], X.iloc[x_cut:, :]   # features
Y_train, Y_test = Y[:y_cut], Y[y_cut:]                   # labels
# 2.模型求解和检验
支持向量机模型
from sklearn import svm

# Support vector classifier with an RBF kernel.
# Y_train is a column vector; scikit-learn expects 1-D labels, so it is
# flattened with ravel() to avoid the DataConversionWarning.
clf = svm.SVC(kernel='rbf')
clf.fit(x_train, Y_train.ravel())
rv1 = clf.score(x_train, Y_train.ravel())  # accuracy on the training data
R = clf.predict(x_test)
R = R.reshape(len(R), 1)        # column vector so it lines up element-wise with Y_test
Z = R - Y_test                  # 0 wherever the prediction equals the true label
Rs1 = len(Z[Z == 0]) / len(Z)   # share of correct predictions on the test data
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
return f(*args, **kwargs)
逻辑回归
from sklearn.linear_model import LogisticRegression as LR

# Logistic regression classifier.
# Y_train is a column vector; scikit-learn expects 1-D labels, so it is
# flattened with ravel() to avoid the DataConversionWarning.
lr = LR()
lr.fit(x_train, Y_train.ravel())
rv2 = lr.score(x_train, Y_train.ravel())  # accuracy on the training data
R = lr.predict(x_test)
R = R.reshape(len(R), 1)        # column vector so it lines up element-wise with Y_test
Z = R - Y_test                  # 0 wherever the prediction equals the true label
Rs2 = len(Z[Z == 0]) / len(Z)   # share of correct predictions on the test data
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
return f(*args, **kwargs)
神经网络模型
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron classifier with hidden layers of 5 and 2 units.
# Y_train is a column vector; scikit-learn expects 1-D labels, so it is
# flattened with ravel() to avoid the DataConversionWarning.
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(x_train, Y_train.ravel())
rv3 = clf.score(x_train, Y_train.ravel())  # accuracy on the training data
R = clf.predict(x_test)
R = R.reshape(len(R), 1)        # column vector so it lines up element-wise with Y_test
Z = R - Y_test                  # 0 wherever the prediction equals the true label
Rs3 = len(Z[Z == 0]) / len(Z)   # share of correct predictions on the test data
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
return f(*args, **kwargs)
# Report the training accuracy of the three models, then their accuracy on
# the held-out test rows.
train_report = (
    ('支持向量机模型准确率:', rv1),
    ('逻辑模型准确率:', rv2),
    ('神经网络模型准确率:', rv3),
)
test_report = (
    ('支持向量机模型预测准确率:', Rs1),
    ('逻辑模型预测准确率:', Rs2),
    ('神经网络模型预测准确率:', Rs3),
)
for label, score in train_report:
    print(label, score)
print('---------------------------------------------')
for label, score in test_report:
    print(label, score)
支持向量机模型准确率: 0.5707317073170731
逻辑模型准确率: 0.5658536585365853
神经网络模型准确率: 0.5073170731707317
---------------------------------------------
支持向量机模型预测准确率: 0.7666666666666667
逻辑模型预测准确率: 0.7666666666666667
神经网络模型预测准确率: 0.8
7.7 基于主成分聚类的上市公司盈利能力分析
7.7.1 数据获取
# Load the financial-indicator workbook into a DataFrame.
data = pd.read_excel('财务指标数据.xlsx')
data.head(6)  # preview the first 6 rows
|   | Stkcd | Accper | F050502B | F050102B | F050202B | F051201B | F051501B | F053301B | F051401B | F052101B |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 667 | 2015-12-31 | 0.072496 | 0.030630 | 0.025459 | 0.045300 | 0.090794 | 0.287946 | 0.125722 | 0.160675 |
| 1 | 838 | 2015-12-31 | 0.075536 | 0.020626 | 0.019593 | 0.025699 | 0.116062 | 0.348287 | 0.066612 | 0.112519 |
| 2 | 600816 | 2015-12-31 | 0.424511 | 0.284350 | 0.284350 | 0.188029 | 0.666531 | NaN | 0.892360 | NaN |
| 3 | 600358 | 2015-12-31 | 0.042172 | 0.047514 | 0.017612 | 0.051383 | 0.145928 | 0.867484 | 0.393601 | 0.241220 |
| 4 | 601155 | 2015-12-31 | 0.183725 | 0.042560 | 0.040358 | 0.093632 | 0.101813 | 0.268085 | 0.138837 | 0.172821 |
| 5 | 2231 | 2015-12-31 | 0.007149 | 0.005577 | 0.005543 | 0.006671 | 0.012131 | 0.251830 | 0.003421 | 0.021860 |
7.7.2 数据清理
# Keep column 0 (stock code) and columns 2-9 (the indicators); column 1 is left out.
wanted_cols = [0] + list(range(2, 10))
data2 = data.iloc[:, wanted_cols]

# Entries that are not strictly positive become NaN, and any row containing a
# NaN is then discarded.
positive_only = data2[data2 > 0]
data2 = positive_only.dropna().values  # continue with a plain NumPy array
print(data2)
[[6.670000e+02 7.249600e-02 3.063000e-02 ... 2.879460e-01 1.257220e-01
1.606750e-01]
[8.380000e+02 7.553600e-02 2.062600e-02 ... 3.482870e-01 6.661200e-02
1.125190e-01]
[6.003580e+05 4.217200e-02 4.751400e-02 ... 8.674840e-01 3.936010e-01
2.412200e-01]
...
[6.100000e+01 7.233000e-03 1.635800e-02 ... 4.855560e-01 2.796000e-03
8.079900e-02]
[4.020000e+02 8.117500e-02 3.283800e-02 ... 3.005470e-01 2.048840e-01
2.474820e-01]
[2.000540e+05 1.753463e+00 1.647230e-01 ... 6.664500e-02 1.749230e-01
1.550150e-01]]
# Outlier removal: for each indicator column (1-8), drop every row whose value
# is at least 8 times that column's mean. The mean is recomputed on the rows
# that survived the filtering of the previous columns.
for i in range(1, 9):
    data2 = data2[data2[:, i] < 8 * np.mean(data2[:, i]), :]

# Keep only the companies classified under the '计算机' (computer) industry.
dta = pd.read_excel('申万行业分类.xlsx')
stkcd = dta.loc[dta['行业名称'].values == '计算机', '股票代码'].values
s = data2[:, 0]          # stock codes of the cleaned sample
I = np.isin(s, stkcd)    # True where the row's stock code is in the industry list
ddata = data2[I, :]
X = ddata[:, 1:]         # indicator columns only, stock code dropped

# Min-max scale each indicator column.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
7.7.3 主成分分析
from sklearn.decomposition import PCA

# Keep as many principal components as are needed to reach a cumulative
# contribution rate of 95%.
pca = PCA(n_components=0.95)
Y = pca.fit_transform(X)  # principal-component scores

# Eigenvectors, eigenvalues and per-component share of variance (contribution rate).
tzxl, tz, gxl = pca.components_, pca.explained_variance_, pca.explained_variance_ratio_

# Rescale the component scores with a fresh min-max scaler.
scaler = MinMaxScaler()
Y = scaler.fit_transform(Y)
print(Y)
[[1. 0.35271541 0. ]
[0.39016096 0.53879807 0.57018667]
[0.30010244 0.39406656 0.56879495]
[0.27039046 0.2964222 0.46070532]
[0.48672129 0.35397777 0.38666498]
[0.21309201 0.25788551 0.45081498]
[0.27912283 0.54280988 0.51318843]
[0.1572944 0.50142123 0.36082199]
[0.19677837 0.3616009 0.48022782]
[0.38521072 0.49713145 0.72720359]
[0.14710215 0.39917128 0.39988137]
[0.16162013 0.48137942 0.3416155 ]
[0.52232947 0.52497807 0.41118528]
[0.45481913 0.48211103 0.61319047]
[0.02613532 0.40745569 0.46097522]
[0.19409062 0.62824552 0.47880802]
[0.21853564 0.53107475 0.459496 ]
[0.15451249 0.2584657 0.27652781]
[0.08716958 0.45388591 0.41742003]
[0.36001074 0.16638344 0.0750454 ]
[0.25176958 0.3232513 0.30290889]
[0.56960641 0.59926141 0.6247297 ]
[0.1351583 0.29402457 0.53914207]
[0.27232579 0.28188235 0.72458263]
[0.16228433 0.61384212 0.46062233]
[0.2289046 0.66079344 0.48278713]
[0.398054 0.48727234 0.4361368 ]
[0.30570314 0.57964964 0.67379527]
[0.42098929 0. 0.42194875]
[0.12282552 0.31680236 0.39509641]
[0.11417217 0.50357034 0.40670324]
[0.4014705 0.64036035 0.47896145]
[0.25641914 0.75649382 0.50332891]
[0.46193257 0.74745698 0.55891119]
[0.19872465 0.42003761 0.42507615]
[0.06461358 0.49796433 0.41328105]
[0.13887415 0.19118031 0.59227063]
[0.33953678 0.20816981 0.53728959]
[0.28806008 0.5661635 0.37056341]
[0.58050128 0.36931633 1. ]
[0.31451271 0.1343195 0.46261752]
[0.23753148 0.36934836 0.23481502]
[0.03163158 0.32978406 0.49743078]
[0. 0.50050413 0.24093191]
[0.37779225 0.54069871 0.56592613]
[0.28105514 0.40015422 0.49895377]
[0.18592627 0.25201196 0.85142195]
[0.11816703 0.245758 0.58946425]
[0.52577689 1. 0.61892863]
[0.2766132 0.40461408 0.61805954]
[0.36167024 0.13410407 0.99959069]
[0.24300298 0.33816989 0.36047089]
[0.35341 0.41121232 0.35831831]
[0.27428105 0.57993231 0.62323712]
[0.31181231 0.3417833 0.58027724]
[0.2306254 0.46532333 0.49240407]
[0.31088556 0.63316711 0.38927324]
[0.27945894 0.34491699 0.46414593]
[0.1415173 0.33861322 0.55011358]
[0.42270587 0.40673737 0.32336965]
[0.17331604 0.48086027 0.32196112]
[0.39871826 0.65131775 0.61251796]
[0.19795737 0.44479349 0.57140924]
[0.18988548 0.27265764 0.59527533]
[0.64756809 0.26874946 0.20169429]
[0.20789073 0.60402229 0.43318674]
[0.24534499 0.47924495 0.42494554]
[0.21588013 0.62823678 0.47726656]
[0.29152982 0.56142069 0.57746981]
[0.23964301 0.3610168 0.31973727]
[0.14006785 0.38229718 0.51898427]
[0.24160422 0.39956274 0.30514966]
[0.43254606 0.43522082 0.85266599]
[0.10428289 0.42424925 0.51831267]
[0.18135159 0.41260571 0.32491939]
[0.29927567 0.37596953 0.36773508]
[0.28565189 0.15749429 0.48514758]
[0.35741342 0.35168311 0.47948626]
[0.21023831 0.13466273 0.79021532]
[0.27476166 0.52728366 0.42556418]
[0.43864979 0.38382313 0.34667911]
[0.48912006 0.51053069 0.25666985]
[0.5792899 0.48315188 0.47186856]
[0.08946935 0.60192197 0.34586447]
[0.08836171 0.24211162 0.58207393]
[0.43032257 0.56941608 0.54411348]
[0.39972691 0.51887879 0.3129126 ]
[0.45183958 0.14529451 0.38886917]
[0.22710544 0.30796581 0.44015714]
[0.38234097 0.27028947 0.58782501]
[0.37808409 0.29362605 0.30143069]
[0.16424462 0.53304101 0.46728502]
[0.3915905 0.52237116 0.49260511]
[0.15294694 0.41858487 0.43664275]
[0.42325175 0.03416457 0.56357409]
[0.09925005 0.33448112 0.58506441]
[0.38732987 0.00316802 0.12895581]
[0.44630218 0.56498237 0.35338581]
[0.24619225 0.27918856 0.61966313]
[0.18725085 0.23401407 0.49428424]
[0.26648399 0.3242053 0.57301473]
[0.25558567 0.35664406 0.51600676]
[0.24587741 0.15197482 0.56010903]
[0.43851468 0.0645047 0.40736009]
[0.31288722 0.34411938 0.37381343]
[0.20092546 0.37832537 0.43608698]
[0.2744214 0.43821201 0.46568662]
[0.23056008 0.23317136 0.66354976]
[0.20773999 0.49743118 0.43396559]
[0.29313224 0.48525078 0.49252522]
[0.2341254 0.3597684 0.37329531]
[0.05266023 0.50083512 0.39115158]
[0.32669163 0.84759859 0.60215241]
[0.30555832 0.34529031 0.25144697]
[0.37541006 0.4471835 0.41326947]
[0.50835938 0.23696001 0.68037798]
[0.47968784 0.31902553 0.30531284]
[0.47824868 0.50241261 0.53507055]
[0.24190473 0.51344189 0.4013775 ]
[0.09110503 0.27711092 0.53113462]
[0.31704437 0.4750707 0.28918813]
[0.17131301 0.41297434 0.35863972]
[0.46122029 0.28651035 0.76438713]
[0.21306854 0.57312665 0.5193129 ]
[0.31135553 0.35684707 0.46800267]
[0.21752449 0.70726552 0.49147669]
[0.27467357 0.6547978 0.47292997]
[0.5016703 0.42772846 0.51741645]
[0.05422398 0.40111387 0.23208408]
[0.11352917 0.66592656 0.35955919]]
7.7.4 K-Means聚类
from sklearn.cluster import KMeans

# Cluster the scaled principal-component scores into 5 groups.
model = KMeans(n_clusters=5, random_state=0, max_iter=1000)
model.fit(Y)
c = model.labels_
center = pd.DataFrame(model.cluster_centers_)
center.columns = ['Y1', 'Y2', 'Y3']

# Map each stock code to its cluster label, ordered by label.
Fs = pd.Series(c, index=ddata[:, 0]).sort_values()

# Company name looked up by stock code.
co = pd.read_excel('公司基本信息表.xlsx')
co1 = pd.Series(co['Stknme'].values, index=co['Stkcd'].values)

# Write one workbook per cluster (c0.xlsx ... c4.xlsx) listing its companies.
for i in range(5):
    codes_in_cluster = Fs[Fs == i].index
    q = pd.DataFrame(co1[codes_in_cluster])
    q.to_excel('c' + str(i) + '.xlsx')
7.7.5 计算每类公司总利润平均增长率
rd = pd.read_excel('利润数据.xlsx')

# The two year-end row masks do not depend on the cluster or the stock, so
# they are built once here rather than once per stock.
is_2014 = rd['Accper'].values == '2014-12-31'
is_2015 = rd['Accper'].values == '2015-12-31'
stock_codes = rd['Stkcd'].values
profit = rd['B002000101'].values  # profit figure used for the growth rate

r_c = []
for n in range(5):
    cn = list(Fs[Fs == n].index)  # stock codes assigned to cluster n
    r1_n = 0.0  # cluster total for 2014-12-31
    r2_n = 0.0  # cluster total for 2015-12-31
    for t in cn:
        is_stock = stock_codes == t
        # sum() yields 0.0 when a stock has no row for that date and stays a
        # scalar even if a stock has several matching rows.
        r1_n += profit[is_2014 & is_stock].sum()
        r2_n += profit[is_2015 & is_stock].sum()
    p1 = r1_n / len(cn)  # 2014 total divided by the cluster size
    p2 = r2_n / len(cn)  # 2015 total divided by the cluster size
    r_c.append((p2 - p1) / p1)  # growth rate from 2014 to 2015

# One growth rate per cluster, as a column so it can sit next to the centers.
r_c = np.array(r_c).reshape(-1, 1)
dt = np.hstack((center.values, r_c))
dtt = pd.DataFrame(dt)
dtt.columns = ['Y1', 'Y2', 'Y3', 'r_c']
print(dtt)
Y1 Y2 Y3 r_c
0 0.229527 0.289527 0.537123 0.055348
1 0.492402 0.251426 0.272615 1.358886
2 0.196778 0.450577 0.376927 0.239998
3 0.367587 0.600588 0.529996 0.340677
4 0.376598 0.266334 0.832905 -0.160602