流程:先按 AF 过滤,再根据 LD 值选出 tagSNP;然后将最终变异文件中的基因型(0|0、0|1、1|2 等)按有无变异转换为 0/1 特征向量,计算距离并存入矩阵 dist,最后用 t-SNE 降维。
from numpy import *
from scipy.spatial.distance import pdist,squareform
import matplotlib.pyplot as plt
from sklearn import manifold
from matplotlib.ticker import NullFormatter
def loadDataSet(fileName, delim=','):
    """Load a delimited text file of numbers into a NumPy matrix.

    Each non-blank line is stripped, split on ``delim`` and every field is
    converted with ``float``.

    Args:
        fileName: Path of the text file to read.
        delim: Field separator used on each line (default ``','``).

    Returns:
        A ``numpy.matrix`` with one row per non-blank input line.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    # The context manager closes the handle even if parsing fails.
    with open(fileName) as fr:
        datArr = [
            [float(field) for field in line.split(delim)]
            for line in (raw.strip() for raw in fr)
            # Skip blank lines (e.g. a trailing newline at EOF); float('')
            # would otherwise raise ValueError.
            if line
        ]
    # asmatrix is the function the old `mat` alias pointed to; the alias was
    # removed in NumPy 2.0, so call it by its real name.
    return asmatrix(datArr)
# Load the 0/1 genotype feature matrix (the path is machine-specific).
dataMat = loadDataSet('I:/1000GenomeProject/data_genometype/chr22_new_gt.txt')
# Keep the first 10000 rows (described in the original notes as "the first
# 10000 sites").
# NOTE(review): pdist compares ROWS, so every row becomes one embedded point;
# confirm the rows are the entities you want to embed.
y = asarray(dataMat[:10000])

# Pairwise Hamming distances between rows; pdist returns them in condensed
# (1-D) form.
dist = pdist(y, metric='hamming')
# TSNE.fit_transform needs a 2-D input, so expand the condensed vector into the
# full square distance matrix (n x n, which is large for n = 10000).
dist_sq = squareform(dist)

# t-SNE dimensionality reduction and visualisation.
# Dimension of the embedded space: reduce the data to 2-D.
n_components = 2
# Fit the model on the precomputed distances. init='pca' is not allowed with
# metric='precomputed', hence the random initialisation (seeded for
# reproducibility).
method = manifold.TSNE(
    n_components=n_components,
    metric='precomputed',
    init='random',
    random_state=0,
)
Y = method.fit_transform(dist_sq)

fig = plt.figure()
ax = fig.add_subplot(111)
# No per-point colour values exist here, so no colormap is passed (a cmap
# without `c` is ignored by scatter).
ax.scatter(Y[:, 0], Y[:, 1])
ax.xaxis.set_major_formatter(NullFormatter())  # hide x-axis tick labels
ax.yaxis.set_major_formatter(NullFormatter())  # hide y-axis tick labels
ax.axis('tight')
plt.show()
效果并不好。计划进一步缩小 AF 取值范围、增大 LD 窗口,然后每条染色体随机取 1000 个位点;建议用随机森林处理。
待续…



