一.原始数据的下载
1.下载测序结果
2.下载参考的基因组和注释
https://urgi.versailles.inra.fr/download/iwgsc/IWGSC_RefSeq_Annotations/v1.0/iwgsc_refseqv1.0_HighConf_2017Mar13.gff3.zip
https://urgi.versailles.inra.fr/download/iwgsc/IWGSC_RefSeq_Annotations/v1.0/iwgsc_refseqv1.0_HighConf_PROTEIN_2017Mar13.fa.zip
二.对数据进行质检fastqc
1.
fastqc /public/home/wangyiwei/new_wheat/data/*.fq.gz
2.对质检结果进行汇总
multiqc ./
3.对数据进行过滤
for i in NG_1 NG_2 NG_3 NY_1 NY_2 NY_4 WG_2 WG_3 WY_1 WY_2 WY_3 do trimmomatic PE -threads 4 /public/home/wangyiwei/wheat/N_stress_RNA-seq/"$i"_1.clean.fq.gz /public/home/wangyiwei/wheat/N_stress_RNA-seq/"$i"_2.clean.fq.gz /public/home/wangyiwei/wheat/clean/"$i"_1_paired.clean.fq.gz /public/home/wangyiwei/wheat/clean/"$i"_1_unpaired.clean.fq.gz /public/home/wangyiwei/wheat/clean/"$i"_2_paired.clean.fq.gz /public/home/wangyiwei/wheat/clean/"$i"_2_unpaired.clean.fq.gz ILLUMINACLIP:/public/home/wangyiwei/miniconda3/share/trimmomatic-0.39-2/adapters/TruSeq3-PE-2.fa:2:30:10 LEADING:3 TRAILING:3 HEADCROP:10 SLIDINGWINDOW:4:20 MINLEN:36 done
三.hisat2
1.建立索引
hisat2-build /public/home/wangyiwei/wheat/ref/iwgsc_refseqv1.0_all_chromosomes/161010_Chinese_Spring_v1.0_pseudomolecules.fasta /public/home/wangyiwei/wheat/wheat_hisat2_index/wheat_v1
2.mapping
mapping 时要添加 --dta 生成stringtie需要的文件
# hisat2 mapping for i in NG NY WG WY do for j in 1 2 3 4 do hisat2 -t -p 20 --dta -x /public/home/wangyiwei/wheat/wheat_hisat2_index/wheat_v1 -1 /public/home/wangyiwei/wheat/clean/"$i"_"$j"_1_paired.clean.fq.gz -2 /public/home/wangyiwei/wheat/clean/"$i"_"$j"_2_paired.clean.fq.gz -S /public/home/wangyiwei/wheat2/mapping/"$i"_"$j".sam done done
3.对生成的sam文件进行转换和排序
#samtools for i in NG NY WG WY do for j in 1 2 3 4 do samtools view -S /public/home/wangyiwei/wheat2/mapping/"$i"_"$j".sam -b > /public/home/wangyiwei/wheat2/samtools/"$i"_"$j".bam samtools sort /public/home/wangyiwei/wheat2/samtools/"$i"_"$j".bam -o /public/home/wangyiwei/wheat2/samtools/"$i"_"$j"_sorted.bam samtools index -c /public/home/wangyiwei/wheat2/samtools/"$i"_"$j"_sorted.bam samtools flagstat /public/home/wangyiwei/wheat2/samtools/"$i"_"$j"_sorted.bam > /public/home/wangyiwei/wheat2/flagstat/"$i"_"$j".bam.flagstat done done
4. 对bam文件进行过滤质检,删除重复
#sambamba for i in NG NY WG WY do for j in 1 2 3 4 do sambamba markdup -r -t 8 /public/home/wangyiwei/wheat2/samtools/"$i"_"$j"_sorted.bam /public/home/wangyiwei/wheat2/sambamba/"$i"_"$j"_sorted_sambamba.bam done done
三. strintie
1.不生成新的转录本
#stringtie for i in NG NY WG WY do for j in 1 2 3 4 do stringtie -p 8 -G /public/home/wangyiwei/wheat/ref/iwgsc_refseqv1.0_HighConf_2017Mar13.gtf -A /public/home/wangyiwei/wheat2/stringtie/"$i"_"$j"/gene_abundances.tsv -o /public/home/wangyiwei/wheat2/stringtie/"$i"_"$j"/"$i"_"$j".gtf -B -e /public/home/wangyiwei/wheat2/sambamba/"$i"_"$j"_sorted_sambamba.bam done done
四,deseq2
1.生成需要的counts文件
需要使用一个 prepDE.py 脚本,在 python2/3 环境中使用,下载地址如下:
-
prepDE.py (python2) : http://ccb.jhu.edu/software/stringtie/dl/prepDE.py
-
prepDE.py (python3) : http://ccb.jhu.edu/software/stringtie/dl/prepDE.py3
prepDE.py 需要一个 sample_list,第一列为样本名,第二列为 gtf 文件路径
有可能会报错没有权限,授权就好
/public/home/wangyiwei/new_wheat/deseq/prepDE.py3 -i /public/home/wangyiwei/wheat2/deseq/G/Gsample_list.txt -g gene_count.csv -t transcript.csv
结束后在当前目录生成gene_count_matrix.csv和transcript_count_matrix.csv文件
2,筛选出fpkm
3.选出fpkm-------------------------------------------------------------------------------------------
library(tidyverse)
setwd("/public/home/wangyiwei/wheat2/deseq/G")
gtf1=read.table("/public/home/wangyiwei/wheat2/stringtie/NG_1/gene_abundances.tsv",sep="t")
gtf2=read.table("/public/home/wangyiwei/wheat2/stringtie/NG_2/gene_abundances.tsv",sep="t")
gtf3=read.table("/public/home/wangyiwei/wheat2/stringtie/NG_3/gene_abundances.tsv",sep="t")
gtf4=read.table("/public/home/wangyiwei/wheat2/stringtie/WG_2/gene_abundances.tsv",sep="t")
gtf5=read.table("/public/home/wangyiwei/wheat2/stringtie/WG_3/gene_abundances.tsv",sep="t")
gtf1 = arrange(gtf1,V1)
gtf2 = arrange(gtf2,V1)
gtf3 = arrange(gtf3,V1)
gtf4 = arrange(gtf4,V1)
gtf5 = arrange(gtf5,V1)
nrow(gtf1)
nrow(gtf2)
nrow(gtf3)
nrow(gtf4)
nrow(gtf5)
fpkm=data.frame(gene_name=gtf1$V1,
FPKM.NG_1=gtf1$V8,
FPKM.NG_2=gtf2$V8,
FPKM.NG_3=gtf3$V8,
FPKM.WG_2=gtf4$V8,
FPKM.WG_3=gtf5$V8
)
sfpkm=fpkm[-1,]
write.csv(sfpkm,"gene_fpkm.csv")
3.生成差异分析结果
BiocManager::install('DESeq2')
# 加载包
library(DESeq2)
setwd("/public/home/wangyiwei/wheat2/deseq/G")
database=read.csv("gene_count.csv",header = T,row.names = 1)
head(database)
condition<-factor(c('NG','NG','NG','WG','WG'))
coldata<-data.frame(row.names=colnames(database),condition)
head(coldata)
dds <- DESeqDataSetFromMatrix(countData=database, colData=coldata, design=~condition)
dds <- DESeq(dds)
res <- results(dds)
res <- res[order(res$pvalue),]
diff_res <- as.data.frame(res)
diff_res$gene_name <- rownames(diff_res)
write.table(diff_res,file = 'DESeq2_diff_results.csv',quote = F,sep = ',',row.names = F,col.names = T)
4.给输出文件添加fpkm count 注释
setwd("/public/home/wangyiwei/wheat2/deseq/G")
d = read.csv("/public/home/wangyiwei/wheat2/deseq/G/DESeq2_diff_results.csv")
d = arrange(d,gene_name)
id=read.csv("gene_fpkm.csv")
join=left_join(d,id,by = "gene_name")
write.csv(join,"add_fpkm.csv")
setwd("/public/home/wangyiwei/wheat2/deseq/G")
d = read.csv("add_fpkm.csv")
d = arrange(d,gene_name)
count=read.csv("gene_count.csv")
count = arrange(count,gene_id)
colnames(count)[1] = 'gene_name'
join=left_join(d,count,by = "gene_name")
write.csv(join,"deseq_fpkm_count.csv")
setwd("/public/home/wangyiwei/wheat2/deseq/G")
d=read.csv("deseq_fpkm_count.csv")
nrow(d)
id=read.csv("/public/home/wangyiwei/wheat2/deseq/gene_iwgsc_refseqv1.0_functionalannotation.txt",sep="t")
colnames(id)[2] <- "gene_name"
join=left_join(d,id,by = "gene_name")
write.csv(join,"DESeq2_all_addfunction.csv")
五.图
1.划分上下游,画火山图
#划分上游下游------------------------------------------------------------------------
library(tidyverse)
setwd("E:/360MoveData/Users/DELL/Desktop/text/leaf")
data=read.csv("fpkm1.csv")
data=data.frame(data)
head(data)
cut_off_pvalue = 0.05
cut_off_logFC = 1
data$change=as.factor(ifelse(data$pvalue= cut_off_logFC,
ifelse(data$log2FoldChange >= cut_off_logFC,'Up','Down'),
'Stable'))
up_data = subset(data,data$change=="Up")
head(up_data)
write.csv(up_data,"up_data.csv")
down_data = subset(data,data$change=="Down")
write.csv(down_data,"down_data.csv")
head(down_data)
#画火山图------------------------------------------------------------------------------------
ggplot(
#设置数据
data,
aes(x = log2FoldChange,
y = (-log10(pvalue)),
colour=change)) +
geom_point(alpha=2, size=2) +
#scale_color_manual(values=c("#D6604D", "#d2dae2","#4393C3"))+
# 辅助线
geom_vline(xintercept=c(-1,1),lty=2,col="black",lwd=0.8) +
geom_hline(yintercept = c(-log10(0.05)),lty=2,col="black",lwd=0.8) +
# 坐标轴
labs(x="log2(fold change)",
y="-log10 (p-value)")+
theme_bw()+
# 图例
theme(plot.title = element_text(hjust = 0.5),
legend.position="right",
legend.title = element_blank()
)
2.GO分析
把功能注释的文件进行划分
#https://www.jianshu.com/p/2cb4c83f8141
setwd("E:/360MoveData/Users/DELL/Desktop/text/GO")
library(tidyr)
library(stringr)
Func_Anno<-read.table("iwgsc_refseqv1.0_FunctionalAnnotation_v1__HCgenes_v1.0.TAB",sep="t",quote = "",header=TRUE)
GO_Gene<-Func_Anno[,c(1,8)]
#去掉空值行(基因对应GO id为空)
GO_Gene<-GO_Gene[-which(GO_Gene$GO.IDs..Description..via.Interpro==""),]
#GeneID<-sub('..$','',GO_Gene[,1])
GeneID<-data.frame(GO_Gene[,1])
names(GeneID) <- c("GeneID")
head(GeneID)
GO_Gene$Gene.ID<-GeneID
head(GO_Gene)
#拆分第二列数据
GO_Sep<-separate(GO_Gene,col=GO.IDs..Description..via.Interpro,sep =';',remove = TRUE,into=as.character(c(1:20)))
#检查GO是否完全分开(没有增加前面的into范围)
which(!is.na(GO_Sep[,21])) #判断最后一列是否有非NA值
sum(is.na(GO_Sep[,21])) #判断最后一列为NA值行数是否与矩阵行一样
GO_Sep_Matr<-data.frame(matrix(NA,300000,2))
k=1
for(i in 1:nrow(GO_Sep)){
for(j in 2:ncol(GO_Sep)){
if(is.na(GO_Sep[i,j])==FALSE){
GO_Sep_Matr[k,1]<-GO_Sep[i,1]
GO_Sep_Matr[k,2]<-GO_Sep[i,j]
k=k+1
}
}
}
GO_Sep_Matr<-GO_Sep_Matr[-which(apply(GO_Sep_Matr,1,function(x) all(is.na(x)))),]
Extract<-str_split_fixed(GO_Sep_Matr[,2]," ",n=2)
GO_Info<-cbind(GO_Sep_Matr[,1],Extract[,1:2])
#去重复
GO_Info<-unique(GO_Info)
colnames(GO_Info)<-c("GeneID","GO","Function")
GO_Gene<-GO_Info[,1:2]
write.table(GO_Gene,"GO_Gene.txt",col.names = TRUE,row.names = FALSE,quote=FALSE,sep="t")
write.table(GO_Info,"GO_Info.txt",col.names = TRUE,row.names = FALSE,quote=FALSE,sep="t")
删除基因名字后的.1
GO_info <- read.table('GGO_Info.txt',sep = 't',header = TRUE,quote = "")
GO_info[, c("GeneID", "x")] <- str_split_fixed(GO_info$GeneID, "[.]", 2)
GO_info <- subset (GO_info, select = -x)
write.csv(GO_info,"genename_GGO_Info.csv")
把文件按照类型不同划分
GO_info <- read.csv('./gene/genename_GGO_Info.csv')
head(GO_info)
features = separate(data = GO_info, col = Function, into = c("feature", "Function"), sep = ":")
head(features)
write.table(features, file = "genename_features.txt", quote = F,sep="t",row.names = F)
CC_info<-GO_info[grep(pattern = "CC",GO_info[,3]),]
head(CC_info)
write.table(CC_info, file = "CC_gene_info.txt", quote = F,sep="t",row.names = F)
CC_info2<-subset(features,features$feature=="CC")
head(CC_info2)
write.table(CC_info2, file = "CC_gene_feature_info.txt", quote = F,sep="t",row.names = F)
MF_info<-GO_info[grep(pattern = "MF",GO_info[,3]),]
head(MF_info)
write.table(MF_info, file = "MF_gene_info.txt", quote = F,sep="t",row.names = F)
MF_info2<-subset(features,features$feature=="MF")
head(MF_info2)
write.table(MF_info2, file = "MF_gene_feature_info.txt", quote = F,sep="t",row.names = F)
BP_info<-GO_info[grep(pattern = "BP",GO_info[,3]),]
head(BP_info)
write.table(BP_info, file = "BP_gene_info.txt", quote = F,sep="t",row.names = F)
BP_info2<-subset(features,features$feature=="BP")
head(BP_info2)
write.table(BP_info2, file = "BP_gene_feature_info.txt", quote = F,sep="t",row.names = F)
GO分析,三种单独绘图
setwd("E:/360MoveData/Users/DELL/Desktop/text/root/new_up")
library("clusterProfiler")
gene_list = read.csv("up_data.csv")
head(gene_list)
gene_list = subset(gene_list)
gene <- as.character(gene_list$gene_name)
head(gene)
GO_info <- read.csv('E:/360MoveData/Users/DELL/Desktop/text/GO_info/gene/genename_GGO_Info.csv')
head(GO_info)
term2gene = data.frame(GO_info$GO,GO_info$GeneID)
names(term2gene) <- c("GO", "GeneID")
head(term2gene)
term2name = data.frame(GO_info$GO,GO_info$Function)
names(term2name) <- c("GO", "Function")
head(term2name)
x <- enricher(gene,TERM2GENE=term2gene,TERM2NAME=term2name)
head(x)
x_result<-as.data.frame(x)
x_result = arrange(x_result,-Count)
head(x_result)
write.table(x_result, file = "allall.txt", quote = F,sep = 't')
GO_info <- read.table('E:/360MoveData/Users/DELL/Desktop/text/GO_info/gene/BP_gene_info.txt',sep = 't',header = TRUE,quote = "")
head(GO_info)
term2gene = data.frame(GO_info$GO,GO_info$GeneID)
names(term2gene) <- c("GO", "GeneID")
head(term2gene)
term2name = data.frame(GO_info$GO,GO_info$Function)
names(term2name) <- c("GO", "Function")
head(term2name)
x <- enricher(gene,TERM2GENE=term2gene,TERM2NAME=term2name)
head(x)
x_result<-as.data.frame(x)
x_result = arrange(x_result,-Count)
head(x_result)
write.table(x_result, file = "BP_go_results.txt", quote = F,sep = 't')
pdf("BP_GO.pdf")
dotplot(x)
dev.off()
GO_info <- read.table('E:/360MoveData/Users/DELL/Desktop/text/GO_info/gene/CC_gene_info.txt',sep = 't',header = TRUE,quote = "")
head(GO_info)
term2gene = data.frame(GO_info$GO,GO_info$GeneID)
names(term2gene) <- c("GO", "GeneID")
head(term2gene)
term2name = data.frame(GO_info$GO,GO_info$Function)
names(term2name) <- c("GO", "Function")
head(term2name)
x <- enricher(gene,TERM2GENE=term2gene,TERM2NAME=term2name)
head(x)
x_result<-as.data.frame(x)
x_result = arrange(x_result,-Count)
head(x_result)
write.table(x_result, file = "CC_go_results.txt", quote = F,sep = 't')
pdf("CC_GO.pdf")
dotplot(x)
dev.off()
GO_info <- read.table('E:/360MoveData/Users/DELL/Desktop/text/GO_info/gene/MF_gene_info.txt',sep = 't',header = TRUE,quote = "")
head(GO_info)
term2gene = data.frame(GO_info$GO,GO_info$GeneID)
names(term2gene) <- c("GO", "GeneID")
head(term2gene)
term2name = data.frame(GO_info$GO,GO_info$Function)
names(term2name) <- c("GO", "Function")
head(term2name)
x <- enricher(gene,TERM2GENE=term2gene,TERM2NAME=term2name)
head(x)
x_result<-as.data.frame(x)
x_result = arrange(x_result,-Count)
head(x_result)
write.table(x_result, file = "MF_go_results.txt", quote = F,sep = 't')
pdf("MF_GO.pdf")
dotplot(x)
dev.off()
三种类型分别绘图,每种的前十个
setwd("E:/360MoveData/Users/DELL/Desktop/text/leaf/new_up")
library(RSkittleBrewer)
library(genefilter)
library(dplyr)
library(devtools)
library(tidyverse)
y=read.table("all.txt",sep = "t",header = TRUE,quote = "")
## 分别后去分号前面和后面的数,并变成数值
forward <- as.numeric(sub("/\d+$", "", y$GeneRatio))
head(forward)
backward <- as.numeric(sub("^\d+/", "", y$GeneRatio))
head(backward)
## 增加数值表示的一列GeneRatio
y$GeneRatio = forward/backward
#预先设定一个筛选阈值,用于控制结果展示的数目,按照p值排序,选取前10个
showCategory =30
font.size =12
pdf("3_all.pdf")
y %>%
## 安装p值排序,选区既定数目的行
arrange(p.adjust) %>%
slice(1:showCategory) %>%
## 开始ggplot2 作图,其中fct_reorder调整因子level的顺序
ggplot(aes(GeneRatio,forcats::fct_reorder(Description,Count)))+
## 画出点图
geom_point(aes(color=p.adjust, size = Count)) +
## 调整颜色,guide_colorbar调整色图的方向
scale_color_continuous(low="red", high="blue", guide=guide_colorbar(reverse=TRUE))+
## 调整泡泡的大小
scale_size_continuous(range=c(3, 8))+
## 如果用ylab("")或出现左侧空白
labs(y=NULL) +
## 如果没有这一句,上方会到顶
ggtitle("")+
scale_y_discrete(labels = function(y) str_wrap(y, width = 40) )+
## 设定主题
theme_bw() +
theme(axis.text.x = element_text(colour = "black",
size = font.size, vjust =1 ),
axis.text.y = element_text(colour = "black",
size = font.size, hjust =1 ),
axis.title = element_text(margin=margin(10, 5, 0, 0),
color = "black",size = font.size),
axis.title.y = element_text(angle=90))
dev.off()



