RNA_Java

一.原始数据的下载

1.下载测序结果

2.下载参考的基因组和注释

https://urgi.versailles.inra.fr/download/iwgsc/IWGSC_RefSeq_Annotations/v1.0/iwgsc_refseqv1.0_HighConf_2017Mar13.gff3.zip

https://urgi.versailles.inra.fr/download/iwgsc/IWGSC_RefSeq_Annotations/v1.0/iwgsc_refseqv1.0_HighConf_PROTEIN_2017Mar13.fa.zip

二.对数据进行质检fastqc

fastqc  /public/home/wangyiwei/new_wheat/data/*.fq.gz

2.对质检结果进行汇总

multiqc ./

3.对数据进行过滤

for i in NG_1 NG_2 NG_3 NY_1 NY_2 NY_4 WG_2 WG_3 WY_1 WY_2 WY_3
do
trimmomatic PE -threads 4 
/public/home/wangyiwei/wheat/N_stress_RNA-seq/"$i"_1.clean.fq.gz 
/public/home/wangyiwei/wheat/N_stress_RNA-seq/"$i"_2.clean.fq.gz 
/public/home/wangyiwei/wheat/clean/"$i"_1_paired.clean.fq.gz 
/public/home/wangyiwei/wheat/clean/"$i"_1_unpaired.clean.fq.gz 
/public/home/wangyiwei/wheat/clean/"$i"_2_paired.clean.fq.gz 
/public/home/wangyiwei/wheat/clean/"$i"_2_unpaired.clean.fq.gz 
ILLUMINACLIP:/public/home/wangyiwei/miniconda3/share/trimmomatic-0.39-2/adapters/TruSeq3-PE-2.fa:2:30:10 
LEADING:3 TRAILING:3 HEADCROP:10 
SLIDINGWINDOW:4:20 MINLEN:36
done

三.hisat2

1.建立索引

hisat2-build /public/home/wangyiwei/wheat/ref/iwgsc_refseqv1.0_all_chromosomes/161010_Chinese_Spring_v1.0_pseudomolecules.fasta /public/home/wangyiwei/wheat/wheat_hisat2_index/wheat_v1

2.mapping

mapping 时要添加 --dta 生成stringtie需要的文件

# hisat2 mapping

for i in NG NY WG WY
do
for j in 1 2 3 4
do
hisat2 -t -p 20  --dta -x /public/home/wangyiwei/wheat/wheat_hisat2_index/wheat_v1 
-1 /public/home/wangyiwei/wheat/clean/"$i"_"$j"_1_paired.clean.fq.gz 
-2 /public/home/wangyiwei/wheat/clean/"$i"_"$j"_2_paired.clean.fq.gz 
-S /public/home/wangyiwei/wheat2/mapping/"$i"_"$j".sam
done
done

3.对生成的sam文件进行转换和排序

#samtools

for i in NG NY WG WY
do
for j in 1 2 3 4
do

samtools view -S /public/home/wangyiwei/wheat2/mapping/"$i"_"$j".sam -b > /public/home/wangyiwei/wheat2/samtools/"$i"_"$j".bam
samtools sort /public/home/wangyiwei/wheat2/samtools/"$i"_"$j".bam -o /public/home/wangyiwei/wheat2/samtools/"$i"_"$j"_sorted.bam
samtools index -c /public/home/wangyiwei/wheat2/samtools/"$i"_"$j"_sorted.bam
samtools flagstat /public/home/wangyiwei/wheat2/samtools/"$i"_"$j"_sorted.bam > /public/home/wangyiwei/wheat2/flagstat/"$i"_"$j".bam.flagstat
done
done

4. 对bam文件进行过滤质检，删除重复

#sambamba
for i in NG NY WG WY
do
for j in 1 2 3 4
do
sambamba markdup -r -t 8  /public/home/wangyiwei/wheat2/samtools/"$i"_"$j"_sorted.bam  /public/home/wangyiwei/wheat2/sambamba/"$i"_"$j"_sorted_sambamba.bam
done
done

三. strintie

1.不生成新的转录本

#stringtie
for i in NG NY WG WY
do
for j in 1 2 3 4
do
stringtie -p 8  -G /public/home/wangyiwei/wheat/ref/iwgsc_refseqv1.0_HighConf_2017Mar13.gtf -A /public/home/wangyiwei/wheat2/stringtie/"$i"_"$j"/gene_abundances.tsv  -o /public/home/wangyiwei/wheat2/stringtie/"$i"_"$j"/"$i"_"$j".gtf  -B -e /public/home/wangyiwei/wheat2/sambamba/"$i"_"$j"_sorted_sambamba.bam
done
done

四，deseq2

1.生成需要的counts文件

需要使用一个 prepDE.py 脚本，在 python2/3 环境中使用，下载地址如下：

prepDE.py (python2) : http://ccb.jhu.edu/software/stringtie/dl/prepDE.py
prepDE.py (python3) : http://ccb.jhu.edu/software/stringtie/dl/prepDE.py3

prepDE.py 需要一个 sample_list，第一列为样本名，第二列为 gtf 文件路径

有可能会报错没有权限，授权就好

/public/home/wangyiwei/new_wheat/deseq/prepDE.py3  -i  /public/home/wangyiwei/wheat2/deseq/G/Gsample_list.txt -g gene_count.csv -t transcript.csv

结束后在当前目录生成gene_count_matrix.csv和transcript_count_matrix.csv文件

2，筛选出fpkm

3.选出fpkm-------------------------------------------------------------------------------------------
library(tidyverse)
setwd("/public/home/wangyiwei/wheat2/deseq/G")
gtf1=read.table("/public/home/wangyiwei/wheat2/stringtie/NG_1/gene_abundances.tsv",sep="t")
gtf2=read.table("/public/home/wangyiwei/wheat2/stringtie/NG_2/gene_abundances.tsv",sep="t")
gtf3=read.table("/public/home/wangyiwei/wheat2/stringtie/NG_3/gene_abundances.tsv",sep="t")
gtf4=read.table("/public/home/wangyiwei/wheat2/stringtie/WG_2/gene_abundances.tsv",sep="t")
gtf5=read.table("/public/home/wangyiwei/wheat2/stringtie/WG_3/gene_abundances.tsv",sep="t")
gtf1 = arrange(gtf1,V1)
gtf2 = arrange(gtf2,V1)
gtf3 = arrange(gtf3,V1)
gtf4 = arrange(gtf4,V1)
gtf5 = arrange(gtf5,V1)
nrow(gtf1)
nrow(gtf2)
nrow(gtf3)
nrow(gtf4)
nrow(gtf5)
fpkm=data.frame(gene_name=gtf1$V1,
                FPKM.NG_1=gtf1$V8,
                FPKM.NG_2=gtf2$V8,
                FPKM.NG_3=gtf3$V8,
                FPKM.WG_2=gtf4$V8,
                FPKM.WG_3=gtf5$V8
)
sfpkm=fpkm[-1,]
write.csv(sfpkm,"gene_fpkm.csv")

3.生成差异分析结果

BiocManager::install('DESeq2')
# 加载包
library(DESeq2)
setwd("/public/home/wangyiwei/wheat2/deseq/G")
database=read.csv("gene_count.csv",header = T,row.names = 1)
head(database)
condition<-factor(c('NG','NG','NG','WG','WG'))
coldata<-data.frame(row.names=colnames(database),condition)
head(coldata)
dds <- DESeqDataSetFromMatrix(countData=database, colData=coldata, design=~condition)
dds <- DESeq(dds)
res <- results(dds)
res <- res[order(res$pvalue),]
diff_res <- as.data.frame(res)
diff_res$gene_name <- rownames(diff_res)
write.table(diff_res,file = 'DESeq2_diff_results.csv',quote = F,sep = ',',row.names = F,col.names = T)

4.给输出文件添加fpkm count 注释

setwd("/public/home/wangyiwei/wheat2/deseq/G")
d = read.csv("/public/home/wangyiwei/wheat2/deseq/G/DESeq2_diff_results.csv")
d = arrange(d,gene_name)
id=read.csv("gene_fpkm.csv")
join=left_join(d,id,by = "gene_name")
write.csv(join,"add_fpkm.csv")
setwd("/public/home/wangyiwei/wheat2/deseq/G")
d = read.csv("add_fpkm.csv")
d = arrange(d,gene_name)
count=read.csv("gene_count.csv")
count = arrange(count,gene_id)
colnames(count)[1] = 'gene_name'
join=left_join(d,count,by = "gene_name")
write.csv(join,"deseq_fpkm_count.csv")
setwd("/public/home/wangyiwei/wheat2/deseq/G")
d=read.csv("deseq_fpkm_count.csv")
nrow(d)
id=read.csv("/public/home/wangyiwei/wheat2/deseq/gene_iwgsc_refseqv1.0_functionalannotation.txt",sep="t")
colnames(id)[2] <- "gene_name"
join=left_join(d,id,by = "gene_name")
write.csv(join,"DESeq2_all_addfunction.csv")

五.图

1.划分上下游，画火山图

#划分上游下游------------------------------------------------------------------------
library(tidyverse)
setwd("E:/360MoveData/Users/DELL/Desktop/text/leaf")
data=read.csv("fpkm1.csv")
data=data.frame(data)
head(data)
cut_off_pvalue = 0.05
cut_off_logFC = 1
data$change=as.factor(ifelse(data$pvalue= cut_off_logFC, 
                             ifelse(data$log2FoldChange >= cut_off_logFC,'Up','Down'),
                             'Stable'))
up_data = subset(data,data$change=="Up")
head(up_data)
write.csv(up_data,"up_data.csv")
down_data = subset(data,data$change=="Down")
write.csv(down_data,"down_data.csv")
head(down_data)

#画火山图------------------------------------------------------------------------------------
ggplot(
  #设置数据
  data, 
  aes(x = log2FoldChange, 
      y = (-log10(pvalue)), 
      colour=change)) +
  geom_point(alpha=2, size=2) +
  #scale_color_manual(values=c("#D6604D", "#d2dae2","#4393C3"))+
  
  # 辅助线
  geom_vline(xintercept=c(-1,1),lty=2,col="black",lwd=0.8) +
  geom_hline(yintercept = c(-log10(0.05)),lty=2,col="black",lwd=0.8) +
  
  # 坐标轴
  labs(x="log2(fold change)",
       y="-log10 (p-value)")+
  theme_bw()+
  
  # 图例
  theme(plot.title = element_text(hjust = 0.5), 
        legend.position="right", 
        legend.title = element_blank()
  )

2.GO分析

把功能注释的文件进行划分

#https://www.jianshu.com/p/2cb4c83f8141
setwd("E:/360MoveData/Users/DELL/Desktop/text/GO")
library(tidyr)
library(stringr)
Func_Anno<-read.table("iwgsc_refseqv1.0_FunctionalAnnotation_v1__HCgenes_v1.0.TAB",sep="t",quote = "",header=TRUE)
GO_Gene<-Func_Anno[,c(1,8)]
#去掉空值行(基因对应GO id为空)
GO_Gene<-GO_Gene[-which(GO_Gene$GO.IDs..Description..via.Interpro==""),]
#GeneID<-sub('..$','',GO_Gene[,1])
GeneID<-data.frame(GO_Gene[,1])
names(GeneID) <- c("GeneID")
head(GeneID)

GO_Gene$Gene.ID<-GeneID
head(GO_Gene)
#拆分第二列数据
GO_Sep<-separate(GO_Gene,col=GO.IDs..Description..via.Interpro,sep =';',remove = TRUE,into=as.character(c(1:20)))
#检查GO是否完全分开(没有增加前面的into范围)
which(!is.na(GO_Sep[,21]))         #判断最后一列是否有非NA值
sum(is.na(GO_Sep[,21]))            #判断最后一列为NA值行数是否与矩阵行一样
GO_Sep_Matr<-data.frame(matrix(NA,300000,2))
k=1
for(i in 1:nrow(GO_Sep)){
  for(j in 2:ncol(GO_Sep)){
    if(is.na(GO_Sep[i,j])==FALSE){
      GO_Sep_Matr[k,1]<-GO_Sep[i,1]
      GO_Sep_Matr[k,2]<-GO_Sep[i,j]
      k=k+1
    }
  }
}
GO_Sep_Matr<-GO_Sep_Matr[-which(apply(GO_Sep_Matr,1,function(x) all(is.na(x)))),]
Extract<-str_split_fixed(GO_Sep_Matr[,2]," ",n=2)
GO_Info<-cbind(GO_Sep_Matr[,1],Extract[,1:2])
#去重复
GO_Info<-unique(GO_Info)
colnames(GO_Info)<-c("GeneID","GO","Function")
GO_Gene<-GO_Info[,1:2]
write.table(GO_Gene,"GO_Gene.txt",col.names = TRUE,row.names = FALSE,quote=FALSE,sep="t")
write.table(GO_Info,"GO_Info.txt",col.names = TRUE,row.names = FALSE,quote=FALSE,sep="t")

删除基因名字后的.1

GO_info <- read.table('GGO_Info.txt',sep = 't',header = TRUE,quote = "")
GO_info[, c("GeneID", "x")] <- str_split_fixed(GO_info$GeneID, "[.]", 2)
GO_info <- subset (GO_info, select = -x)
write.csv(GO_info,"genename_GGO_Info.csv")

把文件按照类型不同划分

GO_info <- read.csv('./gene/genename_GGO_Info.csv')
head(GO_info)

features = separate(data = GO_info, col = Function, into = c("feature", "Function"), sep = ":")
head(features)
write.table(features, file = "genename_features.txt", quote = F,sep="t",row.names = F)



CC_info<-GO_info[grep(pattern = "CC",GO_info[,3]),]
head(CC_info)
write.table(CC_info, file = "CC_gene_info.txt", quote = F,sep="t",row.names = F)
CC_info2<-subset(features,features$feature=="CC")
head(CC_info2)
write.table(CC_info2, file = "CC_gene_feature_info.txt", quote = F,sep="t",row.names = F)




MF_info<-GO_info[grep(pattern = "MF",GO_info[,3]),]
head(MF_info)
write.table(MF_info, file = "MF_gene_info.txt", quote = F,sep="t",row.names = F)
MF_info2<-subset(features,features$feature=="MF")
head(MF_info2)
write.table(MF_info2, file = "MF_gene_feature_info.txt", quote = F,sep="t",row.names = F)




BP_info<-GO_info[grep(pattern = "BP",GO_info[,3]),]
head(BP_info)
write.table(BP_info, file = "BP_gene_info.txt", quote = F,sep="t",row.names = F)
BP_info2<-subset(features,features$feature=="BP")
head(BP_info2)
write.table(BP_info2, file = "BP_gene_feature_info.txt", quote = F,sep="t",row.names = F)

GO分析，三种单独绘图

setwd("E:/360MoveData/Users/DELL/Desktop/text/root/new_up")
library("clusterProfiler")
gene_list = read.csv("up_data.csv")
head(gene_list)
gene_list = subset(gene_list)
gene <- as.character(gene_list$gene_name)  
head(gene)





GO_info <- read.csv('E:/360MoveData/Users/DELL/Desktop/text/GO_info/gene/genename_GGO_Info.csv')
head(GO_info)
term2gene = data.frame(GO_info$GO,GO_info$GeneID)
names(term2gene) <- c("GO", "GeneID")
head(term2gene)
term2name = data.frame(GO_info$GO,GO_info$Function)
names(term2name) <- c("GO", "Function")
head(term2name)
x <- enricher(gene,TERM2GENE=term2gene,TERM2NAME=term2name)
head(x)
x_result<-as.data.frame(x)
x_result = arrange(x_result,-Count)
head(x_result)
write.table(x_result, file = "allall.txt", quote = F,sep = 't')





GO_info <- read.table('E:/360MoveData/Users/DELL/Desktop/text/GO_info/gene/BP_gene_info.txt',sep = 't',header = TRUE,quote = "")
head(GO_info)
term2gene = data.frame(GO_info$GO,GO_info$GeneID)
names(term2gene) <- c("GO", "GeneID")
head(term2gene)
term2name = data.frame(GO_info$GO,GO_info$Function)
names(term2name) <- c("GO", "Function")
head(term2name)
x <- enricher(gene,TERM2GENE=term2gene,TERM2NAME=term2name)
head(x)
x_result<-as.data.frame(x)
x_result = arrange(x_result,-Count)
head(x_result)
write.table(x_result, file = "BP_go_results.txt", quote = F,sep = 't')
pdf("BP_GO.pdf")
dotplot(x)
dev.off()




GO_info <- read.table('E:/360MoveData/Users/DELL/Desktop/text/GO_info/gene/CC_gene_info.txt',sep = 't',header = TRUE,quote = "")
head(GO_info)
term2gene = data.frame(GO_info$GO,GO_info$GeneID)
names(term2gene) <- c("GO", "GeneID")
head(term2gene)
term2name = data.frame(GO_info$GO,GO_info$Function)
names(term2name) <- c("GO", "Function")
head(term2name)
x <- enricher(gene,TERM2GENE=term2gene,TERM2NAME=term2name)
head(x)
x_result<-as.data.frame(x)
x_result = arrange(x_result,-Count)
head(x_result)
write.table(x_result, file = "CC_go_results.txt", quote = F,sep = 't')
pdf("CC_GO.pdf")
dotplot(x)
dev.off()



GO_info <- read.table('E:/360MoveData/Users/DELL/Desktop/text/GO_info/gene/MF_gene_info.txt',sep = 't',header = TRUE,quote = "")
head(GO_info)
term2gene = data.frame(GO_info$GO,GO_info$GeneID)
names(term2gene) <- c("GO", "GeneID")
head(term2gene)
term2name = data.frame(GO_info$GO,GO_info$Function)
names(term2name) <- c("GO", "Function")
head(term2name)
x <- enricher(gene,TERM2GENE=term2gene,TERM2NAME=term2name)
head(x)
x_result<-as.data.frame(x)
x_result = arrange(x_result,-Count)
head(x_result)
write.table(x_result, file = "MF_go_results.txt", quote = F,sep = 't')
pdf("MF_GO.pdf")
dotplot(x)
dev.off()

三种类型分别绘图，每种的前十个

setwd("E:/360MoveData/Users/DELL/Desktop/text/leaf/new_up")
library(RSkittleBrewer) 
library(genefilter)
library(dplyr)
library(devtools)
library(tidyverse)
y=read.table("all.txt",sep = "t",header = TRUE,quote = "")

## 分别后去分号前面和后面的数，并变成数值
forward <- as.numeric(sub("/\d+$", "", y$GeneRatio))
head(forward)
backward <- as.numeric(sub("^\d+/", "", y$GeneRatio))
head(backward)
## 增加数值表示的一列GeneRatio

y$GeneRatio = forward/backward
#预先设定一个筛选阈值，用于控制结果展示的数目，按照p值排序，选取前10个
showCategory =30
font.size =12


pdf("3_all.pdf")
y %>% 
  ## 安装p值排序，选区既定数目的行
  arrange(p.adjust) %>% 
  slice(1:showCategory) %>% 
  ## 开始ggplot2 作图，其中fct_reorder调整因子level的顺序
  ggplot(aes(GeneRatio,forcats::fct_reorder(Description,Count)))+ 
  ## 画出点图
  geom_point(aes(color=p.adjust, size = Count)) +
  ## 调整颜色，guide_colorbar调整色图的方向
  scale_color_continuous(low="red", high="blue", guide=guide_colorbar(reverse=TRUE))+
  ## 调整泡泡的大小
  scale_size_continuous(range=c(3, 8))+
  ## 如果用ylab("")或出现左侧空白
  labs(y=NULL) +
  ## 如果没有这一句，上方会到顶
  ggtitle("")+
  scale_y_discrete(labels = function(y) str_wrap(y, width = 40) )+
  ## 设定主题
  theme_bw() +
  theme(axis.text.x = element_text(colour = "black",
                                   size = font.size, vjust =1 ),
        axis.text.y = element_text(colour = "black",
                                   size = font.size, hjust =1 ),
        axis.title = element_text(margin=margin(10, 5, 0, 0),
                                  color = "black",size = font.size),
        axis.title.y = element_text(angle=90))

dev.off()

RNA

Java相关栏目本月热门文章