# Calculate gene signature score
### ---------------
###
### Create: Yuan.Sh
### Date: 2021-11-22 16:35:19
### Email: yuansh3354@163.com
### Blog: https://blog.csdn.net/qq_40966210
### Fujian Medical University
### Ref : https://github.com/carmonalab/UCell_demo
###
### ---------------
############# Step.00 #############
# clean
rm(list = ls())
gc()
# packages
library(Seurat)
library(stringr)
library(scales)
library(magrittr)
library(RColorBrewer)
library(ggplot2)
library(UCell)
# myfunctions
myhead = function(x,DT=FALSE){
print(paste('Data dimension: ',dim(x)[1],dim(x)[2]))
if(dim(x)[2]>5){print(x[1:5,1:5])}
else(print(head(x)))
if(DT){DT::datatable(x,filter='top')}
}
mycolors = c("#1a476f","#90353b","#55752f","#e37e00","#6e8e84",
"#c10534","#938dd2","#cac27e","#a0522d","#7b92a8",
"#2d6d66","#9c8847","#bfa19c","#ffd200","#d9e6eb",
"#4DBBD5B2", "#00A087B2", "#3C5488B2",
"#F39B7FB2", "#8491B4B2","#91D1C2B2","#DC0000B2",
"#7E6148B2","#E64B35B2",'#698EC3')
show_col(mycolors)
# options
options(stringsAsFactors = F)
options(as.is = T)
setwd('/media/yuansh/14THHD/BscModel-V4/supplementary materials/DEGs')
# main
df = read.csv('GSE131907-Cancer_absolute_scale_wilcoxon.csv',row.names = 1)
df = df[which( (abs(df$logfoldchanges) > 2) & (df$pvals_adj < 0.05) ),]
#############
#####读取前n行数据
https://www.jb51.net/article/210396.htm
readfile<-function(file, n=1000, header=F){
pt <- file(file, "r")
name <- NULL
if(header){
name <- strsplit(readLines(pt, 1), split=',')[[1]]; #读取标题
f1 <- readLines(pt, n)
data <- read.table(text=f1, sep=',', col.names=name)
}else{
f1 <- readLines(pt, n)
data <- read.table(text=f1, sep=',')
}
close(pt)
data
}
cls = readfile('pyscience/auc_mtx.csv',n=1)
表格查看数据框
数据重塑(宽转长):
melt函数是reshape2包中的数据宽转长的函数
mydata<-melt(
mydata, #待转换的数据集名称
id.vars=c("Conpany","Name"), #要保留的主字段
variable.name="Year", #转换后的分类字段名称(维度)
value.name="Sale" #转换后的度量值名称
)
在tidyr包中的gather也可以非常快捷的完成宽转长的任务:
data1<-gather(
data=mydata, #待转换的数据集名称
key="Year", #转换后的分类字段名称(维度)
value="Sale" , #转换后的度量值名称
Sale2013:Sale2016 #选择将要被拉长的字段组合
) #(可以使用x:y的格式选择连续列,也可以以-z的格式排除主字段)
2、reshape2中的dcast函数可以完成数据长转宽的需求:
dcast( data=data1, #数据集名称 Name+Conpany~Year #x1+x2+……~class #这一项是一个转换表达式,表达式左侧列 #出要保留的主字段(即不会被扩宽的字段,右侧则是要分割的分类变量,扩展之后的 #宽数据会增加若干列度量值,列数等于表达式右侧分类变量的类别个数 )
DT::datatable(df,filter='top')R包的安装
rm(list = ls())
options()$repos
options()$BioC_mirror
options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/")
options("repos" = c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/"))
options()$repos
options()$BioC_mirror
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
## 先用这个,这个是 R 包最通用的下载方法
## 一般常规来说,如果能挂梯子就不要用这种方法,太麻烦了
## repos 是指定镜像
install.packages("export", dependencies = T, repos='https://mran.microsoft.com/snapshot/2019-02-01/')
## 生物学分析包用这个下载
BiocManager::install("export",ask = F,update = F, force = T)
## 如果是存放在 github 上的包就用这个
## 这个就是强烈建议用终端下载
library(devtools)
devtools::install_github("OHDSI/Achilles")
分割字符
library(stringr) t=str_split(colnames(expset),'\.',simplify = T)groupby
df <- group_by(df,[' typeof','variable'])
df <- summarise(df,
count = n(), #个数
max_mon = max(value), #最大值
min_mon = min(value), #最小值
avg_sales = mean(value), #平均值
sum_sales = sum(value)) #求和
R语言将图片保存为 PPT
library(eoffice)
library(ggplot2)
filen <- tempfile(pattern = "ggplot")
x=qplot(Sepal.Length, Petal.Length, data = iris,
color = Species,size = Petal.Width, alpha = I(0.7))
topptx(x,file="/home/yuansh/Desktop/gg.pptx")
library(export)
library(ggplot2)
filen <- tempfile(pattern = "ggplot")
x=qplot(Sepal.Length, Petal.Length, data = iris,
color = Species,size = Petal.Width, alpha = I(0.7))
graph2ppt(x=x, file=filen)
将多个图片保存为PDF并合成后输出
library(pdftools)
#创建一个文件夹来存放每篇文章的首页
dir.create("cover")
#假设所有的文章都存在ATAC这个文件夹中
#获取ATAC文件夹中的所有pdf文件
pdfs<-list.files("plot",full.names = T)
for(i in seq_along(pdfs)){
#pages控制提取的页面,2:5就是从第二页到第五页
pdf_subset(pdfs[i], pages = 1:1, output = paste0("cover/",i,".pdf"))
}
#获取cover文件夹中所有的pdf文件
covers<-list.files("cover",full.names = T)
#合并成一个pdf文件
pdf_combine(covers, output = "joined_covers.pdf")
assign函数的使用
setwd('/Volumes/Lexar/阿尔兹海默症/最终结果/IPA/xls')
library(magrittr)
library(stringr)
library(xlsx)
options(stringsAsFactors = F)
options(as.is = T)
# 把 xls 格式转为 csv
# 注意一一下,xls 第一行是没用的要把它删掉
if(F){
list = dir()
for(i in 1:length(list)){
df = read.xlsx(list[i],sheetIndex = 1)
colnames(df) = df[1,]
df = df[-1,] %>% as.data.frame()
file = paste0('../csv/',gsub('.xls','.csv',list[i]))
write.csv(df,file)
}
}
rm(list= ls())
setwd('/Volumes/Lexar/阿尔兹海默症/最终结果/IPA/csv')
# 批量导入数据
### 只读取 63060 的数据
list = dir(pattern = "63060_833mRNA")
### 用文件名字定义变量
ids = gsub('63060_833mRNA_','',list) %>% gsub('.csv','',.) %>% gsub(' ','_',.)
# 使用 assign 函数批量读取
for(i in 1:length(list)){
assign(ids[i], read.csv(list[i],header = T,row.names = 1))
}
###### 读取 assign
# 用 assign 读取的数据名储存在 ids 中
# 然后可以使用 get 函数将 ids 的数据提取出来
get(ids[1])
重复的探针取均值 以及 基因 ID 转换
genes <- mapIds(org.Hs.eg.db,keys=rownames(df),column="SYMBOL",keytype="ENSEMBL",multiVals="first") %>% as.data.frame()
# 如果有重复取均值
df <- t(sapply(split(row.names(genes), genes[,1]), function(ids){
colMeans(df[ids,,drop=FALSE])
}))
library("clusterProfiler")
library("org.Hs.eg.db")
genes <- bitr(genes, fromType = "SYMBOL", #fromType是指你的数据ID类型是属于哪一类的
toType = c("ENTREZID", "SYMBOL"), #toType是指你要转换成哪种ID类型,可以写多种,也可以只写一种
OrgDb = org.Hs.eg.db)#Orgdb是指对应的注释包是哪个
生存曲线
rm(list=ls())
setwd("C:\Users\yuansh\Desktop")
dir()
library(survival)
library(survminer)
library(magrittr)
annotate_x <- 20
plot_title_name = paste0(id1,'-',id2)
df = cbind(rawdata, label)
survival_table=df
meta = survival_table
colnames(meta) = c('ID','time',"event",'group')
meta = na.omit(meta)
sfit1 <- survfit(Surv(time, event)~group, data=meta) #primary_ER/CTC_ER
summary = summary(coxph(Surv(time, event)~group, data=meta))
HR = round(summary$coefficients[2],2)
CI95 = round(summary$conf.int[3:4],2)
LogRankP = signif(summary$sctest[3],digits = 3)
pp = paste("LogRank p = ",LogRankP)
HHRR = paste("HR = ",HR,"( 95% CI:",CI95[1],"-",CI95[2],")")
ggsurvplot(
sfit1,palette =c("#1EB2A6","#F67575"),
pval =TRUE,
font.y=14,
pval.size = 8,
pval.coord = c(0.2,0.1),
conf.int = T,
risk.table ='nrisk_cumcensor',
legend.labs =c("Low Rate", "High Rate"),
xlab ="Time[months]",
ylab ="Overall Survival",
legend.title = "PDL-1 Positive+++ rate",
legend = c(0.9,0.9),
risk.table.title = 'Number at Risk n(number censored)'
)+theme_survminer(
base_family = "Times New Roman",
base_size = 14,
font.main = c(14, "bold",'black'),
font.submain = c(14, "bold",'black'),
font.caption = c(14, "bold",'black'),
font.x = c(14, "bold",'black'),
font.y = c(14, "bold",'black'),
font.tickslab = c(14, "bold",'black'),
font.legend = c(14, "bold",'black')
)
res$table <- res$table +
theme(axis.line = element_blank(),
axis.text.x = element_text(family = "serif",face ="bold"),
plot.title = element_text(family = "serif",face ="bold.italic"),
axis.title.x = element_text(family = "serif",face ="bold.italic" )
)
res$plot <- res$plot +
labs(title=plot_title_name) +
theme(axis.title.x = element_text(family = "serif",face ="bold.italic",size=20),
axis.title.y = element_text(family = "serif",face ="bold.italic",size=20),
axis.text.x = element_text(family = "serif",face ="bold"),
axis.text.y = element_text(family = "serif",face ="bold"),
plot.title = element_text(hjust = 0.5,size=20),
legend.text = element_text(size = 20)
)
res$plot <- res$plot +
annotate("text", x = annotate_x,y= 0.15,
label = pp,size = 6,family = "serif",
colour = "black", fontface = "bold" #,fontface = "italic"
) +
annotate("text", x = annotate_x,y= 0.08,
label = HHRR,size = 6,family = "serif",
colour = "black",fontface = "bold" #,fontface = "italic",
)+
annotate("text", x = annotate_x,y= 0.01,
label = " C-index = 0.6933",size = 6,family = "serif",
colour = "black",fontface = "bold" #,fontface = "italic",
)
res
mclapply多核多线程处理
mclapply(pt.obj_index, function(x, sce) {
sce <- sce[, x]
norms <- as.matrix(t(assays(sce)$norms))
rd <- prcomp(norms, scale. = FALSE)$x[, 1:2]
reducedDims(sce) <- SimpleList(PCA = rd)
fit <- slingshot(sce, reducedDim = 'PCA', clusterLabels = "prelabel")
tbl <- tibble(cell = colnames(sce), pseudotime = scales::rescale(colData(fit)$slingPseudotime_1))
## Make sure the direction is the same as the original pseudotime
merge.tbl <- left_join(tbl, pt.obj_ori_tbl, by = "cell")
if(cor(merge.tbl$pseudotime.x, merge.tbl$pseudotime.y) < 0) {
tbl <- dplyr::mutate(tbl, pseudotime = 1-pseudotime)
}
tbl
}, sce = pt.obj)
把 xls 格式转为 csv
if(F){
list = dir()
for(i in 1:length(list)){
df = read.xlsx(list[i],sheetIndex = 1)
colnames(df) = df[1,]
df = df[-1,] %>% as.matrix()
# 分别提取 z 和 p
df1 = df[,2:3] %>% set_rownames(df[,1])
mode(df1) = 'numeric'
df2 = df[,5:6] %>% set_rownames(df[,4])
mode(df2) = 'numeric'
# 确保行名一致
ids = intersect(rownames(df1),rownames(df2))
df1 = df1[ids,]
df2 = df2[ids,]
# 整理列名
colnames(df1) = paste('Z_score',colnames(df1),sep = '_')
colnames(df2) = paste('P_value',colnames(df2),sep = '_')
df = data.frame(df1,df2) %>% na.omit()
file = paste0('csv/',gsub('.xls','.csv',list[i]))
write.csv(df,file)
}
}
热图
通路富集
PCA,TSNE
ggplot 主题设置



