见:把tcga大计划的CNS级别文章标题画一个词云
词云绘制
关于词云绘制较详细的步骤,可以参考之前的博文: https://blog.csdn.net/shy_321/article/details/120567111
2018的TCGA的泛癌项目论文 获取数据文章的标题都在该网页里,
(https://www.cell.com/pb-assets/consortium/pancanceratlas/pancani3/index.html )
可以选择使用复制粘贴的方式(数据不是很多) 或者, 直接解析该网页html的内容,提取标题。
这里采用网页解析的方式。
# rvest is an R web-scraping package. Install it only when it is missing so
# that re-running the script does not reinstall it every time.
if (!requireNamespace("rvest", quietly = TRUE)) {
  install.packages("rvest")
}
library(rvest)

# read_html() can take a URL directly, but fetching
# https://www.cell.com/pb-assets/consortium/pancanceratlas/pancani3/index.html
# did not succeed here (possibly a network problem). Workaround: open the page
# in a browser, save it as tcga_cancer.html, and parse the local copy instead.
html <- read_html("tcga_cancer.html")

# Extract the article titles: keep the 2nd to 4th <section> elements, then
# take the text of each <a> inside the <li> that immediately follows an
# <li class="journal"> within a <ul>.
sections <- html %>% html_nodes("section")
titles <- sections[2:4] %>%
  html_nodes("ul > li.journal + li > a") %>%
  html_text()

# Print the titles for a quick visual check.
titles
---------------------------------
> titles
[1] "Cell-of-Origin Patterns Dominate the Molecular Classification of 10,000 Tumors from 33 Types of Cancer"
[2] "Machine Learning Identifies Stemness Features Associated with Oncogenic Dedifferentiation"
[3] "A Comprehensive Pan-Cancer Molecular Study of Gynecologic and Breast Cancers"
[4] "Comparative Molecular Analysis of Gastrointestinal Adenocarcinomas"
[5] "Genomic, Pathway Network, and Immunologic Features Distinguishing Squamous Carcinomas"
[6] "The Cancer Genome Atlas Comprehensive Molecular Characterization of Renal Cell Carcinoma"
[7] "Perspective on Oncogenic Processes at the End of the Beginning of Cancer Genomics"
[8] "Pathogenic Germline Variants in 10,389 Adult Cancers"
....................
不熟悉爬虫的同学可能会有些迷惑。学爬虫之前,可能需要先学下 HTML。感兴趣的同学可以另外去学习一下(网上有挺多资料和教程的)。
library(tm)
library(wordcloud2)

# Build a tm corpus with one document per article title.
# `titles` is the character vector of scraped titles created earlier.
txt_source <- VectorSource(titles)
docs <- tm::Corpus(txt_source)

# Text clean-up: collapse whitespace, drop digits and punctuation, stem the
# words, then remove English stop words.
docs <- tm_map(docs, stripWhitespace)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removePunctuation)
# R is case-sensitive: the tm function is stemDocument(), not stemdocument().
# NOTE(review): stemDocument() relies on the SnowballC package; confirm it is
# installed.
docs <- tm_map(docs, stemDocument)
docs <- tm_map(docs, removeWords, stopwords("english"))

# Term-by-document count matrix. tolower = FALSE keeps the original letter
# case of the terms (spelled out as FALSE because F can be reassigned).
dtm <- TermDocumentMatrix(docs, control = list(tolower = FALSE))
m <- as.matrix(dtm)

# Total frequency of each term across all titles, most frequent first.
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
head(d, 10)

# Draw the word cloud from the word/frequency table.
wordcloud2(d, size = 0.4, shape = "star")
2020的Nature及其子刊的22篇全基因组的泛癌分析
# Read the Nature collection page directly from its URL.
html <- read_html("https://www.nature.com/collections/afdejfafdb/")

# Extract the text of every node matched by the "articletitle" selector.
# NOTE(review): as written this selects elements whose tag name is
# `articletitle`; confirm against the page markup that this is the intended
# selector.
titles <- html %>%
  html_nodes("articletitle") %>%
  html_text()

# Build a tm corpus with one document per article title.
txt_source <- VectorSource(titles)
docs <- tm::Corpus(txt_source)

# Text clean-up: collapse whitespace, drop digits and punctuation, stem the
# words, then remove English stop words.
docs <- tm_map(docs, stripWhitespace)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removePunctuation)
# R is case-sensitive: the tm function is stemDocument(), not stemdocument().
docs <- tm_map(docs, stemDocument)
docs <- tm_map(docs, removeWords, stopwords("english"))

# Term-by-document count matrix. tolower = FALSE keeps the original letter
# case of the terms (spelled out as FALSE because F can be reassigned).
dtm <- TermDocumentMatrix(docs, control = list(tolower = FALSE))
m <- as.matrix(dtm)

# Total frequency of each term across all titles, most frequent first.
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
head(d, 10)

# Draw the word cloud from the word/frequency table.
library(wordcloud2)
wordcloud2(d, size = 0.4, shape = "star")
参考
https://jtr13.github.io/cc19/web-scraping-using-rvest.html



