《2021博客之星年度总评选》数据采集Java样例程序
文章目录
- pom.xml
- 2020线上投票博客之星数据采集|样例程序
-
- 2020投票贡献排行榜数据采集|样例程序
-
- 2021线上评分TOP90数据采集|样例程序
-
- 博主博客文章统计|样例程序
-
pom.xml
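<project>
  <modelVersion>4.0.0</modelVersion>
  <groupId>1</groupId>
  <artifactId>_psimplemvn</artifactId>
  <version>1.0-SNAPSHOT</version>
  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <configuration>
          <source>8</source>
          <target>8</target>
        </configuration>
      </plugin>
    </plugins>
  </build>
  <dependencies>
    <dependency>
      <groupId>org.seleniumhq.selenium</groupId>
      <artifactId>selenium-chrome-driver</artifactId>
      <version>4.0.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.poi</groupId>
      <artifactId>poi-ooxml</artifactId>
      <version>3.17</version>
    </dependency>
  </dependencies>
</project>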
2020线上投票博客之星数据采集|样例程序
package simple.call.blogstar;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import simple.call.util.StringUtil;
import simple.call.util.TimeUtil;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
public class BlogStarStatisticsTest {
private static String outPutPath = "/home/wangyetao/IdeaProjects/_psimplemvn/src/main/java/simple/call/blogstar/";
private static String filename = "blog_star2020";
private static String sheetname = filename.toUpperCase();
private static String suffix = ".xlsx";
private static FileOutputStream outputStream;
private static ArrayList blogStars;
private static String url = "https://bss.csdn.net/m/topic/blog_star2020";//blog_star2020 url
//测试用例
public static void main(String[] args) throws InterruptedException {
//预先设置驱动
System.setProperty("webdriver.chrome.driver", "/usr/bin/chromedriver");
// Initialize your driver as you normally would:
ChromeDriver driver = new ChromeDriver();
driver.get(url);
//
//单条数据DOM结构
// -
//
// 001
//
//
//
// ✎ℳ๓₯㎕...雲淡風輕
// 码龄6年
//
//
2020年度原创博文:77 篇
// 当前票数: 392 票
//
//
// 投TA一票
//
// 为TA拉票
//
//
//
//
blogStars = new ArrayList();
//稍等页面渲染完成
Thread.sleep(3000);
List search_results = driver.findElements(By.xpath("/
package simple.call.blogstar;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import simple.call.util.StringUtil;
import simple.call.util.TimeUtil;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
public class BlogStarStatisticsVoteLeaderboardList {
private static String outPutPath = "/home/wangyetao/IdeaProjects/_psimplemvn/src/main/java/simple/call/blogstar/";
private static String filename = "aa518189";
private static String sheetname = filename;
private static String suffix = ".xlsx";
private static FileOutputStream outputStream;
private static ArrayList blogVotes;
private static String url = "https://bss.csdn.net/m/topic/blog_star2020/detail?username=aa518189";
//测试用例
public static void main(String[] args) throws InterruptedException {
//预先设置驱动
System.setProperty("webdriver.chrome.driver", "/usr/bin/chromedriver");
// Initialize your driver as you normally would:
ChromeDriver driver = new ChromeDriver();
driver.get(url);
//
//单条数据DOM结构
// -
//
// 1
// swagLi
//
//
//
//
// 码龄4年
// 36票
//
//
//
blogVotes = new ArrayList();
//稍等页面渲染完成
Thread.sleep(2000);
List search_results = driver.findElements(By.xpath("/
package simple.call.blogstar;
import org.apache.poi.common.usermodel.HyperlinkType;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CreationHelper;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFHyperlink;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import simple.call.util.StringUtil;
import simple.call.util.TimeUtil;
import java.io.*;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
public class Blogstar2021 {
private static String outPutPath = "/home/wangyetao/IdeaProjects/_psimplemvn/src/main/java/simple/call/blogstar/";
//预先在指定目录创建好blogstar2021.xlsx Excel文件,防止FileNotFoundException
private static String filename = "blogstar2021";
private static String sheetname = TimeUtil.getStampToString(System.currentTimeMillis(), "yyyyMMddHHmmss");
private static String suffix = ".xlsx";
//用于保留Excel中的原内容
private static FileInputStream inputStream;
//用于往Excel中追加写入新内容
private static FileOutputStream outputStream;
private static ArrayList blogStars;
private static String url = "https://www.csdn.net/blogstar2021";//blogstar2021 url
//测试用例
public static void main(String[] args) throws InterruptedException {
//预先设置驱动
System.setProperty("webdriver.chrome.driver", "/usr/bin/chromedriver");
// Initialize your driver as you normally would:
ChromeDriver driver = new ChromeDriver();
driver.get(url);
blogStars = new ArrayList();
//稍等页面渲染完成
Thread.sleep(3000);
List lis = driver.findElement(By.className("authorscoring-nav")).findElements(By.tagName("li"));
for (int i = 0; i < lis.size(); i++) {
WebElement element = lis.get(i);
element.click();
Thread.sleep(2000);
List boxs = driver.findElements(By.className("scoreitem"));
for (int j = 0; j < boxs.size(); j++) {
WebElement box = boxs.get(j);
BlogStar blogStar = new BlogStar();
//领域
blogStar.field = element.getText();
//博主简称
blogStar.name = box.findElement(By.className("name")).getText();
List dts = box.findElements(By.tagName("dt"));
//排名
blogStar.ranking = dts.get(0).getText();
//分数
blogStar.score = StringUtil.getInts(dts.get(1).getText())[0];
//评分页
blogStar.scorePage = box.findElements(By.tagName("a")).get(2).getAttribute("href");
//blogUrl
blogStar.blogUrl = box.findElement(By.tagName("a")).getAttribute("href");
//录入时间
blogStar.createTime = TimeUtil.getStampToString(System.currentTimeMillis(), "yyyy-MM-dd HH:mm:ss");
blogStars.add(blogStar);
}
lis = driver.findElement(By.className("authorscoring-nav")).findElements(By.tagName("li"));
}
driver.close();
ArrayList heads = new ArrayList();
heads.add("领域");
heads.add("博主简称");
heads.add("领域排名");
heads.add("总评分");
heads.add("参赛互动页");
heads.add("博主首页");
heads.add("录入时间");
//CSVUtils.createCSVFile(heads, blogStars, outPutPath, filename);
System.out.println("Creating excel");
try {
File file = new File(outPutPath + filename + suffix);
inputStream = new FileInputStream(file);
XSSFWorkbook workbook = new XSSFWorkbook(inputStream);
XSSFSheet sheet = workbook.createSheet(sheetname);
//设置列宽
sheet.setColumnWidth(0, 16 * 256);
sheet.setColumnWidth(1, 20 * 256);
sheet.setColumnWidth(2, 10 * 256);
sheet.setColumnWidth(3, 10 * 256);
sheet.setColumnWidth(4, 20 * 256);
sheet.setColumnWidth(5, 20 * 256);
sheet.setColumnWidth(6, 25 * 256);
Row row = null;
Cell cell = null;
//插入第一行数据的表头
//创建第一行
row = sheet.createRow(0);
for (int i = 0; i < heads.size(); i++) {
cell = row.createCell(i);
cell.setCellValue(heads.get(i));
}
int rowNum = 1;
int colNum = 0;
//组合表格:行、列
for (BlogStar blogStar : blogStars) {
row = sheet.createRow(rowNum++);
cell = row.createCell(colNum++);
cell.setCellValue(blogStar.field);
cell = row.createCell(colNum++);
cell.setCellValue(blogStar.name);
cell = row.createCell(colNum++);
cell.setCellValue(blogStar.ranking);
cell = row.createCell(colNum++);
cell.setCellValue(blogStar.score);
cell = row.createCell(colNum++);
CreationHelper createHelper = workbook.getCreationHelper();
XSSFHyperlink link = (XSSFHyperlink) createHelper.createHyperlink(HyperlinkType.URL);
link.setAddress(blogStar.scorePage);
cell.setHyperlink(link);
cell.setCellValue(blogStar.scorePage);
cell = row.createCell(colNum++);
XSSFHyperlink link2 = (XSSFHyperlink) createHelper.createHyperlink(HyperlinkType.URL);
link2.setAddress(blogStar.blogUrl);
cell.setHyperlink(link2);
cell.setCellValue(blogStar.blogUrl);
cell = row.createCell(colNum++);
cell.setCellValue(blogStar.createTime);
colNum = 0;
}
outputStream = new FileOutputStream(file);
//写入数据到Excel
workbook.write(outputStream);
//关闭流
inputStream.close();
outputStream.close();
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
System.out.println("Done");
}
}
采集样例
博主博客文章统计|样例程序
package simple.call.blogstar;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import simple.call.util.StringUtil;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Sample program: opens a CSDN blog home page in Chrome, pages through the article list,
 * and writes title, summary, publish time and read count of every article to an Excel file.
 */
public class BlogArticleStatistics {
    private static final String OUTPUT_PATH =
            "/home/wangyetao/IdeaProjects/_psimplemvn/src/main/java/simple/call/blogstar/";
    private static final String FILENAME = "u014132947";
    private static final String SHEET_NAME = "article_" + FILENAME;
    private static final String SUFFIX = ".xlsx";
    private static final String URL = "https://blog.csdn.net/u014132947";

    // Header captions, in column order.
    private static final String[] HEADS = {"文章标题", "简要内容", "发布时间", "访问数"};

    /**
     * Entry point: scrapes the article list, then writes the Excel file.
     *
     * @param args unused
     * @throws InterruptedException if the thread is interrupted while waiting for the page
     */
    public static void main(String[] args) throws InterruptedException {
        System.setProperty("webdriver.chrome.driver", "/usr/bin/chromedriver");

        List<Article> blogArticles;
        ChromeDriver driver = new ChromeDriver();
        try {
            blogArticles = scrape(driver);
        } finally {
            // quit() (not close()) ends the whole session, and the finally block makes sure
            // the browser is released even when scraping throws.
            driver.quit();
        }

        System.out.println("Creating excel");
        writeExcel(blogArticles);
    }

    /**
     * Collects articles page by page until the count announced by the page is reached,
     * a page yields nothing new, or there is no "next page" control left.
     *
     * <p>Each article is an element with class {@code article-item-box}; inside it the first
     * {@code a} carries the title, and the elements with classes {@code content},
     * {@code date} and {@code read-num} carry summary, publish time and read count.
     */
    private static List<Article> scrape(ChromeDriver driver) throws InterruptedException {
        driver.get(URL);
        // Fixed wait for the page to finish rendering.
        Thread.sleep(2000);

        // Target article count, taken from the data-num attribute of the blog header element.
        int dataNum = Integer.parseInt(
                driver.findElement(By.id("container-header-blog")).getAttribute("data-num"));

        List<Article> blogArticles = new ArrayList<>();
        while (blogArticles.size() < dataNum) {
            int before = blogArticles.size();
            for (WebElement item : driver.findElements(By.className("article-item-box"))) {
                blogArticles.add(readArticle(item));
            }
            // Stop once the target is reached, or when the page added nothing (guards
            // against looping forever if data-num is larger than what is reachable).
            if (blogArticles.size() >= dataNum || blogArticles.size() == before) {
                break;
            }

            // findElements returns an empty list instead of throwing when the control is absent.
            List<WebElement> nextButtons = driver.findElements(By.className("js-page-next"));
            if (nextButtons.isEmpty()) {
                break;
            }
            nextButtons.get(0).click();
            Thread.sleep(3000);
        }
        return blogArticles;
    }

    /** Builds an Article from a single article-item-box element. */
    private static Article readArticle(WebElement item) {
        Article article = new Article();
        article.title = item.findElement(By.tagName("a")).getText();
        article.content = item.findElement(By.className("content")).getText();
        article.publishTime = item.findElement(By.className("date")).getText();
        article.readNum = StringUtil.getInts(item.findElement(By.className("read-num")).getText())[0];
        return article;
    }

    /** Writes the collected articles to a new workbook at the configured output path. */
    private static void writeExcel(List<Article> blogArticles) {
        String target = OUTPUT_PATH + FILENAME + SUFFIX;
        try (XSSFWorkbook workbook = new XSSFWorkbook()) {
            XSSFSheet sheet = workbook.createSheet(SHEET_NAME);
            // The read-count column (index 3) is narrower than the text columns.
            for (int col = 0; col < HEADS.length; col++) {
                sheet.setColumnWidth(col, (col == 3 ? 6 : 15) * 256);
            }

            Row header = sheet.createRow(0);
            for (int col = 0; col < HEADS.length; col++) {
                header.createCell(col).setCellValue(HEADS[col]);
            }

            int rowNum = 1;
            for (Article article : blogArticles) {
                Row row = sheet.createRow(rowNum++);
                row.createCell(0).setCellValue(article.title);
                row.createCell(1).setCellValue(article.content);
                row.createCell(2).setCellValue(article.publishTime);
                row.createCell(3).setCellValue(article.readNum);
            }

            try (FileOutputStream out = new FileOutputStream(target)) {
                workbook.write(out);
            }
            System.out.println("Done");
        } catch (IOException e) {
            System.err.println("Failed to write " + target + ": " + e.getMessage());
            e.printStackTrace();
        }
    }
}
采集样例
作于2021年 12月 27日 星期一 04:02:17 CST,归档于2021年 12月 27日 星期一 20:48:42 CST。