栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 软件开发 > 后端开发 > Java

《2021博客之星年度总评选》数据采集样例程序

Java 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

《2021博客之星年度总评选》数据采集样例程序

《2021博客之星年度总评选》数据采集Java样例程序

文章目录
  • pom.xml
  • 2020线上投票博客之星数据采集|样例程序
    • 采集样例
  • 2020投票贡献排行榜数据采集|样例程序
    • 采集样例
  • 2021线上评分TOP90数据采集|样例程序
    • 采集样例
  • 博主博客文章统计|样例程序
    • 采集样例


pom.xml


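<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>1</groupId>
    <artifactId>_psimplemvn</artifactId>
    <version>1.0-SNAPSHOT</version>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>8</source>
                    <target>8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>

    <dependencies>
        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-chrome-driver</artifactId>
            <version>4.0.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>3.17</version>
        </dependency>
    </dependencies>
</project>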



2020线上投票博客之星数据采集|样例程序
package simple.call.blogstar;

import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import simple.call.util.StringUtil;
import simple.call.util.TimeUtil;

import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;


public class BlogStarStatisticsTest {

    private static String outPutPath = "/home/wangyetao/IdeaProjects/_psimplemvn/src/main/java/simple/call/blogstar/";
    private static String filename = "blog_star2020";
    private static String sheetname = filename.toUpperCase();
    private static String suffix = ".xlsx";
    private static FileOutputStream outputStream;
    private static ArrayList blogStars;
    private static String url = "https://bss.csdn.net/m/topic/blog_star2020";//blog_star2020 url

    //测试用例
    public static void main(String[] args) throws InterruptedException {


        //预先设置驱动
        System.setProperty("webdriver.chrome.driver", "/usr/bin/chromedriver");

        // Initialize your driver as you normally would:
        ChromeDriver driver = new ChromeDriver();
        driver.get(url);

        //
    //单条数据DOM结构 //
  • // // 001 // // // // ✎ℳ๓₯㎕...雲淡風輕 // 码龄6年 // //

    2020年度原创博文:77 篇

    //

    当前票数: 392

    // // // 投TA一票 // // 为TA拉票 // // //
  • //
      blogStars = new ArrayList(); //稍等页面渲染完成 Thread.sleep(3000); List search_results = driver.findElements(By.xpath("/ package simple.call.blogstar; import org.apache.poi.ss.usermodel.Cell; import org.apache.poi.ss.usermodel.Row; import org.apache.poi.xssf.usermodel.XSSFSheet; import org.apache.poi.xssf.usermodel.XSSFWorkbook; import org.openqa.selenium.By; import org.openqa.selenium.WebElement; import org.openqa.selenium.chrome.ChromeDriver; import simple.call.util.StringUtil; import simple.call.util.TimeUtil; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.util.ArrayList; import java.util.List; public class BlogStarStatisticsVoteLeaderboardList { private static String outPutPath = "/home/wangyetao/IdeaProjects/_psimplemvn/src/main/java/simple/call/blogstar/"; private static String filename = "aa518189"; private static String sheetname = filename; private static String suffix = ".xlsx"; private static FileOutputStream outputStream; private static ArrayList blogVotes; private static String url = "https://bss.csdn.net/m/topic/blog_star2020/detail?username=aa518189"; //测试用例 public static void main(String[] args) throws InterruptedException { //预先设置驱动 System.setProperty("webdriver.chrome.driver", "/usr/bin/chromedriver"); // Initialize your driver as you normally would: ChromeDriver driver = new ChromeDriver(); driver.get(url); //
        //单条数据DOM结构 //
      • // // 1 // swagLi // // // // // 码龄4年 // 36票 // //
      • //
          blogVotes = new ArrayList(); //稍等页面渲染完成 Thread.sleep(2000); List search_results = driver.findElements(By.xpath("/
package simple.call.blogstar;

import org.apache.poi.common.usermodel.HyperlinkType;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CreationHelper;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFHyperlink;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import simple.call.util.StringUtil;
import simple.call.util.TimeUtil;

import java.io.*;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

/**
 * Sample scraper for the "blogstar2021" page: clicks through every tab of the
 * {@code authorscoring-nav} list, reads each {@code scoreitem} box and appends the
 * result as a new, timestamp-named sheet to an existing Excel workbook.
 */
public class Blogstar2021 {

    private static String outPutPath = "/home/wangyetao/IdeaProjects/_psimplemvn/src/main/java/simple/call/blogstar/";
    // blogstar2021.xlsx must already exist in outPutPath: it is opened for reading first,
    // so a missing file ends in a FileNotFoundException.
    private static String filename = "blogstar2021";
    private static String sheetname = TimeUtil.getStampToString(System.currentTimeMillis(), "yyyyMMddHHmmss");
    private static String suffix = ".xlsx";
    private static ArrayList<BlogStar> blogStars;
    private static String url = "https://www.csdn.net/blogstar2021"; // blogstar2021 url

    /** Header row, one entry per column. */
    private static final String[] HEADS = {"领域", "博主简称", "领域排名", "总评分", "参赛互动页", "博主首页", "录入时间"};

    /** Column widths in characters; POI expects units of 1/256 of a character. */
    private static final int[] COLUMN_WIDTHS = {16, 20, 10, 10, 20, 20, 25};

    /**
     * Entry point: scrapes the page, then writes the collected rows to the workbook.
     *
     * @param args unused
     * @throws InterruptedException if the thread is interrupted while waiting for the page to render
     */
    public static void main(String[] args) throws InterruptedException {
        // The chromedriver binary location has to be configured before the driver is created.
        System.setProperty("webdriver.chrome.driver", "/usr/bin/chromedriver");

        ChromeDriver driver = new ChromeDriver();
        try {
            driver.get(url);
            blogStars = collectBlogStars(driver);
        } finally {
            // quit() ends the whole session even when scraping failed half-way.
            driver.quit();
        }

        System.out.println("Creating excel");
        appendSheet(blogStars);
        System.out.println("Done");
    }

    /** Clicks each field tab in turn and reads every score box shown for it. */
    private static ArrayList<BlogStar> collectBlogStars(ChromeDriver driver) throws InterruptedException {
        ArrayList<BlogStar> collected = new ArrayList<>();

        // Give the page time to finish rendering.
        Thread.sleep(3000);

        List<WebElement> tabs = findFieldTabs(driver);
        for (int i = 0; i < tabs.size(); i++) {
            WebElement tab = tabs.get(i);
            tab.click();
            Thread.sleep(2000);

            // The tab label is the same for every box listed under it.
            String field = tab.getText();

            for (WebElement box : driver.findElements(By.className("scoreitem"))) {
                BlogStar blogStar = new BlogStar();
                blogStar.field = field;
                blogStar.name = box.findElement(By.className("name")).getText();

                List<WebElement> dts = box.findElements(By.tagName("dt"));
                blogStar.ranking = dts.get(0).getText();
                blogStar.score = StringUtil.getInts(dts.get(1).getText())[0];

                // Third anchor is the scoring page, first anchor is the blog home page.
                List<WebElement> anchors = box.findElements(By.tagName("a"));
                blogStar.scorePage = anchors.get(2).getAttribute("href");
                blogStar.blogUrl = anchors.get(0).getAttribute("href");

                blogStar.createTime = TimeUtil.getStampToString(System.currentTimeMillis(), "yyyy-MM-dd HH:mm:ss");
                collected.add(blogStar);
            }

            // Look the tabs up again before moving on to the next one.
            tabs = findFieldTabs(driver);
        }
        return collected;
    }

    /** Returns the {@code li} elements of the {@code authorscoring-nav} list. */
    private static List<WebElement> findFieldTabs(ChromeDriver driver) {
        return driver.findElement(By.className("authorscoring-nav")).findElements(By.tagName("li"));
    }

    /** Opens the existing workbook, adds a sheet holding {@code rows} and writes it back. */
    private static void appendSheet(List<BlogStar> rows) {
        File file = new File(outPutPath + filename + suffix);

        // The workbook is loaded from the input stream first so existing sheets are kept;
        // try-with-resources closes the streams and the workbook on every path.
        try (FileInputStream in = new FileInputStream(file);
             XSSFWorkbook workbook = new XSSFWorkbook(in)) {

            XSSFSheet sheet = workbook.createSheet(sheetname);
            for (int col = 0; col < COLUMN_WIDTHS.length; col++) {
                sheet.setColumnWidth(col, COLUMN_WIDTHS[col] * 256);
            }

            Row header = sheet.createRow(0);
            for (int col = 0; col < HEADS.length; col++) {
                header.createCell(col).setCellValue(HEADS[col]);
            }

            CreationHelper createHelper = workbook.getCreationHelper();
            int rowNum = 1;
            for (BlogStar blogStar : rows) {
                Row row = sheet.createRow(rowNum++);
                row.createCell(0).setCellValue(blogStar.field);
                row.createCell(1).setCellValue(blogStar.name);
                row.createCell(2).setCellValue(blogStar.ranking);
                row.createCell(3).setCellValue(blogStar.score);
                writeLinkCell(row.createCell(4), createHelper, blogStar.scorePage);
                writeLinkCell(row.createCell(5), createHelper, blogStar.blogUrl);
                row.createCell(6).setCellValue(blogStar.createTime);
            }

            try (FileOutputStream out = new FileOutputStream(file)) {
                workbook.write(out);
            }
        } catch (IOException e) {
            // Covers FileNotFoundException as well (workbook missing or not writable).
            System.err.println("Failed to update workbook " + file);
            e.printStackTrace();
        }
    }

    /** Stores {@code address} in the cell both as its text and as a clickable URL hyperlink. */
    private static void writeLinkCell(Cell cell, CreationHelper createHelper, String address) {
        XSSFHyperlink link = (XSSFHyperlink) createHelper.createHyperlink(HyperlinkType.URL);
        link.setAddress(address);
        cell.setHyperlink(link);
        cell.setCellValue(address);
    }
}
采集样例
博主博客文章统计|样例程序
package simple.call.blogstar;

import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import simple.call.util.StringUtil;

import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;


public class BlogArticleStatistics {
    private static String outPutPath = "/home/wangyetao/IdeaProjects/_psimplemvn/src/main/java/simple/call/blogstar/";
    private static String filename = "u014132947";
    private static String sheetname = "article_" + filename;
    private static String suffix = ".xlsx";
    private static FileOutputStream outputStream;
    private static ArrayList
blogArticles; private static String url = "https://blog.csdn.net/u014132947";//博主url //测试用例 public static void main(String[] args) throws InterruptedException { //预先设置驱动 System.setProperty("webdriver.chrome.driver", "/usr/bin/chromedriver"); // Initialize your driver as you normally would: ChromeDriver driver = new ChromeDriver(); driver.get(url); // // //单条文章数据DOM结构 // //

// // data-report-click="{"spm":"1001.2014.3001.5190"}" target="_blank"> // 原创 // 获取世界人口排名2021 // // //

// 获取世界人口排名2021,Linux配置Selenium+Chrome+Java实现自动化测试 //

// //

// 2021-12-26 06:16:59 // 105 //

// // // // 编辑 // // // // blogArticles = new ArrayList
(); //稍等页面渲染完成 Thread.sleep(2000); //nextElement WebElement nextElement = driver.findElement(By.className("js-page-next")); int dataNum = Integer.valueOf(driver.findElement(By.id("container-header-blog")).getAttribute("data-num")); while (nextElement != null && blogArticles.size() < dataNum) { List search_results = driver.findElements(By.className("article-item-box")); for (int i = 0; i < search_results.size(); i++) { WebElement element = search_results.get(i); Article article = new Article(); //文章标题 article.title = element.findElement(By.tagName("a")).getText(); //简要内容 article.content = element.findElement(By.className("content")).getText(); //发布时间 article.publishTime = element.findElement(By.className("date")).getText(); //访问数 article.readNum = StringUtil.getInts(element.findElement(By.className("read-num")).getText())[0]; blogArticles.add(article); } nextElement.click(); //稍等页面渲染完成 Thread.sleep(3000); nextElement = driver.findElement(By.className("js-page-next")); } driver.close(); ArrayList heads = new ArrayList(); heads.add("文章标题"); heads.add("简要内容"); heads.add("发布时间"); heads.add("访问数"); //CSVUtils.createCSVFile(heads, blogArticles, outPutPath, filename); System.out.println("Creating excel"); try { XSSFWorkbook workbook = new XSSFWorkbook(); XSSFSheet sheet = workbook.createSheet(sheetname); //设置列宽 for (int i = 0; i < heads.size(); i++) { if (i == 3) { sheet.setColumnWidth(i, 6 * 256); } else { sheet.setColumnWidth(i, 15 * 256); } } Row row = null; Cell cell = null; //插入第一行数据的表头 //创建第一行 row = sheet.createRow(0); for (int i = 0; i < heads.size(); i++) { cell = row.createCell(i); cell.setCellValue(heads.get(i)); } int rowNum = 1; int colNum = 0; //组合表格:行、列 for (Article article : blogArticles) { row = sheet.createRow(rowNum++); cell = row.createCell(colNum++); cell.setCellValue(article.title); cell = row.createCell(colNum++); cell.setCellValue(article.content); cell = row.createCell(colNum++); cell.setCellValue(article.publishTime); cell = 
row.createCell(colNum++); cell.setCellValue(article.readNum); colNum = 0; } outputStream = new FileOutputStream(outPutPath + filename + suffix); //写入数据到Excel workbook.write(outputStream); //关闭流 outputStream.close(); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } System.out.println("Done"); } }

采集样例

作于 2021年12月27日 星期一 04:02:17 CST,归档于 2021年12月27日 星期一 20:48:42 CST。

转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/690933.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号