栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 软件开发 > 后端开发 > Java

springboot+jsoup抓取新闻网站信息

Java 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

springboot+jsoup抓取新闻网站信息

springboot+jsoup抓取新闻网站信息
            • 步骤:
            • 一、导入jar包
            • 二、解析凤凰网新闻
            • jsoup获取动态生成的js内容
            • service
            • serviceImpl
            • mapper
            • domain

步骤:

(1)根据url抓取html页面
(2)对html页面进行解析,获取该页面所有的数据,保存到数据库中(mongodb)
(3)遍历所有的数据,更新详情数据

一、导入jar包
 
<!-- pom.xml dependencies (XML tags restored; the page scrape stripped them) -->
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.2</version>
</dependency>
<!-- htmlunit: headless browser used to execute the page's JavaScript -->
<dependency>
    <groupId>net.sourceforge.htmlunit</groupId>
    <artifactId>htmlunit</artifactId>
    <version>2.33</version>
</dependency>
<dependency>
    <groupId>net.sf.json-lib</groupId>
    <artifactId>json-lib</artifactId>
    <version>2.4</version>
    <classifier>jdk15</classifier>
</dependency>
<dependency>
    <groupId>com.googlecode.juniversalchardet</groupId>
    <artifactId>juniversalchardet</artifactId>
    <version>1.0.3</version>
</dependency>
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.3</version>
</dependency>
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-data-mongodb</artifactId>
</dependency>
<dependency>
    <groupId>org.mongodb</groupId>
    <artifactId>mongo-java-driver</artifactId>
    <version>3.0.4</version>
</dependency>
二、解析凤凰网新闻
  
    /**
     * Crawls the ifeng news-stream page at {@code url}: extracts the JSON the
     * page embeds in a script tag, saves every list entry to MongoDB, then
     * visits each article URL to fill in publish time, source, content and
     * image URLs.
     *
     * @param url news list page whose content is rendered by JavaScript
     * @throws IOException if the headless-browser fetch of {@code url} fails
     */
    @GetMapping("/saveNewStream")
    public void saveNewStream(String url) throws IOException {
        log.info("程序开始...");
        long startTime = System.currentTimeMillis();
        // 1. Render the page with HtmlUnit so JS-generated content exists.
        //    (Jsoup's Document class is capitalized; the original's lowercase
        //    "document" was a scrape artifact.)
        Document document = HtmlUtils.getHtmlunit(url);
        // 2. The page embeds its data as "var allData = {...};" inside the
        //    third <script> tag — isolate that JSON blob.
        String html = document.getElementsByTag("script").get(2).html();
        html = html.replace("//", "");
        String[] data = html.split("var");
        String sp = "allData =";
        List<NewsStream> newsStreamArrayList = new ArrayList<>();
        for (String variable : data) {
            if (variable.contains(sp)) {
                // Strip the assignment prefix and the trailing ';'.
                variable = variable.replace(sp, "").trim();
                variable = variable.substring(0, variable.length() - 1);
                JSONObject jsonObject = JSONObject.parseObject(variable);
                // 3. "newsstream" holds the visible news list.
                JSONArray newsstream = jsonObject.getJSONArray("newsstream");
                List<NewsStream> newsStreams =
                        JSONArray.parseArray(newsstream.toString(), NewsStream.class);
                newsStreamArrayList.addAll(newsStreams);
                // 4. Recursively page through the "load more" endpoint.
                newsStreamArrayList.addAll(HtmlUtils.buildTree(newsStreams));
            }
        }
        // 5. Save once after the loop — the original saved the accumulating
        //    list inside the loop, re-writing earlier entries on every match.
        if (!newsStreamArrayList.isEmpty()) {
            newsStreamService.saveNewsStream(newsStreamArrayList);
        }
        // 6. Fetch every article page and update its detail fields.
        newsStreamArrayList.forEach(n -> {
            NewsStream newsStream = new NewsStream();
            newsStream.setId(n.getId());
            Document doc;
            try {
                String htmlInfo = Requests.get(n.getUrl());
                doc = Jsoup.parse(htmlInfo);
            } catch (MalformedURLException e) {
                // Skip this article rather than dereference a null document
                // (the original NPE'd on doc.select(...) after a failed fetch).
                log.error("详情页抓取失败: " + n.getUrl(), e);
                return;
            }
            // Title + content + images live under div#root .artical-25JfwmD5.
            Elements elements =
                    doc.select("div[id=root]").select("div[class=artical-25JfwmD5]");
            // Source + publish time sit in the info bar's spans.
            Elements span =
                    elements.select("div[class=info-3Ht6Fk1n clearfix]").select("span");
            // Publish time is the first <span>; guard against a missing bar.
            Element first = span.first();
            if (first != null) {
                newsStream.setNewsTime(first.text());
            }
            // Source is the linked span.
            newsStream.setSource(span.select("a[href]").text());
            // Content + images.
            Elements contentImg = elements
                    .select("div[class=main_content-r5RGqegj]")
                    .select("div[class=text-3w2e3DBc]");
            newsStream.setContent(contentImg.text());
            // Collect every src attribute found under the content paragraphs.
            List<String> images = new ArrayList<>();
            for (Element element : contentImg.select("p").select("[src]")) {
                images.add(element.attr("src"));
            }
            newsStream.setImages(images);
            // Update the existing MongoDB record by id.
            newsStreamService.updateNewsStream(newsStream);
        });
        long endTime = System.currentTimeMillis();
        log.info("********本程序运行 " + (endTime - startTime) + " 毫秒完成***********");
    }
jsoup获取动态生成的js内容
@Slf4j
public class HtmlUtils {

    
    /**
     * Loads {@code url} in a headless HtmlUnit browser (JavaScript and AJAX
     * enabled), waits for background JS to finish, and returns the rendered
     * DOM parsed by Jsoup.
     *
     * @param url page to render
     * @return the rendered page as a Jsoup {@code Document}
     * @throws IOException if the page cannot be fetched
     */
    public static Document getHtmlunit(String url) throws IOException {
        // Browser client simulating Chrome.
        final WebClient webClient = new WebClient(BrowserVersion.CHROME);
        try {
            // Don't fail on JS errors or non-200 responses — only the DOM matters.
            // (HtmlUnit API casing: setThrowExceptionOnScriptError / setJavaScriptEnabled;
            // the original's lowercase variants were scrape artifacts.)
            webClient.getOptions().setThrowExceptionOnScriptError(false);
            webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
            webClient.getOptions().setActiveXNative(false);
            // No rendering, so CSS is unnecessary.
            webClient.getOptions().setCssEnabled(false);
            // JS must run — the news list is generated client-side.
            webClient.getOptions().setJavaScriptEnabled(true);
            // Resynchronize AJAX calls so their results end up in the DOM.
            webClient.setAjaxController(new NicelyResynchronizingAjaxController());
            HtmlPage page = webClient.getPage(url);
            // Block up to 30s for background JS. The original waited AFTER
            // closing the client in finally (and would also NPE on a null
            // page after a swallowed exception) — wait while still open.
            webClient.waitForBackgroundJavascript(30000);
            // Serialize the rendered page and hand it to Jsoup.
            String pageXml = page.asXml();
            return Jsoup.parse(pageXml);
        } finally {
            // Always release the simulated browser.
            webClient.close();
        }
    }

    // "Load more" endpoint for a column's paged news list.
    private static final String getViewUrl =
            "http://shankapi.ifeng.com/shanklist/_/getColumnInfo/_/default/";
    // JSONP callback name the endpoint wraps its payload in.
    private static final String callback = "getColumnInfoCallback";

    /**
     * Builds the "load more" request URL for a column and fetches it.
     *
     * @param id      column id taken from the last news item
     * @param nesTime publish-time timestamp of the last news item
     * @param nowTime current time in millis (cache-buster "_" parameter)
     * @return raw JSONP response body
     * @throws MalformedURLException if the built URL is invalid
     */
    public static String getViewMore(String id, String nesTime, Long nowTime) throws MalformedURLException {
        // StringBuilder: single-threaded use, no need for StringBuffer's locking.
        StringBuilder viewUrl = new StringBuilder();
        viewUrl.append(getViewUrl).append(id).append("/").append(nesTime)
                .append("/20/14-35083-/").append(callback)
                .append("?callback=").append(callback)
                .append("&_").append(nowTime);
        // NOTE(review): the "_" parameter has no '=' before its value — it
        // likely should be "&_=" + nowTime; confirm against the live API
        // before changing, since the current form may still be accepted.
        return Requests.get(viewUrl.toString());
    }

    
    /**
     * Recursively follows the "load more" endpoint starting from the last
     * entry of {@code list}, collecting every additional news item until the
     * endpoint returns an empty page.
     *
     * @param list current page of news items (only its last element drives
     *             the next request)
     * @return all news items from the following pages; empty when paging ends
     */
    public static List<NewsStream> buildTree(List<NewsStream> list) {
        List<NewsStream> newsStreamList = new ArrayList<>();
        if (list == null || list.isEmpty()) {
            // Recursion base case: no entries, no next page.
            return newsStreamList;
        }
        // Use the last element directly. The original scanned the whole list
        // with equals() to find it, which could also fire twice on duplicates.
        NewsStream last = list.get(list.size() - 1);
        // The next-page request needs the last item's publish time as a timestamp.
        String nesTime = DateUtils.date2TimeStamp(last.getNewsTime(), "yyyy-MM-dd HH:mm:ss");
        String viewMore;
        try {
            viewMore = HtmlUtils.getViewMore(last.getId(), nesTime, new Date().getTime());
            // Strip the JSONP wrapper: getColumnInfoCallback( ... )
            viewMore = viewMore.replace("getColumnInfoCallback(", "").trim();
            viewMore = viewMore.substring(0, viewMore.length() - 1);
        } catch (MalformedURLException e) {
            // Stop paging here — the original fell through and NPE'd when
            // parsing the still-null response below.
            log.error("加载下一页失败", e);
            return newsStreamList;
        }
        // Unwrap response -> data -> newsstream.
        JSONObject view = JSONObject.parseObject(viewMore);
        JSONObject data = JSONObject.parseObject(view.getString("data"));
        JSONArray jsonArrayNewsstreamList = JSONArray.parseArray(data.getString("newsstream"));
        List<NewsStream> newsStreams =
                JSONArray.parseArray(jsonArrayNewsstreamList.toString(), NewsStream.class);
        newsStreamList.addAll(newsStreams);
        // Recurse for the page after this one.
        newsStreamList.addAll(buildTree(newsStreams));
        return newsStreamList;
    }

service
package com.ddtj.crawl.service;
import com.ddtj.crawl.domain.NewsStream;
import org.springframework.data.mongodb.core.query.Query;
import java.util.List;
public interface NewsStreamService {

    /**
     * Batch-persists news list entries into MongoDB (upsert by id).
     *
     * @param newsStream entries to save
     * @return 0 in the current implementation (callers ignore the value)
     */
    int saveNewsStream(List<NewsStream> newsStream);

    /**
     * Updates the detail fields (source, content, images) of one entry,
     * matched by its id.
     *
     * @param newsStream carries the id plus the new detail values
     */
    void updateNewsStream(NewsStream newsStream);

    /**
     * Queries entries with optional case-insensitive fuzzy filters on
     * title/source/url, newest first. Declared here because the
     * implementation marks it {@code @Override}; without this declaration
     * the impl would not compile.
     *
     * @param newsStream filter holder — null fields are ignored
     * @param query      base query to extend
     * @return matching entries sorted by newsTime descending
     */
    List<NewsStream> getNewsStreamList(NewsStream newsStream, Query query);

}
serviceImpl
package com.ddtj.crawl.service.impl;
import com.ddtj.crawl.domain.NewsStream;
import com.ddtj.crawl.service.NewsStreamService;
import com.ddtj.crawl.utils.MongoUtil;
import com.ddtj.crawl.utils.PageHelper;
import com.mongodb.client.result.UpdateResult;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.*;
import org.springframework.data.mongodb.core.MongoTemplate;
import org.springframework.data.mongodb.core.query.Criteria;
import org.springframework.data.mongodb.core.query.Query;
import org.springframework.data.mongodb.core.query.Update;
import org.springframework.stereotype.Service;
import javax.annotation.Resource;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
@Service
@Slf4j
public class NewsStreamServiceImpl implements NewsStreamService {

    @Autowired
    MongoTemplate mongoTemplate;

    /**
     * Upserts every entry via {@link MongoTemplate#save(Object)}: save()
     * inserts when the id is new and updates when it already exists,
     * unlike insert(), which throws on a duplicate id.
     *
     * @param newsStream entries to persist
     * @return always 0 (callers ignore the value)
     */
    @Override
    public int saveNewsStream(List newsStream) {
        // Parameterized logging: no string concatenation when INFO is off.
        log.info("mongodb 数据库插入: {}条数据.........", newsStream.size());
        newsStream.forEach(mongoTemplate::save);
        return 0;
    }

    /**
     * Updates only the detail fields of the record matching the entry's id;
     * other fields written by the list crawl are left intact.
     */
    @Override
    public void updateNewsStream(NewsStream newsStream) {
        Query query = new Query(Criteria.where("id").is(newsStream.getId()));
        Update update = new Update()
                .set("source", newsStream.getSource())
                .set("content", newsStream.getContent())
                .set("images", newsStream.getImages());
        // Return value intentionally ignored — the original bound it to an
        // unused local.
        mongoTemplate.updateFirst(query, update, NewsStream.class);
    }

    /**
     * Fuzzy-searches by title/source/url (each filter applied only when the
     * corresponding field is non-null), sorted by newsTime descending.
     */
    @Override
    public List getNewsStreamList(NewsStream newsStream, Query query) {
        addContainsFilter(query, "title", newsStream.getTitle());
        addContainsFilter(query, "source", newsStream.getSource());
        addContainsFilter(query, "url", newsStream.getUrl());
        // Newest first.
        query.with(Sort.by(Sort.Order.desc("newsTime")));
        return mongoTemplate.find(query, NewsStream.class);
    }

    /**
     * Adds a case-insensitive "contains" regex criterion on {@code field}
     * when {@code value} is non-null.
     * NOTE(review): the value is interpolated into the regex unescaped, so
     * regex metacharacters in user input alter the match — consider
     * Pattern.quote(value) if that is not intended.
     */
    private static void addContainsFilter(Query query, String field, String value) {
        if (null != value) {
            Pattern pattern = Pattern.compile("^.*" + value + ".*$", Pattern.CASE_INSENSITIVE);
            query.addCriteria(Criteria.where(field).regex(pattern));
        }
    }

}
mapper
/**
 * Spring Data repository for {@link NewsStream}. Parameterized with the
 * document type and its String id — the original extended the raw
 * MongoRepository, losing type safety on every inherited method.
 */
public interface NewsStreamMapper extends MongoRepository<NewsStream, String> {

}
domain
package com.ddtj.crawl.domain;

import com.ddtj.common.core.annotation.Excel;
import com.ddtj.common.core.web.domain.baseEntity;
// NOTE(review): lombok.Builder is almost certainly intended here — groovy's
// @Builder does not cooperate with lombok's @Data/@AllArgsConstructor; confirm.
import groovy.transform.builder.Builder;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
// Correct casing for the Spring Data annotation (the scrape lowercased it).
import org.springframework.data.mongodb.core.mapping.Document;

import java.util.List;
/**
 * MongoDB document for one crawled news item. List-level fields (id, title,
 * newsTime, thumbnails, url) are filled from the news stream JSON; detail
 * fields (source, content, images) are filled by the per-article crawl.
 */
@Document(collection = "news_stream") // collection name; annotation is capitalized "Document"
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class NewsStream extends baseEntity {

    // News id — also used as the match key for MongoDB upserts/updates.
    @Excel(name = "新闻ID")
    private String id;

    // Headline.
    @Excel(name = "标题")
    private String title;

    // Publish time, kept as the site's "yyyy-MM-dd HH:mm:ss" string.
    @Excel(name = "新闻时间")
    private String newsTime;

    // Publisher/source name scraped from the article page.
    @Excel(name = "来源")
    private String source;

    // Article body text.
    @Excel(name = "详情内容")
    private String content;

    // Thumbnail from the list view.
    @Excel(name = "缩略图")
    private String thumbnails;

    // Image URLs extracted from the article body (src attributes).
    @Excel(name = "文章详情图片")
    private List<String> images;

    // Article detail-page URL.
    @Excel(name = "详情URL")
    private String url;


}
转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/271755.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号