集成jsoup
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.2</version>
</dependency>
解析html页面
/**
 * Scrapes product entries from the JD search result page
 * ({@code https://search.jd.com/Search?keyword=...}) using jsoup.
 */
@Component
public class HtmlParseUtil {

    /** Search URL prefix; the URL-encoded keyword is appended as the query value. */
    private static final String SEARCH_URL = "https://search.jd.com/Search?keyword=";

    /** Browser-like user agent sent with the request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 5.1; zh-CN) AppleWebKit/535.12 (KHTML, like Gecko) Chrome/22.0.1229.79 Safari/535.12";

    /** Connection/read timeout passed to jsoup, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 30000;

    /**
     * Fetches the search page for the given keyword and extracts one {@link Content}
     * per product entry found under the element with id {@code J_goodsList}.
     *
     * @param keywords search keyword; must not be {@code null}
     * @return the parsed products; empty when the page has no {@code J_goodsList} element
     * @throws Exception if the page cannot be fetched or the keyword cannot be encoded
     */
    public List<Content> parseJd(String keywords) throws Exception {
        // Encode the keyword so spaces, '&', '#', and non-ASCII text cannot corrupt the query string.
        String url = SEARCH_URL + java.net.URLEncoder.encode(keywords, "UTF-8");

        // Fetch and parse the page into a DOM document.
        Document document = Jsoup.connect(url)
                .userAgent(USER_AGENT)
                .timeout(TIMEOUT_MILLIS)
                .get();

        List<Content> contentList = new ArrayList<>();

        // Locate the product list container; it is absent when the page layout differs
        // from what this parser expects.
        Element goodsList = document.getElementById("J_goodsList");
        if (goodsList == null) {
            return contentList;
        }

        // getElementsByTag matches every descendant <li>, so entries without a price
        // block are skipped instead of failing the whole parse.
        for (Element item : goodsList.getElementsByTag("li")) {
            Elements priceNodes = item.getElementsByClass("p-price");
            if (priceNodes.isEmpty()) {
                continue;
            }

            // Image URL is read from the lazy-load attribute.
            String img = item.getElementsByTag("img").attr("data-lazy-img");
            String price = extractPrice(priceNodes.eq(0).text());
            String title = item.getElementsByClass("p-name").eq(0).text();

            Content content = new Content();
            content.setTitle(title);
            content.setImg(img);
            content.setPrice(price);
            contentList.add(content);
        }
        return contentList;
    }

    /**
     * Returns the text following the first "¥" sign (up to the next one, if any).
     * When there is no amount after a "¥" sign, the text is returned with the sign
     * removed and surrounding whitespace trimmed.
     */
    private static String extractPrice(String priceText) {
        String[] parts = priceText.split("¥");
        if (parts.length > 1) {
            return parts[1];
        }
        return priceText.replace("¥", "").trim();
    }
}
实体类
/**
 * Data holder for one scraped product entry: its title, image URL, and price text.
 * Accessors and constructors are supplied by the Lombok annotations below.
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
public class Content {
// Product title text.
private String title;
// Product image URL.
private String img;
// Price kept as text, not as a numeric type.
private String price;
}
controller层
@RestController
public class ContentController {
@Autowired
private ContentService contentService;
@GetMapping("/parse/{keyword}")
public Boolean parse(@PathVariable("keyword") String keyword) throws Exception {
return contentService.parseContent(keyword);
}
@GetMapping("/parse/{keyword}/{pageNo}/{pageSize}")
public List
业务层
@Service
public class ContentService {
@Autowired
private RestHighLevelClient restHighLevelClient;
public boolean parseContent(String keywords) throws Exception {
List contentList = new HtmlParseUtil().parseJd(keywords);
BulkRequest bulkRequest = new BulkRequest();
bulkRequest.timeout("2m");
for (int i=0;i < contentList.size(); i++){
bulkRequest.add(
new IndexRequest("jd_goods")
.source(JSON.toJSONString(contentList.get(i)), XContentType.JSON));
}
BulkResponse bulkResponse = restHighLevelClient.bulk(bulkRequest, RequestOptions.DEFAULT);
return bulkResponse.hasFailures();
}
// 获取数据实现查询
public List> searchContent(String keyword,int pageNo,int pageSize) throws IOException {
if (pageNo < 1) {
pageNo = 1;
}
//条件搜索
SearchRequest searchRequest = new SearchRequest("jd_goods");
SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
//分页
searchSourceBuilder.from(pageNo);
searchSourceBuilder.size(pageSize);
//放入条件
TermQueryBuilder termQueryBuilder = QueryBuilders.termQuery("title",keyword);
searchSourceBuilder.query(termQueryBuilder);
searchSourceBuilder.timeout(new Timevalue(60,TimeUnit.SECONDS));
// 执行搜索
searchRequest.source(searchSourceBuilder);
SearchResponse searchResponse = restHighLevelClient.search(searchRequest,RequestOptions.DEFAULT);
//解析结果
List> list = new ArrayList<>();
for (SearchHit searchHit : searchResponse.getHits()) {
list.add(searchHit.getSourceAsMap());
}
return list;
}
}
最终实现结果: