下面的工具类是解决问题的关键,以爬虫爬取某网站数据为例
工具类:
package nuc.zy.edu.utils;
import javax.net.ssl.*;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
public class SslUtils {

    /**
     * Installs a default SSLSocketFactory that accepts every certificate chain.
     *
     * <p>SECURITY WARNING: this disables all certificate validation for every
     * subsequent HttpsURLConnection in the JVM and makes connections vulnerable
     * to man-in-the-middle attacks. Use only for throwaway crawling/testing,
     * never in production code.
     *
     * @throws Exception if the SSL context cannot be created or initialised
     */
    public static void trustAllHttpsCertificates() throws Exception {
        TrustManager[] trustAllCerts = new TrustManager[1];
        trustAllCerts[0] = new miTM();
        // Use "TLS" rather than the obsolete "SSL" protocol name; SSLv3 is
        // insecure and disabled on modern JVMs.
        SSLContext sc = SSLContext.getInstance("TLS");
        sc.init(null, trustAllCerts, null);
        HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());
    }

    /** Trust manager that accepts every client and server certificate. */
    static class miTM implements TrustManager, X509TrustManager {
        public X509Certificate[] getAcceptedIssuers() {
            // The X509TrustManager contract forbids returning null here;
            // an empty array means "no trusted issuers advertised".
            return new X509Certificate[0];
        }
        // Legacy pre-JSSE method names, kept for compatibility with old callers.
        public boolean isServerTrusted(X509Certificate[] certs) {
            return true;
        }
        public boolean isClientTrusted(X509Certificate[] certs) {
            return true;
        }
        public void checkServerTrusted(X509Certificate[] certs, String authType)
                throws CertificateException {
            // Intentionally empty: trust everything.
        }
        public void checkClientTrusted(X509Certificate[] certs, String authType)
                throws CertificateException {
            // Intentionally empty: trust everything.
        }
    }

    /**
     * Disables both certificate validation and hostname verification for all
     * default HttpsURLConnection instances in this JVM.
     *
     * @throws Exception if the underlying SSL context cannot be initialised
     */
    public static void ignoreSsl() throws Exception {
        // Hostname verifier that accepts any host name.
        HostnameVerifier hv = new HostnameVerifier() {
            public boolean verify(String urlHostName, SSLSession session) {
                return true;
            }
        };
        trustAllHttpsCertificates();
        HttpsURLConnection.setDefaultHostnameVerifier(hv);
    }
}
案例
此处以使用 jsoup 爬取某个以 https 开头的网站(即启用了 SSL 证书的网站)为例。
采用ES技术将爬取的数据存放在ES中
在爬取之前先调用工具类 SslUtils.ignoreSsl(),忽略 SSL 证书校验。
代码(pom.xml,XML 标签在提取时丢失,以下按可见的坐标与版本号重建):
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.3.5.RELEASE</version>
        <relativePath/>
    </parent>
    <groupId>nuc.zy.edu</groupId>
    <artifactId>es-jd</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>es-jd</name>
    <description>Demo project for Spring Boot</description>
    <properties>
        <java.version>1.8</java.version>
        <elasticsearch.version>7.8.0</elasticsearch.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.2</version>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-elasticsearch</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-thymeleaf</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.62</version>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-devtools</artifactId>
            <scope>runtime</scope>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-configuration-processor</artifactId>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
                <configuration>
                    <excludes>
                        <exclude>
                            <groupId>org.projectlombok</groupId>
                            <artifactId>lombok</artifactId>
                        </exclude>
                    </excludes>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
package nuc.zy.edu.utils;
import nuc.zy.edu.entity.Goods;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
public class HtmlParseUtil {

    /**
     * Crawls the JD search-result page for the given keywords and parses each
     * product item into a {@link Goods} object.
     *
     * <p>Requires network access; content rendered by Ajax after page load is
     * NOT visible to jsoup and therefore not parsed.
     *
     * @param keywords search terms to put in the JD search URL
     * @return list of parsed goods; empty if the result container is missing
     * @throws Exception on network failure or SSL-context initialisation error
     */
    public static List<Goods> parseJD(String keywords) throws Exception {
        // Disable certificate validation so the https page can be fetched
        // without a configured trust store (crawler/demo use only).
        SslUtils.ignoreSsl();
        // URL-encode the keywords so spaces and non-ASCII characters survive.
        String url = "https://search.jd.com/Search?keyword="
                + URLEncoder.encode(keywords, "UTF-8");
        // Parse the page (30s timeout). Document mirrors the browser's DOM;
        // the jsoup class is 'Document' (capital D).
        Document document = Jsoup.parse(new URL(url), 30000);
        // All the usual DOM-style lookups are available on Document.
        Element element = document.getElementById("J_goodsList");
        List<Goods> goods = new ArrayList<>();
        if (element == null) {
            // Page layout changed or the request was blocked — nothing to parse.
            return goods;
        }
        Elements elements = element.getElementsByTag("li");
        for (Element e : elements) {
            // JD lazy-loads images: the real URL is in the data-lazy-img attr.
            String img = e.getElementsByTag("img").eq(0).attr("data-lazy-img");
            String price = e.getElementsByClass("p-price").eq(0).text();
            String name = e.getElementsByClass("p-name").eq(0).text();
            Goods good = new Goods();
            good.setImg(img);
            good.setName(name);
            good.setPrice(price);
            goods.add(good);
        }
        return goods;
    }
}
package nuc.zy.edu.entity;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.springframework.data.annotation.Id;
import org.springframework.data.elasticsearch.annotations.Document;
import org.springframework.data.elasticsearch.annotations.Field;
import org.springframework.data.elasticsearch.annotations.FieldType;
/**
 * Elasticsearch document representing one crawled JD product.
 * Lombok generates getters/setters and both constructors.
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
// "goods" index: 3 primary shards, 1 replica.
// The Spring Data annotation is 'Document' (capital D); the lowercase form
// does not exist and would not compile.
@Document(indexName = "goods", shards = 3, replicas = 1)
public class Goods {
    @Id
    private String id;
    // Keyword: stored un-analysed so the product name matches exactly.
    @Field(type = FieldType.Keyword)
    private String name;
    @Field(type = FieldType.Text)
    private String img;
    // Kept as text because the crawled price includes a currency symbol;
    // switch to a numeric field if range queries are needed.
    @Field(type = FieldType.Text)
    private String price;
    // private Double price ;
}



