目录
家园(Homestead)
请求配置封装
请求工具封装
响应封装
前两篇已经将基本环境和基础工具维护好了,本章开始要来点干货了
家园(Homestead)
爬虫的本质还是“请求”,我想在座的应该都同意,但是网络常见请求无非就那几种,Socket请求、Http请求、FTP请求……,我们要做web页面的爬虫,那肯定是需要一个Http请求工具了。
最前面我们提到Apache的HttpClient工具,其实网络上这个工具的使用教程应该是最多的,JDK实际上一直也有相关的Http请求工具,但是一直都被诟病,直到……直到JDK11的出现,也预示着JDK的Http请求工具(HttpClient)基本接近完善(方便好用),所以才有了我们这一篇文章(要不然我也用Apache了……)。
要实现JDK的HttpClient封装,我们得明确封装目标:
1.请求配置封装
2.请求工具封装
3.响应封装
其实在诸多Http请求工具中,无非就这三点封装罢了,所以这里也不装了,直接上代码。
请求配置封装
封装请求配置的目的是尽可能降低使用时的复杂度,因此我们要明确哪些是必要参数、哪些是非必要参数、哪些参数需要有默认值。有时为了满足多种情况,会出现重载、重写的情况;甚至有些功能我们可能暂时用不到,但是以后可能用得到。所以要封装就不要怕麻烦,封装得越仔细越好。为了统一配置的初始化,我们的HttpConfig就出现了;为了更进一步简化工具的初始化,HttpConfig采用构造器模式来构建Http请求工具。
HttpConfig.java
package com.vtarj.pythagoras.explore;
import java.net.*;
import java.net.http.HttpClient;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.StringJoiner;
import java.util.concurrent.Executor;
import java.util.concurrent.Executors;
import javax.net.ssl.SSLContext;
/**
 * Fluent, mutable configuration for a single HTTP request issued through the JDK
 * {@link HttpClient}.
 *
 * <p>It carries client-level settings (protocol version, redirect policy, connect timeout,
 * executor, authenticator, proxy, cookie handler, SSL context, priority) together with
 * request-level settings (headers, parameters, method, URI, request/response charsets).
 * Every setter returns {@code this} so calls can be chained; {@link #build()} hands the
 * finished configuration to a new {@link HttpExplore}.
 *
 * <p>Instances are mutable and perform no synchronization.
 */
public class HttpConfig {

    /** Value of the {@code Content-Type} header installed by the constructor. */
    private static final String DEFAULT_CONTENT_TYPE = "application/json";

    /** Request methods accepted by {@link #setRequestMethod(String)}. */
    private static final Set<String> SUPPORTED_METHODS = Set.of("GET", "POST", "PUT", "DELETE");

    private HttpClient.Version version;
    private HttpClient.Redirect redirect;
    private Duration connectTimeout;
    private Executor executor;
    private Authenticator authenticator;
    private ProxySelector proxySelector;
    private CookieHandler cookieHandler;
    private SSLContext sslContext;
    private int priority;
    private final Map<String, String> headerMap = new HashMap<>();
    private Map<String, Object> requestParams;
    private String requestMethod;
    /** URI exactly as supplied through {@link #setRequestURI(String)}, before any query is appended. */
    private URI baseURI;
    /** Effective URI: equal to {@link #baseURI} until {@link #build()} appends GET parameters. */
    private URI requestURI;
    private Charset reqCode = StandardCharsets.UTF_8;
    private Charset resCode = StandardCharsets.UTF_8;
    private boolean locked = true;

    /**
     * Creates a configuration with the defaults: HTTP/2, {@code NORMAL} redirects, a two-minute
     * connect timeout, JSON content type, a desktop browser user agent, method {@code GET} and
     * priority 1.
     */
    public HttpConfig() {
        version = HttpClient.Version.HTTP_2;
        redirect = HttpClient.Redirect.NORMAL;
        connectTimeout = Duration.ofMinutes(2);
        // Written straight into the map so the constructor never calls an overridable method.
        headerMap.put("Content-Type", DEFAULT_CONTENT_TYPE);
        headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36 Edg/99.0.1150.55");
        requestMethod = "GET";
        priority = 1;
    }

    public HttpClient.Version getVersion() {
        return version;
    }

    public HttpConfig setVersion(HttpClient.Version version) {
        this.version = version;
        return this;
    }

    public HttpClient.Redirect getRedirect() {
        return redirect;
    }

    public HttpConfig setRedirect(HttpClient.Redirect redirect) {
        this.redirect = redirect;
        return this;
    }

    public Duration getConnectTimeout() {
        return connectTimeout;
    }

    public HttpConfig setConnectTimeout(Duration connectTimeout) {
        this.connectTimeout = connectTimeout;
        return this;
    }

    /**
     * Returns the executor, lazily creating a fixed pool of five threads the first time it is
     * requested without one having been set.
     *
     * @return the configured or lazily created executor, never {@code null}
     */
    public Executor getExecutor() {
        if (executor == null) {
            this.executor = Executors.newFixedThreadPool(5);
        }
        return executor;
    }

    public HttpConfig setExecutor(Executor executor) {
        this.executor = executor;
        return this;
    }

    public Authenticator getAuthenticator() {
        return authenticator;
    }

    public HttpConfig setAuthenticator(Authenticator authenticator) {
        this.authenticator = authenticator;
        return this;
    }

    public ProxySelector getProxySelector() {
        return proxySelector;
    }

    public HttpConfig setProxySelector(ProxySelector proxySelector) {
        this.proxySelector = proxySelector;
        return this;
    }

    public CookieHandler getCookieHandler() {
        return cookieHandler;
    }

    public HttpConfig setCookieHandler(CookieHandler cookieHandler) {
        this.cookieHandler = cookieHandler;
        return this;
    }

    public SSLContext getSslContext() {
        return sslContext;
    }

    public HttpConfig setSslContext(SSLContext sslContext) {
        this.sslContext = sslContext;
        return this;
    }

    public int getPriority() {
        return priority;
    }

    /**
     * Sets the request priority.
     *
     * @param priority a value in the range 1 to 256 inclusive
     * @return this configuration
     * @throws IllegalArgumentException if {@code priority} is outside that range
     */
    public HttpConfig setPriority(int priority) {
        if (priority < 1 || priority > 256) {
            throw new IllegalArgumentException("您输入的优先级不合法,优先级范围[1~256]");
        }
        this.priority = priority;
        return this;
    }

    /**
     * Returns the live header map; changes made to it are seen by this configuration.
     *
     * @return the header map, never {@code null}
     */
    public Map<String, String> getHeaderMap() {
        return headerMap;
    }

    /**
     * Sets (or overwrites) a single request header.
     *
     * @param key header name
     * @param value header value
     * @return this configuration
     */
    public HttpConfig setHeader(String key, String value) {
        headerMap.put(key, value);
        return this;
    }

    /**
     * Merges the given headers into the existing ones; entries with the same name are overwritten.
     *
     * @param headerMap headers to add
     * @return this configuration
     * @throws NullPointerException if {@code headerMap} is {@code null}
     */
    public HttpConfig setHeaderMap(Map<String, String> headerMap) {
        this.headerMap.putAll(Objects.requireNonNull(headerMap, "headerMap"));
        return this;
    }

    /**
     * Returns the request parameters.
     *
     * @return the parameter map, or {@code null} when no parameter has been set
     */
    public Map<String, Object> getRequestParams() {
        return requestParams;
    }

    /**
     * Adds (or overwrites) a single request parameter.
     *
     * @param key parameter name
     * @param value parameter value
     * @return this configuration
     */
    public HttpConfig setRequestParam(String key, String value) {
        if (requestParams == null) {
            requestParams = new LinkedHashMap<>();
        }
        requestParams.put(key, value);
        return this;
    }

    /**
     * Replaces all request parameters with a copy of the given map.
     *
     * <p>The map is copied so that later {@link #setRequestParam(String, String)} calls neither
     * modify the caller's map nor fail when the caller passed an immutable one.
     *
     * @param requestParams the new parameters, or {@code null} to clear them
     * @return this configuration
     */
    public HttpConfig setRequestParams(Map<String, ?> requestParams) {
        this.requestParams = requestParams == null ? null : new LinkedHashMap<>(requestParams);
        return this;
    }

    public String getRequestMethod() {
        return requestMethod;
    }

    /**
     * Sets the HTTP method; matching is case-insensitive and the value is stored upper-cased.
     *
     * @param requestMethod one of GET, POST, PUT or DELETE
     * @return this configuration
     * @throws NullPointerException if {@code requestMethod} is {@code null}
     * @throws IllegalArgumentException if the method is not one of the supported four
     */
    public HttpConfig setRequestMethod(String requestMethod) {
        Objects.requireNonNull(requestMethod, "requestMethod");
        // Locale.ROOT keeps the conversion independent of the JVM's default locale.
        String normalized = requestMethod.toUpperCase(Locale.ROOT);
        if (!SUPPORTED_METHODS.contains(normalized)) {
            throw new IllegalArgumentException("请求方法设置错误,不符合规范要求");
        }
        this.requestMethod = normalized;
        return this;
    }

    /**
     * Returns the effective request URI. After {@link #build()} on a GET request with parameters
     * this includes the generated query string.
     *
     * @return the request URI, or {@code null} if none has been set
     */
    public URI getRequestURI() {
        return requestURI;
    }

    /**
     * Sets the target address. {@code http://} is prepended when the value starts with neither
     * {@code http://} nor {@code https://} (case-insensitive).
     *
     * @param requestURI the target address
     * @return this configuration
     * @throws NullPointerException if {@code requestURI} is {@code null}
     * @throws IllegalArgumentException if the resulting string is not a valid URI
     */
    public HttpConfig setRequestURI(String requestURI) {
        URI parsed = formatURI(requestURI);
        this.baseURI = parsed;
        this.requestURI = parsed;
        return this;
    }

    /** Parses {@code uri}, defaulting the scheme to {@code http://} when none is present. */
    private static URI formatURI(String uri) {
        Objects.requireNonNull(uri, "requestURI");
        String lower = uri.toLowerCase(Locale.ROOT);
        boolean hasScheme = lower.startsWith("http://") || lower.startsWith("https://");
        try {
            return new URI(hasScheme ? uri : "http://" + uri);
        } catch (URISyntaxException e) {
            throw new IllegalArgumentException(e);
        }
    }

    public Charset getReqCode() {
        return reqCode;
    }

    public HttpConfig setReqCode(Charset reqCode) {
        this.reqCode = reqCode;
        return this;
    }

    public Charset getResCode() {
        return resCode;
    }

    public HttpConfig setResCode(Charset resCode) {
        this.resCode = resCode;
        return this;
    }

    /**
     * Returns the lock flag, which {@link HttpExplore} consults when deciding whether to keep
     * its existing shared {@link HttpClient}.
     *
     * @return {@code true} (the default) when the flag is set
     */
    public boolean isLocked() {
        return locked;
    }

    public HttpConfig setLocked(boolean locked) {
        this.locked = locked;
        return this;
    }

    /**
     * Finalizes the configuration and creates the request tool.
     *
     * <p>For GET requests with parameters, the encoded parameters are appended to the URI given
     * to {@link #setRequestURI(String)} as a query string. The effective URI is always derived
     * from that original URI, so calling {@code build()} repeatedly does not append the query
     * more than once.
     *
     * @return a new {@link HttpExplore} bound to this configuration
     * @throws IllegalStateException if GET parameters are present but no request URI was set
     */
    public HttpExplore build() {
        String query = paramsToString();
        if (query != null && "GET".equals(requestMethod)) {
            if (baseURI == null) {
                throw new IllegalStateException("requestURI must be set before build()");
            }
            String base = baseURI.toString();
            String separator = base.indexOf('?') >= 0 ? "&" : "?";
            requestURI = formatURI(base + separator + query);
        } else {
            // Drop a query appended by an earlier build() so stale parameters do not linger.
            requestURI = baseURI;
        }
        return new HttpExplore(this);
    }

    /**
     * Serializes the request parameters as {@code key=value} pairs joined by {@code &}.
     * Names and values are URL-encoded with the request charset (UTF-8 if that is {@code null});
     * a {@code null} value is written as an empty string.
     *
     * @return the encoded parameter string, or {@code null} when there are no parameters
     */
    protected String paramsToString() {
        if (requestParams == null || requestParams.isEmpty()) {
            return null;
        }
        Charset charset = reqCode != null ? reqCode : StandardCharsets.UTF_8;
        StringJoiner joiner = new StringJoiner("&");
        requestParams.forEach((key, value) -> joiner.add(
                URLEncoder.encode(String.valueOf(key), charset)
                        + "="
                        + URLEncoder.encode(value == null ? "" : value.toString(), charset)));
        return joiner.toString();
    }
}
请求工具封装
配置有了,现在就开始封装工具,工具我们就封装几个常用的方案即可,如:GET请求、POST请求、PUT请求、DELETE请求等,请求结果我们以字符串形式获取和以文件形式获取两种即可(以后如果有需要,请自行添加)
HttpExplore.java
package com.vtarj.pythagoras.explore;
import java.io.File;
import java.io.IOException;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.file.Path;
import java.time.Instant;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;
/**
 * Thin request tool on top of the JDK {@link HttpClient}, driven by an {@link HttpConfig}.
 *
 * <p>One {@link HttpClient} is shared by all instances through a static field. It is created by
 * the first instance, and rebuilt whenever an instance is constructed with a configuration whose
 * {@link HttpConfig#isLocked()} returns {@code false}.
 */
public class HttpExplore {

    /** Shared client; volatile so the unsynchronized null check sees a fully built instance. */
    private static volatile HttpClient client;

    private final HttpConfig config;

    /**
     * Binds this tool to {@code config} and makes sure the shared client exists.
     *
     * @param config the configuration to use for client creation and for every request
     */
    public HttpExplore(HttpConfig config) {
        this.config = config;
        if (client == null || !config.isLocked()) {
            synchronized (HttpExplore.class) {
                // Re-check under the lock: another thread may have created the client meanwhile.
                if (client == null || !config.isLocked()) {
                    client = newClient(config);
                }
            }
        }
    }

    /** Builds an {@link HttpClient} from the client-level settings of {@code config}. */
    private static HttpClient newClient(HttpConfig config) {
        HttpClient.Builder builder = HttpClient.newBuilder()
                .version(config.getVersion())
                .followRedirects(config.getRedirect())
                .connectTimeout(config.getConnectTimeout())
                .priority(config.getPriority());
        // Optional settings are applied only when present.
        Optional.ofNullable(config.getExecutor()).ifPresent(builder::executor);
        Optional.ofNullable(config.getAuthenticator()).ifPresent(builder::authenticator);
        Optional.ofNullable(config.getCookieHandler()).ifPresent(builder::cookieHandler);
        Optional.ofNullable(config.getProxySelector()).ifPresent(builder::proxy);
        Optional.ofNullable(config.getSslContext()).ifPresent(builder::sslContext);
        return builder.build();
    }

    /** Assembles the request (URI, timeout, headers, version, method and body) from the config. */
    private HttpRequest buildRequest() {
        HttpRequest.Builder builder = HttpRequest.newBuilder()
                .uri(config.getRequestURI())
                .timeout(config.getConnectTimeout())
                .version(config.getVersion());
        // Added one by one: the varargs headers(...) overload rejects an empty array.
        Map<String, String> headers = config.getHeaderMap();
        if (headers != null) {
            for (Map.Entry<String, String> header : headers.entrySet()) {
                builder.header(header.getKey(), header.getValue());
            }
        }
        builder.method(config.getRequestMethod(), buildPublisher());
        return builder.build();
    }

    /**
     * Sends the request and reads the response body as text decoded with the configured
     * response charset.
     *
     * <p>The returned result's options map holds the {@link Instant} taken just before sending
     * under {@code "startime"} and the one taken just after the response under {@code "endtime"}.
     *
     * @return the status code, body, client, request, response and timing options
     * @throws IOException if sending or receiving fails
     * @throws InterruptedException if the calling thread is interrupted while waiting
     */
    public HttpResult<String> executeToString() throws IOException, InterruptedException {
        // Read the shared field once so the result reports the client that actually sent.
        HttpClient sender = client;
        HttpRequest request = buildRequest();
        HashMap<String, Object> options = new HashMap<>();
        options.put("startime", Instant.now());
        HttpResponse<String> response =
                sender.send(request, HttpResponse.BodyHandlers.ofString(config.getResCode()));
        options.put("endtime", Instant.now());
        return new HttpResult<>(response.statusCode(), response.body(), sender, request, response, options);
    }

    /**
     * Sends the request and stores the response body in the file at {@code pathStr}, creating
     * missing parent directories first.
     *
     * @param pathStr destination file path
     * @return the status code, destination file, client, request and response
     * @throws IOException if sending, receiving or writing fails
     * @throws InterruptedException if the calling thread is interrupted while waiting
     */
    public HttpResult<File> executeToFile(String pathStr) throws IOException, InterruptedException {
        File file = new File(pathStr);
        if (!file.exists()) {
            // A bare file name has no parent; resolve against the working directory first.
            File parent = file.getAbsoluteFile().getParentFile();
            if (parent != null) {
                parent.mkdirs();
            }
        }
        HttpClient sender = client;
        HttpRequest request = buildRequest();
        // Send the same request object that is reported in the result.
        HttpResponse<Path> response =
                sender.send(request, HttpResponse.BodyHandlers.ofFile(file.toPath()));
        return new HttpResult<>(response.statusCode(), file, sender, request, response);
    }

    /**
     * Chooses the request body: the serialized parameters for non-GET requests that have any,
     * otherwise no body. GET requests carry no body because {@link HttpConfig#build()} already
     * appends their parameters to the URI as a query string.
     */
    private HttpRequest.BodyPublisher buildPublisher() {
        if ("GET".equals(config.getRequestMethod())) {
            return HttpRequest.BodyPublishers.noBody();
        }
        String body = config.paramsToString();
        if (body != null) {
            return HttpRequest.BodyPublishers.ofString(body, config.getReqCode());
        }
        return HttpRequest.BodyPublishers.noBody();
    }

    /**
     * Entry point of the fluent API.
     *
     * @return a fresh {@link HttpConfig} carrying its defaults
     */
    public static HttpConfig builder() {
        return new HttpConfig();
    }
}
发现了吗?HttpExplore和HttpConfig紧密结合,谁也离不开谁,这样我们在使用工具时就自然而然地先去做配置,这就是构造者模式的好处。
另外,JDK的HttpClient的优势在于,无论是GET、POST、PUT还是DELETE,请求方法其实就是一个参数,传参的方式也高度一致,是不是很方便?
此外,特别要注意的是字符编码问题,请求编码和响应编码一定要记得设置,否则乱码问题很头疼。
响应封装
请求工具封装完成,我们一般就可以获取远程站点内容,但是如何解析内容也是一个问题(发现没有,我们至今除了使用JDK外,没有使用过其他任何第三方工具包),因此为了方便,我们这里使用Jsoup来解析响应的HTML内容。Jsoup可以将HTML内容转换成节点,我们直接可以获取节点内容,方便快捷、干净又卫生。
HttpResult.java
package com.vtarj.pythagoras.explore;

import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.util.HashMap;

/**
 * Immutable holder for the outcome of one HTTP exchange: the status code, the extracted payload,
 * the client/request/response objects involved, and an optional map of extra values.
 *
 * @param <T> type of the extracted payload returned by {@link #getData()}
 */
public class HttpResult<T> {

    private final int code;
    private final T data;
    private final HttpClient client;
    private final HttpRequest request;
    private final HttpResponse<?> response;
    private final HashMap<String, Object> options;

    /**
     * Creates a result with extra options.
     *
     * @param code HTTP status code
     * @param data extracted payload
     * @param client client that performed the exchange
     * @param request request that was sent
     * @param response raw response
     * @param options additional caller-defined values; may be {@code null}
     */
    public HttpResult(int code, T data, HttpClient client, HttpRequest request,
                      HttpResponse<?> response, HashMap<String, Object> options) {
        this.code = code;
        this.data = data;
        this.client = client;
        this.request = request;
        this.response = response;
        this.options = options;
    }

    /**
     * Creates a result without extra options; {@link #getOptions()} then returns {@code null}.
     *
     * @param code HTTP status code
     * @param data extracted payload
     * @param client client that performed the exchange
     * @param request request that was sent
     * @param response raw response
     */
    public HttpResult(int code, T data, HttpClient client, HttpRequest request,
                      HttpResponse<?> response) {
        this(code, data, client, request, response, null);
    }

    public int getCode() {
        return code;
    }

    public T getData() {
        return data;
    }

    public HttpClient getClient() {
        return client;
    }

    public HttpRequest getRequest() {
        return request;
    }

    public HttpResponse<?> getResponse() {
        return response;
    }

    /**
     * Returns the extra values supplied at construction.
     *
     * @return the options map, or {@code null} when none was supplied
     */
    public HashMap<String, Object> getOptions() {
        return options;
    }

    @Override
    public String toString() {
        return "HttpResult{"
                + "code=" + code
                + ", data=" + data
                + ", request=" + request
                + ", response=" + response
                + '}';
    }
}
响应结果中,我们尽可能保留原始内容(client、request、response),以防封装时考虑不周导致后面相关信息遗失;此外建议增加一个可自定义的options字段,用于存放自定义内容。
未完待续~~~
上一篇:实战:纯手工打造Java爬虫——基于JDK11原生HttpClient(二)
下一篇:实战:纯手工打造Java爬虫——基于JDK11原生HttpClient(四)



