实现了一个简单的爬虫
一、功能:爬取壁纸图片(要求大于1M)。
二、待完善:1. 关键字爬取(按壁纸类型关键字爬取);2. 网址筛选(剔除已收集但尚未爬取的无用网址);3. list 没有进行很好的处理,重复叠加与扩容的问题尚未处理。
三、结语:新手上路,请多关照!(手动滑稽)
Maven 相关依赖:
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.13.1</version>
</dependency>
下面是源代码:
import java.io.*;
import java.net.*;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
public class Robot {
//起始爬取网址
private static String startUrl="http://www.netbian.com/desk/18321.htm";
//爬取结果保存位置
private static String saveUrl="D:\pachong\";
//匹配图片的正则
private static String imgRegex=";
//匹配路径的正则
private static String urlRegex=";
//已经爬虫过的网址list
private static List reptiledList=new ArrayList<>();
//收集到的爬虫网址list
private static List reptileList=new ArrayList<>();
public static void main(String[] args) {
System.out.println("开始爬虫!");
robot(startUrl);
}
public static void robot(String url){
if (reptiledList.contains(url)){
return;
}
System.out.println("爬取地址:"+url);
reptiledList.add(url);
List resultList = getResultList(url);
for (String result : resultList) {
//过滤小于1M的文件
if (fileLengthOut1M(result)){
//下载
downloadImg(result,saveUrl);
}
// //不过滤图片大小
// downloadImg(result,saveUrl);
}
List nextUrlList = getNextUrlList(url);
reptileList.addAll(nextUrlList);
for (String nextUrl : reptileList) {
robot(nextUrl);
}
}
public static List getResultList(String url){
List list = new ArrayList<>();
String html=getHtml(url);
Pattern pattern = Pattern.compile(imgRegex);
Matcher matcher = pattern.matcher(html);
while (matcher.find()) {
String group=matcher.group();
if (!group.contains("http")){
group=url+"/"+group;
}
String result = group.replace(", "").replace(""", "");
list.add(result);
}
return list;
}
public static List getNextUrlList(String url){
List list = new ArrayList<>();
String html=getHtml(url);
Pattern pattern = Pattern.compile(urlRegex);
Matcher matcher = pattern.matcher(html);
while (matcher.find()) {
String group=matcher.group();
if (!group.contains("http")){
group=url+"/"+group;
}
String result =group.replace(", "").replace(""", "");
list.add(result);
}
return list;
}
public static void downloadImg(String resultUrl, String localPath){
URL newUrl = null;
HttpURLConnection hconnection = null;
InputStream inputStream = null;
FileOutputStream fileOutputStream = null;
byte[] bs = null;
try {
System.out.println("开始准备下载!");
newUrl = new URL(resultUrl);
hconnection = (HttpURLConnection) newUrl.openConnection(); //打开连接
inputStream = hconnection.getInputStream(); //获取流
bs = getBytesFromInputStream(inputStream); //流转btye[]
String outPutPath = localPath + resultUrl.substring(resultUrl.lastIndexOf("/")+1); //获取图片名称
System.out.println("图片路径:"+outPutPath);
fileOutputStream = new FileOutputStream(new File(outPutPath));
fileOutputStream.write(bs); //写出
System.out.println("下载成功!");
} catch (Exception e) {
System.out.println("下载失败!");
} finally {
System.out.println("===============================================================");
try {
inputStream.close();
fileOutputStream.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
public static byte[] getBytesFromInputStream(InputStream inputStream){
byte[] bs = null;
try {
byte[] buffer = new byte[1024];
int len = 0;
ByteArrayOutputStream arrayOutputStream = new ByteArrayOutputStream(); //
while((len = inputStream.read(buffer)) != -1){
arrayOutputStream.write(buffer, 0 ,len);
}
bs = arrayOutputStream.toByteArray();
} catch (IOException e) {
e.printStackTrace();
}
return bs;
}
public static String getHtml(String url){
String html = "";
try {
html = Jsoup.connect(url).execute().body();
} catch (IOException e) {
}
return html;
}
private static boolean fileLengthOut1M(String downloadUrl){
URL url = null;
HttpURLConnection conn = null;
try {
url = new URL(downloadUrl);
conn = (HttpURLConnection) url.openConnection();
conn.setRequestMethod("HEAD");
conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows 7; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.73 Safari/537.36 YNoteCef/5.8.0.1 (Windows)");
long lo= (long) conn.getContentLength();
int i=(int)lo/1024/1024;
if (i>0){
return true;
}
} catch (IOException e) {
System.out.println("获取文件大小失败!");
return false;
} finally {
conn.disconnect();
}
return false;
}
}



