栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 软件开发 > 后端开发 > Java

Java 正则表达式解析 HTML 示例分享

Java 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

Java 正则表达式解析 HTML 示例分享

复制代码 代码如下:
package work;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;

public class chuanboyi {

 public static void main(String[] args){
  // TODO Auto-generated method stub
  StringBuffer html = new StringBuffer();
  HttpClient httpclient = new HttpClient();
  //创建GET方法实例
  GetMethod getMethod = new GetMethod("https://www.jb51.net");
  //使用系统提供的默认恢复策略
  getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler());
  try{
   //执行GET方法
   int statusCode = httpclient.executeMethod(getMethod);
   if(statusCode != HttpStatus.SC_OK){
    System.out.println("Method is wrong " + getMethod.getStatusLine());
   }
   InputStream responseBody = getMethod.getResponseBodyAsStream();
   BufferedReader reader = new BufferedReader(new InputStreamReader(responseBody,"utf-8"));
   String line = reader.readLine();
   while(line != null){
    html.append(line).append("n");
    line = reader.readLine();
   }
   reader.close();
   //正则表达式
   String regex = ".*";
   String regexa ="(?<=

  • )[\s\S]+?(?=
  • )";
       Pattern pattern = Pattern.compile(regex);
             Matcher m = pattern.matcher(html);
             StringBuffer str = new StringBuffer();
             int i = 0;
             while(m.find()){
              str.append(m.group());
             }
             pattern = Pattern.compile(regexa);
             m = pattern.matcher(str);
             while(m.find()){
              attrs(m.group());
              i++;
             }
             System.out.println("共有"+i+"条数据!");
      }catch (HttpException e) {
       // TODO: handle exception
       System.out.println("Please check your provided http address!");
       e.printStackTrace();
      }catch (IOException e) {
       // TODO: handle exception
       System.out.println("the line is wrong!");
       e.printStackTrace();
      }finally{
       getMethod.releaseConnection();//释放链接
      }
     }
     public static void attrs(String str){

      //获取url的正则表达式
      String regexURL = "[a-z]+-[0-9]+\.html";
      //获取Name的正则表达式
      String regexName = "(?<=title=")[[\w-\s][^x00-xff]]+(?=")";
      //获取图片的正则表达式
      String regexPicture = "images.*\.jpg";

      Pattern patternURL = Pattern.compile(regexURL);
      Pattern patternName = Pattern.compile(regexName);
      Pattern patternPicture = Pattern.compile(regexPicture);
      Matcher mURL = patternURL.matcher(str);
      Matcher mName = patternName.matcher(str);
      Matcher mPicture = patternPicture.matcher(str);
      if(mName.find()){
       System.out.println("名字:"+mName.group());
      }
      if(mURL.find()){
       System.out.println("链接:"+mURL.group());
      }
      if(mPicture.find()){
       System.out.println("图片:"+mPicture.group());
      }
     } 
    }

    转载请注明:文章转载自 www.mshxw.com
    本文地址:https://www.mshxw.com/it/152447.html
    我们一直用心在做
    关于我们 文章归档 网站地图 联系我们

    版权所有 (c)2021-2022 MSHXW.COM

    ICP备案号:晋ICP备2021003244-6号