如何在Java中读取或解析MHTML（.mht）文件

坦白说，我原本没指望能很快找到解决方案，甚至已经打算放弃了，但后来偶然发现了下面这些页面：

http://en.wikipedia.org/wiki/MIME#Multipart_messages

http://msdn.microsoft.com/zh-cn/library/ms527355%28EXCHG.10%29.aspx

虽然这些页面乍一看并不起眼，但仔细阅读就能从中找到线索。读完这些内容后，我启动了 IE，随意把一些页面另存为 *.mht 文件。下面让我逐行讲解……

不过先说明一下，我的最终目标是分离/提取 html 内容并对其进行解析……这个解决方案本身并不完整，因为它依赖于保存文件时所选择的字符集（character set）或编码（encoding）。不过即便如此，它也能比较省事地提取出各个单独的文件……

希望这对任何想要解析/解压 *.mht/MHTML 文件的人都有所帮助 :)

=======说明======== 来自mht文件

From: "Saved by Windows Internet Explorer 7"

它是用于保存文件的软件

Subject: Google
Date: Tue, 13 Jul 2010 21:23:03 +0530
MIME-Version: 1.0

主题、日期和 MIME 版本……与邮件格式类似

  Content-Type: multipart/related;type="text/html";

这部分告诉我们这是一个 multipart 文档。multipart 文档会把一个或多个不同的数据集组合在同一个正文中，因此 multipart 的 Content-Type 字段必须出现在实体的头部（header）中。在这里，我们还可以看到类型为 "text/html"。

boundary="----=_NextPart_000_0007_01CB22D1.93BBD1A0"

这是最重要的部分。它是唯一的分隔符，用来分隔各个不同的部分（html、图像、css、脚本等）。一旦掌握了这一点，一切就都变得容易了……现在，我只需要遍历文档，找出各个部分，并根据各自的 Content-Transfer-Encoding（base64、quoted-printable 等）将它们保存下来……

示例

------=_NextPart_000_0007_01CB22D1.93BBD1A0
Content-Type: text/html; charset="utf-8"
Content-Transfer-Encoding: quoted-printable
Content-Location: http://www.google.com/webhp?sourceid=navclient&ie=UTF-8

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" =...

JAVA代码

用于定义常量的接口。

/**
 * String constants shared by the MHT parsing code: MIME header names, header
 * parameter names, and quoted-printable escaped byte-order marks.
 *
 * <p>Fields declared in an interface are implicitly {@code public static final}.
 */
public interface IConstants {

    /** Name of the {@code Content-Type} parameter that carries the multipart delimiter. */
    String BOUNDARY = "boundary";

    /** Name of the {@code Content-Type} parameter that carries the character set. */
    String CHAR_SET = "charset";

    /** MIME header naming the media type of a part. */
    String CONTENT_TYPE = "Content-Type";

    /** MIME header naming the transfer encoding of a part. */
    String CONTENT_TRANSFER_ENCODING = "Content-Transfer-Encoding";

    /** MIME header naming the location the part was saved from. */
    String CONTENT_LOCATION = "Content-Location";

    /** The bytes EF BB BF (UTF-8 byte-order mark) written as quoted-printable escapes. */
    String UTF8_BOM = "=EF=BB=BF";

    /** The bytes FF FE (a UTF-16 byte-order mark) written as quoted-printable escapes. */
    String UTF16_BOM1 = "=FF=FE";

    /** The bytes FE FF (a UTF-16 byte-order mark) written as quoted-printable escapes. */
    String UTF16_BOM2 = "=FE=FF";
}

主解析器类…

package com.test.mht.core;import java.io.BufferedOutputStream;import java.io.BufferedReader;import java.io.BufferedWriter;import java.io.File;import java.io.FileOutputStream;import java.io.FileReader;import java.io.OutputStreamWriter;import java.util.regex.Matcher;import java.util.regex.Pattern;import sun.misc.base64Deprer;public class MHTParser implements IConstants{    private File mhtFile;    private File outputFolder;    public MHTParser(File mhtFile, File outputFolder) {        this.mhtFile = mhtFile;        this.outputFolder = outputFolder;    }        public void decompress() throws Exception    {        BufferedReader reader = null;        String type = "";        String encoding = "";        String location = "";        String filename = "";        String charset = "utf-8";        StringBuilder buffer = null;        try        { reader = new BufferedReader(new FileReader(mhtFile)); final String boundary = getBoundary(reader); if(boundary == null)     throw new Exception("Failed to find document 'boundary'... 
Aborting"); String line = null; int i = 1; while((line = reader.readLine()) != null) {     String temp = line.trim();     if(temp.contains(boundary))      {         if(buffer != null) {  writeBufferContentToFile(buffer,encoding,filename,charset);  buffer = null;         }         buffer = new StringBuilder();     }else if(temp.startsWith(CONTENT_TYPE)) {         type = getType(temp);     }else if(temp.startsWith(CHAR_SET)) {         charset = getCharSet(temp);     }else if(temp.startsWith(CONTENT_TRANSFER_ENCODING)) {         encoding = getEncoding(temp);     }else if(temp.startsWith(CONTENT_LOCATION)) {         location = temp.substring(temp.indexOf(":")+1).trim();         i++;         filename = getFileName(location,type);     }else {         if(buffer != null) {  buffer.append(line + "n");         }     } }        }finally         { if(null != reader)     reader.close();        }    }    private String getCharSet(String temp)     {        String t = temp.split("=")[1].trim();        return t.substring(1, t.length()-1);    }        private void writeBufferContentToFile(StringBuilder buffer,String encoding, String filename, String charset)     throws Exception    {        if(!outputFolder.exists()) outputFolder.mkdirs();        byte[] content = null;        boolean text = true;        if(encoding.equalsIgnoreCase("base64")){ content = getbase64EnpredString(buffer); text = false;        }else if(encoding.equalsIgnoreCase("quoted-printable")) { content = getQuotedPrintableString(buffer);      }        else content = buffer.toString().getBytes();        if(!text)        { BufferedOutputStream bos = null; try {     bos = new BufferedOutputStream(new FileOutputStream(filename));     bos.write(content);     bos.flush(); }finally {     bos.close(); }        }else         { BufferedWriter bw = null; try {     bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(filename), charset));     bw.write(new String(content));     bw.flush(); }finally {     
bw.close(); }        }    }        private byte[] getQuotedPrintableString(StringBuilder buffer)     {        //Set<String> uniqueHex = new HashSet<String>();        //final Pattern p = Pattern.compile("(=\p{XDigit}{2})*");        String temp = buffer.toString().replaceAll(UTF8_BOM, "").replaceAll("=n", "");        //Matcher m = p.matcher(temp);        //while(m.find()) {        //  uniqueHex.add(m.group());        //}        //System.out.println(uniqueHex);        //for (String hex : uniqueHex) { //temp = temp.replaceAll(hex, getASCIIValue(hex.substring(1)));        //}        return temp.getBytes();    }            private byte[] getbase64EnpredString(StringBuilder buffer) throws Exception {        return new base64Deprer().depreBuffer(buffer.toString());    }        private String getFileName(String location, String type)     {        final Pattern p = Pattern.compile("(\w|_|-)+\.\w+");        String ext = "";        String name = "";        if(type.toLowerCase().endsWith("jpeg")) ext = "jpg";        else ext = type.split("/")[1];        if(location.endsWith("/")) { name = "main";        }else        { name = location.substring(location.lastIndexOf("/") + 1); Matcher m = p.matcher(name); String fname = ""; while(m.find()) {     fname = m.group(); } if(fname.trim().length() == 0)     name = "unknown"; else     return getUniqueName(fname.substring(0,fname.indexOf(".")), fname.substring(fname.indexOf(".") + 1, fname.length()));        }        return getUniqueName(name,ext);    }        private String getUniqueName(String name,String ext)    {        int i = 1;        File file = new File(outputFolder,name + "." + ext);        if(file.exists())        { while(true) {     file = new File(outputFolder, name + i + "." 
+ ext);     if(!file.exists())         return file.getAbsolutePath();     i++; }        }        return file.getAbsolutePath();    }    private String getType(String line) {        return splitUsingcolonSpace(line);    }    private String getEncoding(String line){        return splitUsingcolonSpace(line);    }    private String splitUsingcolonSpace(String line) {        return line.split(":\s*")[1].replaceAll(";", "");    }        private String getBoundary(BufferedReader reader) throws Exception     {        String line = null;        while((line = reader.readLine()) != null)        { line = line.trim(); if(line.startsWith(BOUNDARY)) {     return line.substring(line.indexOf(""") + 1, line.lastIndexOf(""")); }        }        return null;    }}

问候，

如何在Java中读取或解析MHTML（.mht）文件

面试问答相关栏目本月热门文章