坦白说,我原本没指望能很快找到解决方案,几乎已经打算放弃了,但我偶然发现了下面这些页面:
http://en.wikipedia.org/wiki/MIME#Multipart_messages
http://msdn.microsoft.com/zh-cn/library/ms527355%28EXCHG.10%29.aspx
乍一看,这些内容并不怎么吸引人;但如果仔细阅读,就能从中找到线索。读完之后,我打开 IE,随便把几个页面另存为 *.mht 文件。下面让我逐行说明……
不过先说明一下:我的最终目标是分离/提取 html 内容并对其进行解析……这个解决方案本身并不完整,因为它依赖于保存时所选的字符集(character set)或编码(encoding)。但即便如此,它也能比较省事地提取出各个文件……
我希望这对任何想要解析/解压 *.mht/MHTML 文件的人都有用 :)
=======说明======== 来自mht文件
From: "Saved by Windows Internet Explorer 7"
它是用于保存文件的软件
Subject: Google
Date: Tue, 13 Jul 2010 21:23:03 +0530
MIME-Version: 1.0
主题、日期和 MIME 版本……类似于邮件格式
Content-Type: multipart/related;type="text/html";
这部分告诉我们这是一个 multipart 文档。multipart 文档把一个或多个不同的数据集组合在同一个正文中,因此实体的头部必须出现 multipart 类型的 Content-Type 字段。在这里,我们还可以看到类型为 "text/html"。
boundary="----=_NextPart_000_0007_01CB22D1.93BBD1A0"
这是最重要的部分。这是一个唯一的分隔符,用来把各个不同的部分(html、图像、css、脚本等)分隔开。一旦掌握了这一点,一切都变得很容易……现在,我只需要遍历文档,找出各个部分,并根据它们的 Content-Transfer-Encoding(base64、quoted-printable 等)把它们保存下来。
示例
------=_NextPart_000_0007_01CB22D1.93BBD1A0
Content-Type: text/html; charset="utf-8"
Content-Transfer-Encoding: quoted-printable
Content-Location: http://www.google.com/webhp?sourceid=navclient&ie=UTF-8

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" =...
JAVA代码
用于定义常量的接口。
/**
 * String constants used while parsing an MHT (MHTML) file: MIME header
 * names, Content-Type parameter names, and the quoted-printable spellings of
 * common byte-order marks.
 *
 * <p>Fields declared in an interface are implicitly
 * {@code public static final}, so the modifiers are omitted.
 */
public interface IConstants {

    /** Name of the Content-Type parameter that carries the multipart delimiter. */
    String BOUNDARY = "boundary";

    /** Name of the Content-Type parameter that carries the character set. */
    String CHAR_SET = "charset";

    /** MIME header naming the media type of a part. */
    String CONTENT_TYPE = "Content-Type";

    /** MIME header naming the transfer encoding of a part (base64, quoted-printable, ...). */
    String CONTENT_TRANSFER_ENCODING = "Content-Transfer-Encoding";

    /** MIME header giving the original URL of a part. */
    String CONTENT_LOCATION = "Content-Location";

    /** The UTF-8 byte-order mark (bytes EF BB BF) as it appears in quoted-printable text. */
    String UTF8_BOM = "=EF=BB=BF";

    /** The byte pair FF FE (UTF-16 little-endian byte-order mark) in quoted-printable form. */
    String UTF16_BOM1 = "=FF=FE";

    /** The byte pair FE FF (UTF-16 big-endian byte-order mark) in quoted-printable form. */
    String UTF16_BOM2 = "=FE=FF";
}
// The main parser class follows.
package com.test.mht.core;import java.io.BufferedOutputStream;import java.io.BufferedReader;import java.io.BufferedWriter;import java.io.File;import java.io.FileOutputStream;import java.io.FileReader;import java.io.OutputStreamWriter;import java.util.regex.Matcher;import java.util.regex.Pattern;import sun.misc.base64Deprer;public class MHTParser implements IConstants{ private File mhtFile; private File outputFolder; public MHTParser(File mhtFile, File outputFolder) { this.mhtFile = mhtFile; this.outputFolder = outputFolder; } public void decompress() throws Exception { BufferedReader reader = null; String type = ""; String encoding = ""; String location = ""; String filename = ""; String charset = "utf-8"; StringBuilder buffer = null; try { reader = new BufferedReader(new FileReader(mhtFile)); final String boundary = getBoundary(reader); if(boundary == null) throw new Exception("Failed to find document 'boundary'... Aborting"); String line = null; int i = 1; while((line = reader.readLine()) != null) { String temp = line.trim(); if(temp.contains(boundary)) { if(buffer != null) { writeBufferContentToFile(buffer,encoding,filename,charset); buffer = null; } buffer = new StringBuilder(); }else if(temp.startsWith(CONTENT_TYPE)) { type = getType(temp); }else if(temp.startsWith(CHAR_SET)) { charset = getCharSet(temp); }else if(temp.startsWith(CONTENT_TRANSFER_ENCODING)) { encoding = getEncoding(temp); }else if(temp.startsWith(CONTENT_LOCATION)) { location = temp.substring(temp.indexOf(":")+1).trim(); i++; filename = getFileName(location,type); }else { if(buffer != null) { buffer.append(line + "n"); } } } }finally { if(null != reader) reader.close(); } } private String getCharSet(String temp) { String t = temp.split("=")[1].trim(); return t.substring(1, t.length()-1); } private void writeBufferContentToFile(StringBuilder buffer,String encoding, String filename, String charset) throws Exception { if(!outputFolder.exists()) outputFolder.mkdirs(); byte[] content = null; 
boolean text = true; if(encoding.equalsIgnoreCase("base64")){ content = getbase64EnpredString(buffer); text = false; }else if(encoding.equalsIgnoreCase("quoted-printable")) { content = getQuotedPrintableString(buffer); } else content = buffer.toString().getBytes(); if(!text) { BufferedOutputStream bos = null; try { bos = new BufferedOutputStream(new FileOutputStream(filename)); bos.write(content); bos.flush(); }finally { bos.close(); } }else { BufferedWriter bw = null; try { bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(filename), charset)); bw.write(new String(content)); bw.flush(); }finally { bw.close(); } } } private byte[] getQuotedPrintableString(StringBuilder buffer) { //Set<String> uniqueHex = new HashSet<String>(); //final Pattern p = Pattern.compile("(=\p{XDigit}{2})*"); String temp = buffer.toString().replaceAll(UTF8_BOM, "").replaceAll("=n", ""); //Matcher m = p.matcher(temp); //while(m.find()) { // uniqueHex.add(m.group()); //} //System.out.println(uniqueHex); //for (String hex : uniqueHex) { //temp = temp.replaceAll(hex, getASCIIValue(hex.substring(1))); //} return temp.getBytes(); } private byte[] getbase64EnpredString(StringBuilder buffer) throws Exception { return new base64Deprer().depreBuffer(buffer.toString()); } private String getFileName(String location, String type) { final Pattern p = Pattern.compile("(\w|_|-)+\.\w+"); String ext = ""; String name = ""; if(type.toLowerCase().endsWith("jpeg")) ext = "jpg"; else ext = type.split("/")[1]; if(location.endsWith("/")) { name = "main"; }else { name = location.substring(location.lastIndexOf("/") + 1); Matcher m = p.matcher(name); String fname = ""; while(m.find()) { fname = m.group(); } if(fname.trim().length() == 0) name = "unknown"; else return getUniqueName(fname.substring(0,fname.indexOf(".")), fname.substring(fname.indexOf(".") + 1, fname.length())); } return getUniqueName(name,ext); } private String getUniqueName(String name,String ext) { int i = 1; File file = new 
File(outputFolder,name + "." + ext); if(file.exists()) { while(true) { file = new File(outputFolder, name + i + "." + ext); if(!file.exists()) return file.getAbsolutePath(); i++; } } return file.getAbsolutePath(); } private String getType(String line) { return splitUsingcolonSpace(line); } private String getEncoding(String line){ return splitUsingcolonSpace(line); } private String splitUsingcolonSpace(String line) { return line.split(":\s*")[1].replaceAll(";", ""); } private String getBoundary(BufferedReader reader) throws Exception { String line = null; while((line = reader.readLine()) != null) { line = line.trim(); if(line.startsWith(BOUNDARY)) { return line.substring(line.indexOf(""") + 1, line.lastIndexOf(""")); } } return null; }}问候,



