栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 软件开发 > 后端开发 > Java

Java实现Word/Pdf/TXT转html的实例代码

Java 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

Java实现Word/Pdf/TXT转html的实例代码

引言:

最近公司在做一个教育培训学习及在线考试的项目。本人主要负责网络课程模块,主要实现课程分类、课程、课件的创建,以及在线学习和统计的功能。课件涉及多种类型,如视频、音频、图文、外部链接及文档类型,其中就涉及到一个问题:文档型课件如何在网页上展示和学习。因为要在线统计学习的课程、学习的人员和学习的时长,所以不能像传统做法那样将文档下载到本地学习,那样就不受系统控制了。最终的方案是:在上传文档型课件的时候,将文件转换成对应的HTML文件,以便在网页上浏览学习。

下边主要针对word,pdf和txt文本文件进行转换

一:Java实现将word转换为html

1:引入依赖

 
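 <dependency>
  <groupId>fr.opensagres.xdocreport</groupId>
  <artifactId>fr.opensagres.xdocreport.document</artifactId>
  <version>1.0.5</version>
 </dependency>
 <dependency>
  <groupId>fr.opensagres.xdocreport</groupId>
  <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
  <version>1.0.5</version>
 </dependency>
 <dependency>
  <groupId>org.apache.poi</groupId>
  <artifactId>poi</artifactId>
  <version>3.12</version>
 </dependency>
 <dependency>
  <groupId>org.apache.poi</groupId>
  <artifactId>poi-scratchpad</artifactId>
  <version>3.12</version>
 </dependency>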
 

2:代码demo

package com.svse.controller;
  
  import java.io.File;
  import java.io.FileInputStream;
  import java.io.FileNotFoundException;
  import java.io.FileOutputStream;
  import java.io.IOException;
  import java.io.InputStream;
  import java.io.OutputStream;
  import java.text.SimpleDateFormat;
  import java.util.Date;

  import javax.xml.parsers.DocumentBuilderFactory;
  import javax.xml.parsers.ParserConfigurationException;
  import javax.xml.transform.OutputKeys;
  import javax.xml.transform.Transformer;
  import javax.xml.transform.TransformerException;
  import javax.xml.transform.TransformerFactory;
  import javax.xml.transform.dom.DOMSource;
  import javax.xml.transform.stream.StreamResult;

  import org.apache.poi.hwpf.HWPFDocument;
  import org.apache.poi.hwpf.converter.PicturesManager;
  import org.apache.poi.hwpf.converter.WordToHtmlConverter;
  import org.apache.poi.hwpf.usermodel.PictureType;
  import org.apache.poi.xwpf.converter.core.BasicURIResolver;
  import org.apache.poi.xwpf.converter.core.FileImageExtractor;
  import org.apache.poi.xwpf.converter.core.FileURIResolver;
  import org.apache.poi.xwpf.converter.core.IURIResolver;
  import org.apache.poi.xwpf.converter.core.IXWPFConverter;
  import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
  import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
  import org.apache.poi.xwpf.usermodel.XWPFDocument;
  import org.w3c.dom.Document;
 
public class TestWordToHtml {
 
  public static final String STORAGEPATH="C://works//files//";
   public static final String IP="192.168.30.222";
   public static final String PORT="8010";
  public static void main(String[] args) throws IOException, TransformerException, ParserConfigurationException {
   TestWordToHtml wt=new TestWordToHtml();
    //wt.Word2003ToHtml("甲骨文考证.doc");
    wt.Word2007ToHtml("甲骨文考证.docx");

  }
    
   
  public void Word2003ToHtml(String fileName) throws IOException, TransformerException, ParserConfigurationException {
    
     final String imagepath = STORAGEPATH+"fileImage/";//解析时候如果doc文件中有图片 图片会保存在此路径
    final String strRanString=getRandomNum();
    String filepath =STORAGEPATH;
    String htmlName =fileName.substring(0, fileName.indexOf("."))+ "2003.html";
    final String file = filepath + fileName;
    InputStream input = new FileInputStream(new File(file));
    HWPFdocument worddocument = new HWPFdocument(input);
    WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(documentBuilderFactory.newInstance().newdocumentBuilder().newdocument());
    //设置图片存放的位置
    wordToHtmlConverter.setPicturesManager(new PicturesManager() {
public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {
 File imgPath = new File(imagepath);
 if(!imgPath.exists()){//图片目录不存在则创建
   imgPath.mkdirs();
 }
  
 File file = new File(imagepath +strRanString+suggestedName);
 try {
    OutputStream os = new FileOutputStream(file);
    os.write(content);
    os.close();
 } catch (FileNotFoundException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
 }
  
 return "http://"+IP+":"+PORT+"//uploadFile/fileImage/"+strRanString+suggestedName;
// return imagepath +strRanString+suggestedName;
      }
    });
    
     //解析word文档
    wordToHtmlConverter.processdocument(worddocument);
     document htmldocument = wordToHtmlConverter.getdocument();
     
     File htmlFile = new File(filepath +strRanString+htmlName);
    OutputStream outStream = new FileOutputStream(htmlFile);
     

     DOMSource domSource = new DOMSource(htmldocument);
    StreamResult streamResult = new StreamResult(outStream);
 
    TransformerFactory factory = TransformerFactory.newInstance();
     Transformer serializer = factory.newTransformer();
    serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
    serializer.setOutputProperty(OutputKeys.INDENT, "yes");
    serializer.setOutputProperty(OutputKeys.METHOD, "html");
    
    serializer.transform(domSource, streamResult);
     outStream.close();
     
    System.out.println("生成html文件路径:"+ "http://"+IP+":"+PORT+"//uploadFile/"+strRanString+htmlName);
   }
   
   public void Word2007ToHtml(String fileName) throws IOException {
    final String strRanString=getRandomNum();
     String filepath = STORAGEPATH+strRanString;
     String htmlName =fileName.substring(0, fileName.indexOf("."))+ "2007.html";
     File f = new File(STORAGEPATH+fileName); 
     if (!f.exists()) { 
System.out.println("Sorry File does not Exists!"); 
     } else { 
if (f.getName().endsWith(".docx") || f.getName().endsWith(".DOCX")) { 
  try {
    // 1) 加载word文档生成 XWPFdocument对象 
    InputStream in = new FileInputStream(f); 
    XWPFdocument document = new XWPFdocument(in); 
    // 2) 解析 XHTML配置 (这里设置IURIResolver来设置图片存放的目录) 
    File imageFolderFile = new File(filepath); 
    XHTMLOptions options = XHTMLOptions.create().URIResolver(new FileURIResolver(imageFolderFile)); 
    options.setExtractor(new FileImageExtractor(imageFolderFile)); 
    options.URIResolver(new IURIResolver() {
      public String resolve(String uri) {
 //http://192.168.30.222:8010//uploadFile/....
 return "http://"+IP+":"+PORT+"//uploadFile/"+strRanString +"/"+ uri;
      }
    });
    options.setIgnoreStylesIfUnused(false); 
    options.setFragment(true); 
    // 3) 将 XWPFdocument转换成XHTML 
    OutputStream out = new FileOutputStream(new File(filepath + htmlName)); 
    IXWPFConverter converter = XHTMLConverter.getInstance();
    converter.convert(document,out, options);
    //XHTMLConverter.getInstance().convert(document, out, options); 
    System.out.println("html路径:"+"http://"+IP+":"+PORT+"//uploadFile/"+strRanString+htmlName);
  } catch (Exception e) {
    e.printStackTrace();
  }
} else { 
  System.out.println("Enter only MS Office 2007+ files"); 
} 
     } 
   } 
   
   public static String getRandomNum(){
     Date dt = new Date();
     SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss"); 
     String str=sdf.format(dt);
     return str;
   }
  }

二:Java实现将Pdf转换为html

1: 引入依赖

 
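<dependency>
  <groupId>net.sf.cssbox</groupId>
  <artifactId>pdf2dom</artifactId>
  <version>1.7</version>
</dependency>
<dependency>
  <groupId>org.apache.pdfbox</groupId>
  <artifactId>pdfbox</artifactId>
  <version>2.0.12</version>
</dependency>
<dependency>
  <groupId>org.apache.pdfbox</groupId>
  <artifactId>pdfbox-tools</artifactId>
  <version>2.0.12</version>
</dependency>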
 

2:代码Demo

 public class PdfToHtml { 
  
   public void pdfToHtmlTest(String inPdfPath,String outputHtmlPath) {
     // String outputPath = "C:\works\files\ZSQ保密知识测试题库.html";
//try() 写在()里面会自动关闭流
     try{
BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(outputHtmlPath)),"utf-8"));
//加载PDF文档
//PDdocument document = PDdocument.load(bytes);
PDdocument document = PDdocument.load(new File(inPdfPath));
PDFDomTree pdfDomTree = new PDFDomTree();
pdfDomTree.writeText(document,out);
     } catch (Exception e) {
e.printStackTrace();
     }
   }
   public static void main(String[] args) throws IOException {
     PdfToHtml ph=new PdfToHtml();
     String pdfPath="C:\works\files\武研中心行政考勤制度.pdf";
     String outputPath="C:\works\files\武研中心行政考勤制度.html";
     ph.pdfToHtmlTest(pdfPath,outputPath);
  }
 }

三:Java实现将TXT转换为html

 
   public static void txtToHtml(String filePath, String htmlPosition) {
     try {
//String encoding = "GBK";
File file = new File(filePath);
if (file.isFile() && file.exists()) { // 判断文件是否存在
  InputStreamReader read = new InputStreamReader(new FileInputStream(file), "GBK");
  // 考虑到编码格式
  BufferedReader bufferedReader = new BufferedReader(read);
  // 写文件
  FileOutputStream fos = new FileOutputStream(new File(htmlPosition));
  OutputStreamWriter osw = new OutputStreamWriter(fos, "GBK");
  BufferedWriter bw = new BufferedWriter(osw);
  String lineTxt = null;
  while ((lineTxt = bufferedReader.readLine()) != null) {
    bw.write("  "+lineTxt + "
"); } bw.close(); osw.close(); fos.close(); read.close(); } else { System.out.println("找不到指定的文件"); } } catch (Exception e) { System.out.println("读取文件内容出错"); e.printStackTrace(); } }

总结

到此这篇关于Java实现Word/Pdf/TXT转html的实例代码的文章就介绍到这了。更多相关java word pdf txt 转html内容,请搜索考高分网以前的文章或继续浏览下面的相关文章,希望大家以后多多支持考高分网!

转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/134987.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号