您尝试过 JTidy 吗？
private String cleanData(String data) throws UnsupportedEncodingException { Tidy tidy = new Tidy(); tidy.setInputEncoding("UTF-8"); tidy.setOutputEncoding("UTF-8"); tidy.setPrintBodyonly(true); // only print the content tidy.setXmlOut(true); // to XML tidy.setSmartIndent(true); ByteArrayInputStream inputStream = new ByteArrayInputStream(data.getBytes("UTF-8")); ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); tidy.parseDOM(inputStream, outputStream); return outputStream.toString("UTF-8");}虽然我认为它会修复某些情况下的HTML代码。



