替换100个PDF模板中的部分标签后,将这100个PDF模板文档与10张约400KB的图片合并为一个PDF文档,
整个过程耗时约20秒。
1.导入pdfbox
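<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.1</version>
</dependency>
<dependency>
    <groupId>log4j</groupId>
    <artifactId>log4j</artifactId>
    <version>1.2.17</version>
</dependency>
<dependency>
    <groupId>maven-repository.junit</groupId>
    <artifactId>junit</artifactId>
    <version>4.13.2</version>
</dependency>
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>fr.opensagres.poi.xwpf.converter.pdf</artifactId>
    <version>2.0.2</version>
</dependency>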
2.上代码
package main.java;
import fr.opensagres.poi.xwpf.converter.pdf.PdfConverter;
import fr.opensagres.poi.xwpf.converter.pdf.PdfOptions;
import org.apache.log4j.Logger;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.multipdf.PDFMergerUtility;
import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdfwriter.ContentStreamWriter;
import org.apache.pdfbox.pdmodel.PDdocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.poi.xwpf.usermodel.XWPFdocument;
import org.junit.Test;
import javax.imageio.ImageIO;
import javax.imageio.ImageReader;
import javax.imageio.stream.ImageInputStream;
import java.awt.image.BufferedImage;
import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
public class PdfboxSummary {
// Logger for this class. The category is the enclosing class so that log output is attributed
// to PdfboxSummary rather than to an unrelated class.
private static final Logger log = Logger.getLogger(PdfboxSummary.class);
/**
 * Test entry point that prepares a tag-to-value replacement map, a {@code PDFMergerUtility},
 * an empty destination document and the list of files found under {@code D:\merge}, then
 * starts iterating over those files.
 *
 * NOTE(review): this span is damaged. The text between the start of the file loop and the
 * {@code keyList} declaration is missing, so the remainder of this method and the signature of
 * the text-replacement helper that follows are not visible here. Only what survives is
 * documented; nothing below should be read as a complete method.
 */
@Test
public void pdfMergeONE() throws Exception {
// Path of the single merged output file.
// NOTE(review): the lone backslash before 'm' is not a valid Java escape; presumably the
// original literal had a doubled backslash that was lost in extraction - confirm.
String outputFile="D:\merged.pdf";
long start = System.currentTimeMillis();
System.out.println("===start==="+start);
// Replacement data: the key is the tag to look for, the value is the text that replaces it.
// NOTE(review): every key below is the identical literal "<>", so each put overwrites the
// previous one. A later comment describes tags split as "<" "<" "A1" ">" ">", so the tag
// names were presumably stripped from these literals during extraction - confirm.
HashMap replaceMap = new HashMap();
replaceMap.put("<>","D1D1D1");
replaceMap.put("<>","F7F7F7");
replaceMap.put("<>","AnnualAnnualAnnual");
replaceMap.put("<>","E6E6E6E6E6");
replaceMap.put("<>","MonthMonthMonth");
replaceMap.put("<>","EffDateEffDateEffDate");
replaceMap.put("<>","R22R22R22R22");
PDFMergerUtility pdfMergerUtility = new PDFMergerUtility();
//pdfMergerUtility.setDestinationFileName(FILEPATH + "test\merged.pdf");
// NOTE(review): PDFBox names this class PDDocument; the spelling used throughout this file
// looks like an extraction artifact - confirm before compiling.
PDdocument destination = new PDdocument();
// Collect the paths of the files in the input directory that are to be processed.
List fileNameList = getFile("D:\merge");
// NOTE(review): the next line is truncated. The rest of the for-header, the rest of
// pdfMergeONE and the declaration of the replacement helper are missing. The surviving helper
// body reads `document`, `page`, `tokens`, `searchString` and `replacement`, and returns
// `document`; where those come from is not visible here.
for (int i=0;i keyList = new ArrayList();
// Text accumulated across TJ arrays while a tag that was split into pieces is being assembled.
String pstring = "";
// Becomes true once the accumulated text contains "<<", i.e. a tag has been opened.
boolean isStart = false;
for (int j = 0; j < tokens.size(); j++) {
Object next = tokens.get(j);
if (next instanceof Operator) {
Operator op = (Operator) next;
// Tj and TJ are the two PDF operators that show a string.
if (op.getName().equals("Tj")) {
// Tj takes one operator and that is the string to display so lets update that operator
// Tj takes a single string operand (the token just before it), so it is updated directly.
COSString previous = (COSString) tokens.get(j - 1);
String string = previous.getString();
string = string.replace(searchString, replacement);
// NOTE(review): getBytes() without a charset uses the platform default encoding.
previous.setValue(string.getBytes());
} else if (op.getName().equals("TJ")) {
// TJ takes an array operand; only the COSString elements of that array are inspected.
COSArray previous = (COSArray) tokens.get(j - 1);
for (int k = 0; k < previous.size(); k++) {
Object arrElement = previous.getObject(k);
if (arrElement instanceof COSString) {
COSString cosString = (COSString) arrElement;
String string = cosString.getString();
// A <> tag may come out of the parser either as the single string "<>" or split into
// the pieces "<" "<" "A1" ">" ">", so the pieces are stitched together below.
//System.out.println(string);
// Start (or keep) accumulating as soon as a "<" has been seen in either the text gathered
// so far or the current piece. The "<<" checks are implied by the "<" checks.
if (pstring.contains("<<") || string.contains("<<")
|| pstring.contains("<") || string.contains("<")) {
pstring += string;
}
}
}
if (pstring.contains("<<")) {
isStart = true;
//System.out.println(pstring);
}
//if (searchString.equals(pstring.trim())) {
// A complete tag has been assembled and, after trimming, it equals the tag being searched for.
if (pstring.contains("<<") && pstring.contains(">>")
&&searchString.equals(pstring.trim())) {
System.out.println(pstring);
keyList.add(previous);
// Rewrite every array that contributed a piece of the tag.
for (int i = 0; i < keyList.size(); i++) {
COSArray item = keyList.get(i);
if (i == 0) {
// The first array receives the replacement text in its first element (the cast assumes
// that element is a COSString); all of its remaining elements are removed.
COSString cosString2 = (COSString) item.getObject(0);
cosString2.setValue(replacement.getBytes());
int total = item.size() - 1;
for (int k = total; k > 0; k--) {
item.remove(k);
}
} else {
// Every later array is emptied.
while (item.size() > 0) {
item.remove(0);
}
}
}
keyList.clear();
pstring = "";
isStart = false;
} else {
// No match yet: remember this array if a tag has been opened, so it can be rewritten later.
if (isStart) {
keyList.add(previous);
}
}
}
// A closing ">>" that did not produce a match ends the current tag: reset the accumulator.
if (pstring.contains(">>")) {
pstring = "";
isStart = false;
keyList.clear();
}
}
}
// Write the (possibly modified) tokens into a new Flate-compressed stream and make it the
// page's content.
PDStream updatedStream = new PDStream(document);
OutputStream out = updatedStream.createOutputStream(COSName.FLATE_DECODE);
ContentStreamWriter tokenWriter = new ContentStreamWriter(out);
tokenWriter.writeTokens(tokens);
out.close();
page.setContents(updatedStream);
}
return document;
}
/**
 * Lists the regular files that sit directly inside a directory.
 *
 * <p>Sub-directories are skipped and not descended into. The order of the result is whatever
 * order {@link File#listFiles()} reports, which is not guaranteed to be sorted.
 *
 * @param path directory to scan
 * @return the paths of the regular files found; an empty list when {@code path} does not
 *         exist, is not a directory, or cannot be read (never {@code null})
 */
private static List<String> getFile(String path) {
    List<String> fileNameList = new ArrayList<>(100);
    // listFiles() returns null (rather than an empty array) when the path is not a readable
    // directory; treat that as "nothing to process" instead of failing with an NPE.
    File[] entries = new File(path).listFiles();
    if (entries == null) {
        return fileNameList;
    }
    for (File entry : entries) {
        if (entry.isFile()) {
            fileNameList.add(entry.getPath());
        }
    }
    return fileNameList;
}
/**
 * Appends every image contained in a JPEG file to the given PDF document, one page per image.
 *
 * <p>The document is modified in place and is not saved here; saving is left to the caller.
 * The elapsed time is logged whether or not the conversion succeeds.
 *
 * @param pdfdocument document that receives the new pages
 * @param filePath    path of the JPEG file to read
 * @return the same {@code pdfdocument} instance, for chaining
 * @throws Exception if no JPEG reader is registered with ImageIO, or if the file cannot be
 *                   read or embedded (the underlying {@link IOException} is kept as the cause)
 */
private static PDdocument insertToPdfByStream(PDdocument pdfdocument,String filePath) throws Exception {
    Iterator<ImageReader> readers = ImageIO.getImageReadersByFormatName("jpeg");
    if (!readers.hasNext()) {
        throw new Exception("The JDK does not support");
    }
    ImageReader imageReader = readers.next();
    long timeMillis = System.currentTimeMillis();
    // Both streams are closed by try-with-resources even when reading or embedding fails.
    try (InputStream fileStream = new FileInputStream(filePath);
         ImageInputStream imageInputStream = ImageIO.createImageInputStream(fileStream)) {
        // createImageInputStream returns null when no stream provider is available.
        if (imageInputStream == null) {
            throw new IOException("Cannot create an image input stream for " + filePath);
        }
        imageReader.setInput(imageInputStream);
        int size = imageReader.getNumImages(true);
        for (int i = 0; i < size; i++) {
            BufferedImage image = imageReader.read(i);
            pageAddImage(pdfdocument, image);
        }
        return pdfdocument;
    } catch (IOException e) {
        log.error("To PDF Page Error", e);
        // Keep the original failure attached so callers can see why the conversion failed.
        throw new Exception("Conversion PDF Error", e);
    } finally {
        // Release the reader's native/cached resources.
        imageReader.dispose();
        log.info("to pdf used time: "+(System.currentTimeMillis() - timeMillis));
    }
}
/**
 * Adds a new page (PDFBox's default page size) to the document and draws the image on it,
 * horizontally centred and aligned to the top edge.
 *
 * <p>The image is scaled down, keeping its aspect ratio, so that it fits inside the page in
 * both dimensions; it is never scaled up.
 *
 * @param newPdf document that receives the page
 * @param image  image to draw
 * @throws IOException if the image cannot be embedded or the page content cannot be written
 */
private static void pageAddImage(PDdocument newPdf, BufferedImage image) throws IOException {
    PDPage page = new PDPage();
    newPdf.addPage(page);
    float pageWidth = page.getMediaBox().getWidth();
    float pageHeight = page.getMediaBox().getHeight();
    // Fit to whichever dimension is tighter. Fitting to the width alone lets a tall image
    // run past the bottom of the page, where it would be clipped.
    float fitWidth = pageWidth / image.getWidth();
    float fitHeight = pageHeight / image.getHeight();
    float scale = Math.min(1f, Math.min(fitWidth, fitHeight));
    float imgWidth = image.getWidth() * scale;
    float imgHeight = image.getHeight() * scale;
    try (PDPageContentStream pageContentStream = new PDPageContentStream(newPdf, page)) {
        PDImageXObject pdImage = LosslessFactory.createFromImage(newPdf, image);
        // PDF coordinates start at the bottom-left corner, so the top-aligned image's lower
        // edge sits at pageHeight - imgHeight.
        pageContentStream.drawImage(pdImage, (pageWidth - imgWidth) / 2, pageHeight - imgHeight, imgWidth, imgHeight);
    }
}
/**
 * Converts a Word (.docx) file to PDF and prints the path of the generated file.
 *
 * <p>The input stream, the loaded document and the output stream are all closed when the
 * method returns, including when loading or conversion fails.
 *
 * @param docFilePath path of the Word document to read
 * @param pdfFilePath path of the PDF file to write
 * @throws Exception if the document cannot be read, converted or written
 */
private static void wordToPdf(String docFilePath,String pdfFilePath) throws Exception {
    try (InputStream docFile = new FileInputStream(docFilePath);
         XWPFdocument doc = new XWPFdocument(docFile);
         OutputStream out = new FileOutputStream(pdfFilePath)) {
        PdfOptions pdfOptions = PdfOptions.create();
        PdfConverter.getInstance().convert(doc, out, pdfOptions);
    }
    System.out.println(pdfFilePath);
}
}
参考博文:
用 Java 中的 PDFbox 替换或删除 PDF 中的文本 - IT屋-程序员软件开发技术分享社区
https://www.cnblogs.com/tankqiu/articles/4246776.html
教程 - PDFBox 中文文档 - 文江博客
Word转为PDF(Java实现)_chengp919的博客-CSDN博客_java word转pdf



