java操作PDF文件,可支持分页、合并、图片转PDF等
使用 Apache PDFBox 在 Java 中为 PDF 文档创建书签
PDFbox-PDF解析(坐标定位,分页读取)
功能实现 实现思路:
- 通过正则表达式匹配标题
- 通过标题和页面添加书签(只能定位到标题所在页)
pdfbox给pdf添加书签
java操作PDF,有一个很好用的工具——pdfbox。只需要引入依赖,即可使用。
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox-app</artifactId>
    <version>2.0.21</version>
</dependency>
利用这个工具,可以实现很多的功能,我这里示例了以下几种:
- 加载PDF文档
- 创建一个单页的PDF空文档
- 获取PDF文档总页数
- 获取pdf文档的所有分页对象
- 给整个PDF文件分页,形成多个pdf单页文件
- 合并多个单页PDF文件,输出一个合并后的PDF文档
- 图片转PDF
- 获取pdf单页分辨率
代码如下:
package com.bridge.pdf.utils;
import com.bridge.enums.UtilsEnums;
import com.bridge.pdf.model.PdfBoxData;
import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.multipdf.PDFMergerUtility;
import org.apache.pdfbox.pdmodel.*;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageFitWidthDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.apache.pdfbox.text.PDFTextStripper;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
@Slf4j
public class PDFBoxUtils {
public static void main(String[] args) throws IOException {
String savePath = "C:\Users\Administrator\Desktop\tmp\pdf\添加书签-" + System.currentTimeMillis() + ".pdf";
File file = new File("C:\Users\Administrator\Desktop\tmp\k8s尚硅谷\03_尚硅谷大数据技术之实时项目-需求一日活.pdf");
PDDocument pdDocument = PDFBoxUtils.load(file);
if (pdDocument == null) {
return;
}
List allBookList = new ArrayList<>();
int numberOfPages = pdDocument.getNumberOfPages();
for (int i = 1; i <= numberOfPages; i++) {
allBookList.addAll(PDFBoxUtils.getPdfBoxTextList(pdDocument, i));
}
addMarkBook(pdDocument,allBookList, savePath);
PDFBoxUtils.close(pdDocument);
}
public static void addMarkBook(PDDocument document, List allBookList, String savePath) throws IOException {
for (int i = 0; i < 10; i++) {
document.addPage(new PDPage());
}
PDDocumentOutline documentOutline = new PDDocumentOutline();
document.getDocumentCatalog().setDocumentOutline(documentOutline);
PDOutlineItem pagesOutline = new PDOutlineItem();
pagesOutline.setTitle("All Pages");
documentOutline.addLast(pagesOutline);
for (PdfBoxData pdfBoxData : allBookList) {
PDPageDestination pageDestination = new PDPageFitWidthDestination();
pageDestination.setPage(document.getPage(pdfBoxData.getPage()-1));
PDOutlineItem bookmark = new PDOutlineItem();
bookmark.setDestination(pageDestination);
bookmark.setTitle(pdfBoxData.getTitle());
pagesOutline.addLast(bookmark);
}
pagesOutline.openNode();
documentOutline.openNode();
document.getDocumentCatalog().setPageMode(PageMode.USE_OUTLINES);
document.save(savePath);
}
public static List getPdfBoxTextList(PDDocument document, int page) throws IOException {
//文本剥离器
PDFTextStripper stripper = new PDFTextStripper();
//按页进行读取,页码从1开始
stripper.setStartPage(page);
stripper.setEndPage(page);
//按位置进行排序
stripper.setSortByPosition(true);
//获取文本
String text = stripper.getText(document);
String[] dataArr = text.split("rn");
List pdfBoxDataList = new ArrayList<>();
for (String data : dataArr) {
if (data.matches(UtilsEnums.CHAPTER_TITLE_REGEX.getCode()) ||
data.matches(UtilsEnums.FIRST_TITLE_REGEX.getCode())) {
pdfBoxDataList.add(new PdfBoxData(data, page));
}
}
return pdfBoxDataList;
}
public static PDDocument load(File file) throws IOException {
if (!file.exists() || file.isDirectory()) {
return null;
}
return PDDocument.load(file);
}
public static PDDocument load(InputStream inputStream) throws IOException {
if (inputStream == null || inputStream.available() == 0) {
return null;
}
return PDDocument.load(inputStream);
}
public static PDDocument getBlankPDF(File outputFile) throws IOException {
//首先创建pdf文档类
PDDocument pdf = null;
pdf = new PDDocument();
//实例化pdf页对象
PDPage blankPage = new PDPage();
//插入文档类
pdf.addPage(blankPage);
//保存
pdf.save(outputFile);
return pdf;
}
public static int pageCount(PDDocument pdf) {
return pdf.getNumberOfPages();
}
public static List getPageList(PDDocument pdf) {
int count = pageCount(pdf);
List pages = new ArrayList<>(64);
PDPageTree pdPages = pdf.getPages();
for (int i = 0; i < count; i++) {
PDPage pdPage = pdPages.get(i);
pages.add(pdPage);
}
return pages;
}
public static Integer pageSpilt(InputStream inputStream, File outputParent) throws IOException {
if (!outputParent.exists() || !outputParent.isDirectory()) {
throw new RuntimeException("输出文件的父目录不存在");
}
PDDocument pdf = load(inputStream);
try {
int numberOfPages = pageCount(pdf);
for (int i = 0; i < numberOfPages; i++) {
PDDocument document = new PDDocument();
document.addPage(pdf.getPage(i));
document.save(new File(outputParent, i + 1 + ".pdf"));
close(document);
}
return numberOfPages;
} finally {
close(pdf);
close(inputStream);
}
}
public static void combine(File inputParent, String outputFile, FileSortor sortor) throws IOException {
if (!inputParent.exists() || !inputParent.isDirectory()) {
throw new RuntimeException("输入文件的父目录不存在");
}
if (new File(outputFile).exists()) {
throw new RuntimeException("输出文件已存在");
}
File[] files = inputParent.listFiles();
if (sortor != null) {
sortor.sort(files);
}
PDFMergerUtility merger = new PDFMergerUtility();
//输出目标路径
merger.setDestinationFileName(outputFile);
for (int i = 0; i < files.length; i++) {
if (files[i].getName().toLowerCase().endsWith(".pdf")) {
merger.addSource(files[i]);
}
}
merger.mergeDocuments(null);
}
public static String getResolution(PDPage page) {
PDRectangle rectangle = page.getArtBox();
double width = Math.ceil(rectangle.getWidth());
double height = Math.ceil(rectangle.getHeight());
return (int) width + "*" + (int) height;
}
public static void convertImgToPDF(String inputFile, String outputFile) throws IOException {
if (!new File(inputFile).exists()) {
throw new RuntimeException("输入文件不存在");
}
if (!outputFile.toLowerCase().endsWith(".pdf")) {
throw new RuntimeException("只能转成pdf文件");
}
PDDocument document = new PDDocument();
InputStream inputStream = new FileInputStream(inputFile);
BufferedImage bimg = ImageIO.read(inputStream);
float width = bimg.getWidth();
float height = bimg.getHeight();
PDPage page = new PDPage(new PDRectangle(width, height));
document.addPage(page);
PDImageXObject img = PDImageXObject.createFromFile(inputFile, document);
PDPageContentStream contentStream = new PDPageContentStream(document, page);
contentStream.drawImage(img, 0, 0, width, height);
contentStream.close();
close(inputStream);
document.save(outputFile);
close(document);
}
public static void close(InputStream inputStream) {
try {
if (inputStream != null) {
inputStream.close();
}
} catch (IOException e) {
log.error(e.getMessage(), e);
}
}
public static void close(PDDocument pdf) {
try {
if (pdf != null) {
pdf.close();
}
} catch (IOException e) {
log.error(e.getMessage(), e);
}
}
public interface FileSortor {
void sort(File[] sources);
}
}



