PDFBox 会把拆分操作产生的各个部分作为 PDDocument 类型的对象保存在堆中,这会导致堆很快被填满;即使在循环的每一轮之后都调用 close(),GC 回收堆空间的速度仍然赶不上堆被填满的速度。
一种做法是把文档的拆分操作分成多个批次来执行,每个批次只处理相对容易管理的一小段页面(10 至 40 页):
public void execute() { File inputFile = new File(path/to/the/file.pdf); PDdocument document = null; try { document = PDdocument.load(inputFile); int start = 1; int end = 1; int batchSize = 50; int finalBatchSize = document.getNumberOfPages() % batchSize; int noOfBatches = document.getNumberOfPages() / batchSize; for (int i = 1; i <= noOfBatches; i++) { start = end; end = start + batchSize; System.out.println("Batch: " + i + " start: " + start + " end: " + end); split(document, start, end); } // handling the remaining start = end; end += finalBatchSize; System.out.println("Final Batch start: " + start + " end: " + end); split(document, start, end); } catch (IOException e) { e.printStackTrace(); } finally { //close the document }}private void split(PDdocument document, int start, int end) throws IOException { List<File> fileList = new ArrayList<File>(); Splitter splitter = new Splitter(); splitter.setStartPage(start); splitter.setEndPage(end); List<PDdocument> splitteddocuments = splitter.split(document); String outputPath = Config.INSTANCE.getProperty("outputPath"); PDFTextStripper stripper = new PDFTextStripper(); for (int index = 0; index < splitteddocuments.size(); index++) { String pdfFullPath = document.getdocumentInformation().getTitle() + index + start+ ".pdf"; PDdocument splitteddocument = splitteddocuments.get(index); splitteddocument.save(pdfFullPath); }}


