OP 的示例 PDF 中的不可见文本,通常是通过定义剪切路径(位于文本范围之外)和填充路径(将文本遮盖在下方)而变得不可见的。因此,我们必须在文本提取期间考虑与路径相关的指令,以忽略这些不可见文本。
不幸的是,PDFTextStripper 及其父类 LegacyPDFStreamEngine 和 PDFStreamEngine 中都没有声明为这些指令设计的回调。
但是,这些回调在 PDFStreamEngine 的另一个主要子类 PDFGraphicsStreamEngine 中声明,并在 PageDrawer 中实现。
因此,为了利用这一点,我们可以把 PageDrawer 中的实现复制、粘贴并改编到 PDFTextStripper 的一个子类中,例如:
public class PDFVisibleTextStripper extends PDFTextStripper { public PDFVisibleTextStripper() throws IOException { addOperator(new AppendRectangleToPath()); addOperator(new ClipEvenOddRule()); addOperator(new ClipNonZeroRule()); addOperator(new ClosePath()); addOperator(new CurveTo()); addOperator(new CurveToReplicateFinalPoint()); addOperator(new CurveToReplicateInitialPoint()); addOperator(new EndPath()); addOperator(new FillEvenOddAndStrokePath()); addOperator(new FillEvenOddRule()); addOperator(new FillNonZeroAndStrokePath()); addOperator(new FillNonZeroRule()); addOperator(new LineTo()); addOperator(new MoveTo()); addOperator(new StrokePath()); } @Override protected void processTextPosition(TextPosition text) { Matrix textMatrix = text.getTextMatrix(); Vector start = textMatrix.transform(new Vector(0, 0)); Vector end = new Vector(start.getX() + text.getWidth(), start.getY()); PDGraphicsState gs = getGraphicsState(); Area area = gs.getCurrentClippingPath(); if (area == null || (area.contains(start.getX(), start.getY()) && area.contains(end.getX(), end.getY()))) super.processTextPosition(text); } private GeneralPath linePath = new GeneralPath(); void deleteCharsInPath() { for (List<TextPosition> list : charactersByArticle) { List<TextPosition> toRemove = new ArrayList<>(); for (TextPosition text : list) { Matrix textMatrix = text.getTextMatrix(); Vector start = textMatrix.transform(new Vector(0, 0)); Vector end = new Vector(start.getX() + text.getWidth(), start.getY()); if (linePath.contains(start.getX(), start.getY()) || linePath.contains(end.getX(), end.getY())) { toRemove.add(text); } } if (toRemove.size() != 0) { System.out.println(toRemove.size()); list.removeAll(toRemove); } } } public final class AppendRectangleToPath extends OperatorProcessor { @Override public void process(Operator operator, List<COSbase> operands) throws IOException { if (operands.size() < 4) { throw new MissingOperandException(operator, operands); } if (!checkArrayTypesClass(operands, 
COSNumber.class)) { return; } COSNumber x = (COSNumber) operands.get(0); COSNumber y = (COSNumber) operands.get(1); COSNumber w = (COSNumber) operands.get(2); COSNumber h = (COSNumber) operands.get(3); float x1 = x.floatValue(); float y1 = y.floatValue(); // create a pair of coordinates for the transformation float x2 = w.floatValue() + x1; float y2 = h.floatValue() + y1; Point2D p0 = context.transformedPoint(x1, y1); Point2D p1 = context.transformedPoint(x2, y1); Point2D p2 = context.transformedPoint(x2, y2); Point2D p3 = context.transformedPoint(x1, y2); // to ensure that the path is created in the right direction, we have to create // it by combining single lines instead of creating a simple rectangle linePath.moveTo((float) p0.getX(), (float) p0.getY()); linePath.lineTo((float) p1.getX(), (float) p1.getY()); linePath.lineTo((float) p2.getX(), (float) p2.getY()); linePath.lineTo((float) p3.getX(), (float) p3.getY()); // close the subpath instead of adding the last line so that a possible set line // cap style isn't taken into account at the "beginning" of the rectangle linePath.closePath(); } @Override public String getName() { return "re"; } } public final class StrokePath extends OperatorProcessor { @Override public void process(Operator operator, List<COSbase> operands) throws IOException { linePath.reset(); } @Override public String getName() { return "S"; } } public final class FillEvenOddRule extends OperatorProcessor { @Override public void process(Operator operator, List<COSbase> operands) throws IOException { linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD); deleteCharsInPath(); linePath.reset(); } @Override public String getName() { return "f*"; } } public class FillNonZeroRule extends OperatorProcessor { @Override public final void process(Operator operator, List<COSbase> operands) throws IOException { linePath.setWindingRule(GeneralPath.WIND_NON_ZERO); deleteCharsInPath(); linePath.reset(); } @Override public String getName() { return "f"; } } 
public final class FillEvenOddAndStrokePath extends OperatorProcessor { @Override public void process(Operator operator, List<COSbase> operands) throws IOException { linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD); deleteCharsInPath(); linePath.reset(); } @Override public String getName() { return "B*"; } } public class FillNonZeroAndStrokePath extends OperatorProcessor { @Override public void process(Operator operator, List<COSbase> operands) throws IOException { linePath.setWindingRule(GeneralPath.WIND_NON_ZERO); deleteCharsInPath(); linePath.reset(); } @Override public String getName() { return "B"; } } public final class ClipEvenOddRule extends OperatorProcessor { @Override public void process(Operator operator, List<COSbase> operands) throws IOException { linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD); getGraphicsState().intersectClippingPath(linePath); } @Override public String getName() { return "W*"; } } public class ClipNonZeroRule extends OperatorProcessor { @Override public void process(Operator operator, List<COSbase> operands) throws IOException { linePath.setWindingRule(GeneralPath.WIND_NON_ZERO); getGraphicsState().intersectClippingPath(linePath); } @Override public String getName() { return "W"; } } public final class MoveTo extends OperatorProcessor { @Override public void process(Operator operator, List<COSbase> operands) throws IOException { if (operands.size() < 2) { throw new MissingOperandException(operator, operands); } COSbase base0 = operands.get(0); if (!(base0 instanceof COSNumber)) { return; } COSbase base1 = operands.get(1); if (!(base1 instanceof COSNumber)) { return; } COSNumber x = (COSNumber) base0; COSNumber y = (COSNumber) base1; Point2D.Float pos = context.transformedPoint(x.floatValue(), y.floatValue()); linePath.moveTo(pos.x, pos.y); } @Override public String getName() { return "m"; } } public class LineTo extends OperatorProcessor { @Override public void process(Operator operator, List<COSbase> operands) throws 
IOException { if (operands.size() < 2) { throw new MissingOperandException(operator, operands); } COSbase base0 = operands.get(0); if (!(base0 instanceof COSNumber)) { return; } COSbase base1 = operands.get(1); if (!(base1 instanceof COSNumber)) { return; } // append straight line segment from the current point to the point COSNumber x = (COSNumber) base0; COSNumber y = (COSNumber) base1; Point2D.Float pos = context.transformedPoint(x.floatValue(), y.floatValue()); linePath.lineTo(pos.x, pos.y); } @Override public String getName() { return "l"; } } public class CurveTo extends OperatorProcessor { @Override public void process(Operator operator, List<COSbase> operands) throws IOException { if (operands.size() < 6) { throw new MissingOperandException(operator, operands); } if (!checkArrayTypesClass(operands, COSNumber.class)) { return; } COSNumber x1 = (COSNumber) operands.get(0); COSNumber y1 = (COSNumber) operands.get(1); COSNumber x2 = (COSNumber) operands.get(2); COSNumber y2 = (COSNumber) operands.get(3); COSNumber x3 = (COSNumber) operands.get(4); COSNumber y3 = (COSNumber) operands.get(5); Point2D.Float point1 = context.transformedPoint(x1.floatValue(), y1.floatValue()); Point2D.Float point2 = context.transformedPoint(x2.floatValue(), y2.floatValue()); Point2D.Float point3 = context.transformedPoint(x3.floatValue(), y3.floatValue()); linePath.curveTo(point1.x, point1.y, point2.x, point2.y, point3.x, point3.y); } @Override public String getName() { return "c"; } } public final class CurveToReplicateFinalPoint extends OperatorProcessor { @Override public void process(Operator operator, List<COSbase> operands) throws IOException { if (operands.size() < 4) { throw new MissingOperandException(operator, operands); } if (!checkArrayTypesClass(operands, COSNumber.class)) { return; } COSNumber x1 = (COSNumber) operands.get(0); COSNumber y1 = (COSNumber) operands.get(1); COSNumber x3 = (COSNumber) operands.get(2); COSNumber y3 = (COSNumber) operands.get(3); 
Point2D.Float point1 = context.transformedPoint(x1.floatValue(), y1.floatValue()); Point2D.Float point3 = context.transformedPoint(x3.floatValue(), y3.floatValue()); linePath.curveTo(point1.x, point1.y, point3.x, point3.y, point3.x, point3.y); } @Override public String getName() { return "y"; } } public class CurveToReplicateInitialPoint extends OperatorProcessor { @Override public void process(Operator operator, List<COSbase> operands) throws IOException { if (operands.size() < 4) { throw new MissingOperandException(operator, operands); } if (!checkArrayTypesClass(operands, COSNumber.class)) { return; } COSNumber x2 = (COSNumber) operands.get(0); COSNumber y2 = (COSNumber) operands.get(1); COSNumber x3 = (COSNumber) operands.get(2); COSNumber y3 = (COSNumber) operands.get(3); Point2D currentPoint = linePath.getCurrentPoint(); Point2D.Float point2 = context.transformedPoint(x2.floatValue(), y2.floatValue()); Point2D.Float point3 = context.transformedPoint(x3.floatValue(), y3.floatValue()); linePath.curveTo((float) currentPoint.getX(), (float) currentPoint.getY(), point2.x, point2.y, point3.x, point3.y); } @Override public String getName() { return "v"; } } public final class ClosePath extends OperatorProcessor { @Override public void process(Operator operator, List<COSbase> operands) throws IOException { linePath.closePath(); } @Override public String getName() { return "h"; } } public final class EndPath extends OperatorProcessor { @Override public void process(Operator operator, List<COSbase> operands) throws IOException { linePath.reset(); } @Override public String getName() { return "n"; } }}( PDFVisibleTextStripper)
请注意,必须确保在 PDFVisibleTextStripper 的构造函数中使用的是它自己的内部运算符类,而不是 PageDrawer 中的同名类。要确认这一点,只需点击代码下方的链接。
这将输出减少到
REVERSE tEaSER caRd500electionser of Teamst Bet1,000MARK BOX AS SHOWN DENOTES HOME TEAMPRO FOOTBALL - THURSDAY, SEPTEMBER 8, 2016 1 PANTHERS nbc - 10½ 8:30p 2 BRonCOS - 3½ PRO FOOTBALL - SUNDAY, SEPTEMBER 11, 2016 3 FALCONS - 9½ 1:00p 4 BUCCANEERS - 4½ 5 VIKINGS - 9½ 1:00p 6 TITANS - 4½ 7 EAGLES - 10½ 1:00p 8 BROWNS - 3½ 9 BENGALS - 9½ 1:00p 10 JETS - 4½ 11 SAINTS - 7½ 1:00p 12 RAIDERS - 6½ 13 CHIEFS - 14½ 1:00p 14 CHARGERS + ½ 15 RAVENS - 10½ 1:00p 16 BILLS - 3½ 17 TEXANS - 14½ 1:00p 18 BEARS + ½ 19 PACKERS - 12½ 1:00p 20 JAGUARS - 1½ 21 SEAHAWKS - 17½ 4:05p 22 DOLPHINS + 3½ 23 COWBOYS - 7½ 4:25p 24 GIANTS - 6½ 25 COLTS - 10½ 4:25p 26 LIONS - 3½ 27 CARDINALS nbc - 14½ 8:30p 28 PATRIOTS + ½ PRO FOOTBALL - MONDAY, SEPTEMBER 12, 2016 29 STEELERS espn - 10½ 7:10p 30 REDSKINS - 3½ 31 RAMS espn - 9½ 10:20p 32 49ERS - 4½
这会丢弃大多数不需要的数据。
在此问题的上下文中可以清楚地看到,processTextPosition 和 deleteCharsInPath 中计算字符基线的方式,最终隐含地假定文本是水平的且页面没有旋转。不过,如果放宽“可见性”的标准,认为只要字符基线的起点可见,该字符就是可见的,那么就不再需要计算 Vector end 的代码,并且代码对于旋转的页面也能正常工作。
在此问题的上下文中可以清楚地看到,由于浮点计算误差,恰好位于剪切路径边界上的字形原点坐标可能会偏移到剪切路径之外。事实证明,改用“胖点坐标检查”是一种可以接受的解决方法。



