package org.docear.plugin.pdfutilities.pdf; import java.awt.geom.AffineTransform; import java.awt.geom.Rectangle2D; import de.intarsys.pdf.content.ICSInterpreter; import de.intarsys.pdf.content.text.CSCharacterParser; import de.intarsys.pdf.cos.COSName; import de.intarsys.pdf.font.PDFont; import de.intarsys.pdf.font.PDGlyphs; /** * A still very simple text extraction utility for PDF documents. */ public class TextExtractor extends CSCharacterParser { private StringBuilder content; private double maxDX = 5; private double maxDY = 5; public TextExtractor() { super(); } @SuppressWarnings("unused") private void append(char c) { if (c > 0) { content.append(c); } else { content.append(' '); } } private void append(char[] chars) { content.append(chars); } private void append(String s) { content.append(s); } public String getContent() { return content.toString(); } @Override protected void onCharacterFound(PDGlyphs glyphs, Rectangle2D rect) { char[] chars = glyphs.getChars(); if (chars == null) { chars = new char[] { ' ' }; } double dX = lastStopX - lastStartX; double dY = lastStopY - lastStartY; if (Math.abs(dX) < Math.abs(maxDX)) { if (Math.abs(dY) > Math.abs(maxDY) && content.length() > 0) { append(System.getProperty("line.separator")); } } else { if (content.length() > 0) { if (Math.abs(dY) < Math.abs(maxDY)) { append(" "); } else { append(System.getProperty("line.separator")); } } } append(chars); } @Override public void open(ICSInterpreter pInterpreter) { super.open(pInterpreter); content = new StringBuilder(); } @Override public void textSetFont(COSName name, PDFont font, float size) { super.textSetFont(name, font, size); AffineTransform tx; tx = (AffineTransform) getDeviceTransform().clone(); tx.concatenate(textState.globalTransform); maxDX = textState.fontSize * 0.2 * tx.getScaleX(); maxDY = textState.fontSize * 0.6 * tx.getScaleY(); } @Override public void textSetTransform(float a, float b, float c, float d, float e, float f) { super.textSetTransform(a, b, c, d, e, f); AffineTransform tx; tx = (AffineTransform) getDeviceTransform().clone(); tx.concatenate(textState.globalTransform); maxDX = textState.fontSize * 0.2 * tx.getScaleX(); maxDY = textState.fontSize * 0.6 * tx.getScaleY(); } @Override public void textT3SetGlyphWidth(float x, float y) { super.textT3SetGlyphWidth(x, y); AffineTransform tx; tx = (AffineTransform) getDeviceTransform().clone(); tx.concatenate(textState.globalTransform); maxDX = textState.fontSize * 0.2 * tx.getScaleX(); maxDY = textState.fontSize * 0.6 * tx.getScaleY(); } @Override public void textT3SetGlyphWidthBB(float x, float y, float llx, float lly, float urx, float ury) { super.textT3SetGlyphWidthBB(x, y, llx, lly, urx, ury); AffineTransform tx; tx = (AffineTransform) getDeviceTransform().clone(); tx.concatenate(textState.globalTransform); maxDX = textState.fontSize * 0.2 * tx.getScaleX(); maxDY = textState.fontSize * 0.6 * tx.getScaleY(); } }