/* * Copyright 2009 by Kevin Day. * * The contents of this file are subject to the Mozilla Public License Version 1.1 * (the "License"); you may not use this file except in compliance with the License. * You may obtain a copy of the License at http://www.mozilla.org/MPL/ * * Software distributed under the License is distributed on an "AS IS" basis, * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License * for the specific language governing rights and limitations under the License. * * The Original Code is 'iText, a free JAVA-PDF library'. * * The Initial Developer of the Original Code is Bruno Lowagie. Portions created by * the Initial Developer are Copyright (C) 1999-2008 by Bruno Lowagie. * All Rights Reserved. * Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer * are Copyright (C) 2000-2008 by Paulo Soares. All Rights Reserved. * * Contributor(s): all the names of the contributors are added in the source code * where applicable. * * Alternatively, the contents of this file may be used under the terms of the * LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the * provisions of LGPL are applicable instead of those above. If you wish to * allow use of your version of this file only under the terms of the LGPL * License and not to allow others to use your version of this file under * the MPL, indicate your decision by deleting the provisions above and * replace them with the notice and other provisions required by the LGPL. * If you do not delete the provisions above, a recipient may use your version * of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE. * * This library is free software; you can redistribute it and/or modify it * under the terms of the MPL as stated above or under the terms of the GNU * Library General Public License as published by the Free Software Foundation; * either version 2 of the License, or any later version. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more * details. * * If you didn't download this code from the following link, you should check if * you aren't using an obsolete version: * http://www.lowagie.com/iText/ */ package com.lowagie.text.pdf.parser; /** * A simple text extraction renderer. * * This renderer keeps track of the current Y position of each string. If it detects * that the y position has changed, it inserts a line break into the output. If the * PDF renders text in a non-top-to-bottom fashion, this will result in the text not * being a true representation of how it appears in the PDF. * * This renderer also uses a simple strategy based on the font metrics to determine if * a blank space should be inserted into the output. * * @since 2.1.5 */ public class SimpleTextExtractingPdfContentRenderListener implements TextProvidingRenderListener { /** keeps track of the Y position of the last rendered text */ private float lastYPos; /** keeps track of the X position of the end of the last rendered text */ private float lastEndingXPos; private Matrix lastTextLineMatrix; private Vector lastStart; private Vector lastEnd; /** used to store the resulting String. */ private StringBuffer result; /** * Creates a new text extraction renderer. */ public SimpleTextExtractingPdfContentRenderListener() { reset(); } public void reset() { lastYPos = 0f; lastEndingXPos = 0f; lastTextLineMatrix = null; result = new StringBuffer(); } /** * Returns the result so far. * @return a String with the resulting text. */ public String getResultantText(){ return result.toString(); } /** * Writes text to the result. * @param text The text that needs to be displayed * @param gs The current graphics state, including the current font and various spacings needed to compute glyph widths * @param renderInto The rectangle that the result will be rendered into * @param spaceWidth The scaled width of a space character in the current font * @see com.lowagie.text.pdf.parser.RenderListener#renderText(String, GraphicsState, Rectangle) */ /** * Captures text using a simplified algorithm for inserting hard returns and spaces * @see com.lowagie.text.pdf.parser.AbstractRenderListener#renderText(java.lang.String, com.lowagie.text.pdf.parser.GraphicsState, com.lowagie.text.pdf.parser.Matrix, com.lowagie.text.pdf.parser.Matrix) */ public void renderText(TextRenderInfo renderInfo) { boolean firstRender = result.length() == 0; boolean hardReturn = false; Vector start = renderInfo.getStartPoint(); Vector end = renderInfo.getEndPoint(); if (!firstRender){ Vector x0 = start; Vector x1 = lastStart; Vector x2 = lastEnd; // see http://mathworld.wolfram.com/Point-LineDistance2-Dimensional.html float dist = (x2.subtract(x1)).cross((x1.subtract(x0))).lengthSquared() / x2.subtract(x1).lengthSquared(); float sameLineThreshold = 1f; // we should probably base this on the current font metrics, but 1 pt seems to be sufficient for the time being if (dist > sameLineThreshold) hardReturn = true; // Note: Technically, we should check both the start and end positions, in case the angle of the text changed without any displacement // but this sort of thing probably doesn't happen much in reality, so we'll leave it alone for now } if (hardReturn){ //System.out.println("<< Hard Return >>"); result.append('\n'); } else if (!firstRender){ if (result.charAt(result.length()-1) != ' ' && renderInfo.getText().charAt(0) != ' '){ // we only insert a blank space if the trailing character of the previous string wasn't a space, and the leading character of the current string isn't a space float spacing = lastEnd.subtract(start).length(); if (spacing > renderInfo.getSingleSpaceWidth()/2f){ result.append(' '); //System.out.println("Inserting implied space before '" + renderInfo.getText() + "'"); } } } else { //System.out.println("Displaying first string of content '" + text + "' :: x1 = " + x1); } //System.out.println("[" + renderInfo.getStartPoint() + "]->[" + renderInfo.getEndPoint() + "] " + renderInfo.getText()); result.append(renderInfo.getText()); lastStart = start; lastEnd = end; } }