/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.pdfbox.util; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.Enumeration; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Properties; import java.util.Set; import java.util.Stack; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSObject; import org.apache.pdfbox.cos.COSStream; import org.apache.pdfbox.exceptions.WrappedIOException; import org.apache.pdfbox.pdfparser.PDFStreamParser; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDResources; import org.apache.pdfbox.pdmodel.common.PDMatrix; import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.pdfbox.pdmodel.font.PDType3Font; import org.apache.pdfbox.pdmodel.graphics.PDExtendedGraphicsState; import org.apache.pdfbox.pdmodel.graphics.PDGraphicsState; import org.apache.pdfbox.pdmodel.graphics.color.PDColorSpace; import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObject; import org.apache.pdfbox.util.operator.OperatorProcessor; /** * This class will run through a PDF content stream and execute certain operations * and provide a callback interface for clients that want to do things with the stream. * See the PDFTextStripper class for an example of how to use this class. * * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a> * @version $Revision: 1.38 $ */ public class PDFStreamEngine { /** * Log instance. */ private static final Log log = LogFactory.getLog(PDFStreamEngine.class); /** * The PDF operators that are ignored by this engine. */ private final Set<String> unsupportedOperators = new HashSet<String>(); private static final byte[] SPACE_BYTES = { (byte)32 }; private PDGraphicsState graphicsState = null; private Matrix textMatrix = null; private Matrix textLineMatrix = null; private Stack<PDGraphicsState> graphicsStack = new Stack<PDGraphicsState>(); private Map<String,OperatorProcessor> operators = new HashMap<String,OperatorProcessor>(); private Stack<StreamResources> streamResourcesStack = new Stack<StreamResources>(); private PDPage page; private Map<String,PDFont> documentFontCache = new HashMap<String,PDFont>(); private int validCharCnt; private int totalCharCnt; /** * Flag to skip malformed or otherwise unparseable input where possible. */ private boolean forceParsing = false; /** * This is a simple internal class used by the Stream engine to handle the * resources stack. */ private static class StreamResources { private Map<String,PDFont> fonts; private Map<String,PDColorSpace> colorSpaces; private Map<String,PDXObject> xobjects; private Map<String,PDExtendedGraphicsState> graphicsStates; private PDResources resources; private StreamResources() {}; } /** * Constructor. */ public PDFStreamEngine() { //default constructor validCharCnt = 0; totalCharCnt = 0; } /** * Constructor with engine properties. The property keys are all * PDF operators, the values are class names used to execute those * operators. An empty value means that the operator will be silently * ignored. * * @param properties The engine properties. * * @throws IOException If there is an error setting the engine properties. */ public PDFStreamEngine( Properties properties ) throws IOException { if( properties == null ) { throw new NullPointerException( "properties cannot be null" ); } Enumeration<?> names = properties.propertyNames(); for ( Object name : Collections.list( names ) ) { String operator = name.toString(); String processorClassName = properties.getProperty( operator ); if( "".equals( processorClassName ) ) { unsupportedOperators.add( operator ); } else { try { Class<?> klass = Class.forName( processorClassName ); OperatorProcessor processor = (OperatorProcessor) klass.newInstance(); registerOperatorProcessor( operator, processor ); } catch( Exception e ) { throw new WrappedIOException( "OperatorProcessor class " + processorClassName + " could not be instantiated", e ); } } } validCharCnt = 0; totalCharCnt = 0; } public boolean isForceParsing() { return forceParsing; } public void setForceParsing(boolean forceParsing) { this.forceParsing = forceParsing; } /** * Register a custom operator processor with the engine. * * @param operator The operator as a string. * @param op Processor instance. */ public void registerOperatorProcessor( String operator, OperatorProcessor op ) { op.setContext( this ); operators.put( operator, op ); } /** * This method must be called between processing documents. The * PDFStreamEngine caches information for the document between pages * and this will release the cached information. This only needs * to be called if processing a new document. * */ public void resetEngine() { documentFontCache.clear(); validCharCnt = 0; totalCharCnt = 0; } /** * This will process the contents of the stream. * * @param aPage The page. * @param resources The location to retrieve resources. * @param cosStream the Stream to execute. * * * @throws IOException if there is an error accessing the stream. */ public void processStream( PDPage aPage, PDResources resources, COSStream cosStream ) throws IOException { graphicsState = new PDGraphicsState(aPage.findCropBox()); textMatrix = null; textLineMatrix = null; graphicsStack.clear(); streamResourcesStack.clear(); processSubStream( aPage, resources, cosStream ); } /** * Process a sub stream of the current stream. * * @param aPage The page used for drawing. * @param resources The resources used when processing the stream. * @param cosStream The stream to process. * * @throws IOException If there is an exception while processing the stream. */ public void processSubStream( PDPage aPage, PDResources resources, COSStream cosStream) throws IOException { page = aPage; if (resources != null) { StreamResources sr = new StreamResources(); sr.fonts = resources.getFonts( documentFontCache ); sr.colorSpaces = resources.getColorSpaces(); sr.xobjects = resources.getXObjects(); sr.graphicsStates = resources.getGraphicsStates(); sr.resources = resources; streamResourcesStack.push(sr); try { processSubStream(cosStream); } finally { streamResourcesStack.pop(); } } else { processSubStream(cosStream); } } private void processSubStream(COSStream cosStream) throws IOException { List<COSBase> arguments = new ArrayList<COSBase>(); PDFStreamParser parser = new PDFStreamParser(cosStream, forceParsing); try { Iterator<Object> iter = parser.getTokenIterator(); while (iter.hasNext()) { Object next = iter.next(); if (log.isDebugEnabled()) { log.debug("processing substream token: " + next); } if (next instanceof COSObject) { arguments.add(((COSObject) next).getObject()); } else if (next instanceof PDFOperator) { processOperator((PDFOperator) next, arguments); arguments = new ArrayList<COSBase>(); } else { arguments.add((COSBase) next); } } } finally { parser.close(); } } /** * A method provided as an event interface to allow a subclass to perform * some specific functionality when text needs to be processed. * * @param text The text to be processed. */ protected void processTextPosition( TextPosition text ) { //subclasses can override to provide specific functionality. } /** * A method provided as an event interface to allow a subclass to perform * some specific functionality on the string encoded by a glyph. * * @param str The string to be processed. */ protected String inspectFontEncoding( String str ) { return str; } /** * Process encoded text from the PDF Stream. * You should override this method if you want to perform an action when * encoded text is being processed. * * @param string The encoded text * * @throws IOException If there is an error processing the string */ public void processEncodedText( byte[] string ) throws IOException { /* Note on variable names. There are three different units being used * in this code. Character sizes are given in glyph units, text locations * are initially given in text units, and we want to save the data in * display units. The variable names should end with Text or Disp to * represent if the values are in text or disp units (no glyph units are saved). */ final float fontSizeText = graphicsState.getTextState().getFontSize(); final float horizontalScalingText = graphicsState.getTextState().getHorizontalScalingPercent()/100f; //float verticalScalingText = horizontalScaling;//not sure if this is right but what else to do??? final float riseText = graphicsState.getTextState().getRise(); final float wordSpacingText = graphicsState.getTextState().getWordSpacing(); final float characterSpacingText = graphicsState.getTextState().getCharacterSpacing(); //We won't know the actual number of characters until //we process the byte data(could be two bytes each) but //it won't ever be more than string.length*2(there are some cases //were a single byte will result in two output characters "fi" final PDFont font = graphicsState.getTextState().getFont(); // all fonts are providing the width/height of a character in thousandths of a unit of text space float fontMatrixXScaling = 1/1000f; float fontMatrixYScaling = 1/1000f; float glyphSpaceToTextSpaceFactor = 1/1000f; // expect Type3 fonts, those are providing the width of a character in glyph space units if (font instanceof PDType3Font) { PDMatrix fontMatrix = font.getFontMatrix(); fontMatrixXScaling = fontMatrix.getValue(0, 0); fontMatrixYScaling = fontMatrix.getValue(1, 1); //This will typically be 1000 but in the case of a type3 font //this might be a different number glyphSpaceToTextSpaceFactor = 1f/fontMatrix.getValue( 0, 0 ); } float spaceWidthText=0; try { // to avoid crash as described in PDFBOX-614 // lets see what the space displacement should be spaceWidthText = (font.getFontWidth( SPACE_BYTES, 0, 1 )*glyphSpaceToTextSpaceFactor); } catch (Throwable exception) { log.warn( exception, exception); } if( spaceWidthText == 0 ) { spaceWidthText = (font.getAverageFontWidth()*glyphSpaceToTextSpaceFactor); //The average space width appears to be higher than necessary //so lets make it a little bit smaller. spaceWidthText *= .80f; } float maxVerticalDisplacementText = 0; Matrix textStateParameters = new Matrix(); textStateParameters.setValue(0,0, fontSizeText*horizontalScalingText); textStateParameters.setValue(1,1, fontSizeText); textStateParameters.setValue(2,1, riseText); int pageRotation = page.findRotation(); float pageHeight = page.findMediaBox().getHeight(); float pageWidth = page.findMediaBox().getWidth(); Matrix ctm = getGraphicsState().getCurrentTransformationMatrix(); Matrix textXctm = new Matrix(); Matrix textMatrixEnd = new Matrix(); Matrix td = new Matrix(); Matrix tempMatrix = new Matrix(); int codeLength = 1; for( int i=0; i<string.length; i+=codeLength) { // Decode the value to a Unicode character codeLength = 1; String c = font.encode( string, i, codeLength ); if( c == null && i+1<string.length) { //maybe a multibyte encoding codeLength++; c = font.encode( string, i, codeLength ); } // the space width has to be transformed into display units float spaceWidthDisp = spaceWidthText * fontSizeText * horizontalScalingText * textMatrix.getValue(0, 0) * ctm.getValue(0, 0); //todo, handle horizontal displacement // get the width and height of this character in text units float characterHorizontalDisplacementText = font.getFontWidth( string, i, codeLength ); float characterVerticalDisplacementText = font.getFontHeight( string, i, codeLength ); // multiply the width/height with the scaling factor characterHorizontalDisplacementText = characterHorizontalDisplacementText * fontMatrixXScaling; characterVerticalDisplacementText = characterVerticalDisplacementText * fontMatrixYScaling; maxVerticalDisplacementText = Math.max( maxVerticalDisplacementText, characterVerticalDisplacementText); // PDF Spec - 5.5.2 Word Spacing // // Word spacing works the same was as character spacing, but applies // only to the space character, code 32. // // Note: Word spacing is applied to every occurrence of the single-byte // character code 32 in a string. This can occur when using a simple // font or a composite font that defines code 32 as a single-byte code. // It does not apply to occurrences of the byte value 32 in multiple-byte // codes. // // RDD - My interpretation of this is that only character code 32's that // encode to spaces should have word spacing applied. Cases have been // observed where a font has a space character with a character code // other than 32, and where word spacing (Tw) was used. In these cases, // applying word spacing to either the non-32 space or to the character // code 32 non-space resulted in errors consistent with this interpretation. // float spacingText = 0; if( (string[i] == 0x20) && codeLength == 1 ) { spacingText += wordSpacingText; } textXctm = textMatrix.multiply(ctm, textXctm); // Convert textMatrix to display units // We need to instantiate a new Matrix instance here as it is passed to the TextPosition constructor below. Matrix textMatrixStart = textStateParameters.multiply(textXctm); // TODO : tx should be set for horizontal text and ty for vertical text // which seems to be specified in the font (not the direction in the matrix). float tx = ((characterHorizontalDisplacementText)*fontSizeText)*horizontalScalingText; float ty = 0; // reset the matrix instead of creating a new one td.reset(); td.setValue( 2, 0, tx ); td.setValue( 2, 1, ty ); // The text matrix gets updated after each glyph is placed. The updated // version will have the X and Y coordinates for the next glyph. // textMatrixEnd contains the coordinates of the end of the last glyph without // taking characterSpacingText and spacintText into account, otherwise it'll be // impossible to detect new words within text extraction tempMatrix = textStateParameters.multiply(td, tempMatrix); textMatrixEnd = tempMatrix.multiply(textXctm, textMatrixEnd); final float endXPosition = textMatrixEnd.getXPosition(); final float endYPosition = textMatrixEnd.getYPosition(); // add some spacing to the text matrix (see comment above) tx = ((characterHorizontalDisplacementText)*fontSizeText+characterSpacingText+spacingText)*horizontalScalingText; td.setValue( 2, 0, tx ); textMatrix = td.multiply(textMatrix, textMatrix ); // determine the width of this character // XXX: Note that if we handled vertical text, we should be using Y here float startXPosition = textMatrixStart.getXPosition(); float widthText = endXPosition - startXPosition; //there are several cases where one character code will //output multiple characters. For example "fi" or a //glyphname that has no mapping like "visiblespace" if( c != null ) { validCharCnt++; } else { // PDFBOX-373: Replace a null entry with "?" so it is // not printed as "(null)" c = "?"; } totalCharCnt++; float totalVerticalDisplacementDisp = maxVerticalDisplacementText * fontSizeText * textMatrix.getYScale(); // process the decoded text processTextPosition( new TextPosition( pageRotation, pageWidth, pageHeight, textMatrixStart, endXPosition, endYPosition, totalVerticalDisplacementDisp, widthText, spaceWidthDisp, c, font, fontSizeText, (int)(fontSizeText * textMatrix.getXScale()) )); } } /** * This is used to handle an operation. * * @param operation The operation to perform. * @param arguments The list of arguments. * * @throws IOException If there is an error processing the operation. */ public void processOperator( String operation, List<COSBase> arguments ) throws IOException { try { PDFOperator oper = PDFOperator.getOperator( operation ); processOperator( oper, arguments ); } catch (IOException e) { log.warn(e, e); } } /** * This is used to handle an operation. * * @param operator The operation to perform. * @param arguments The list of arguments. * * @throws IOException If there is an error processing the operation. */ protected void processOperator( PDFOperator operator, List<COSBase> arguments ) throws IOException { try { String operation = operator.getOperation(); OperatorProcessor processor = (OperatorProcessor)operators.get( operation ); if( processor != null ) { processor.setContext(this); processor.process( operator, arguments ); } else { if (!unsupportedOperators.contains(operation)) { log.info("unsupported/disabled operation: " + operation); unsupportedOperators.add(operation); } } } catch (Exception e) { log.warn(e, e); } } /** * @return Returns the colorSpaces. */ public Map<String,PDColorSpace> getColorSpaces() { return streamResourcesStack.peek().colorSpaces; } /** * @return Returns the colorSpaces. */ public Map<String,PDXObject> getXObjects() { return streamResourcesStack.peek().xobjects; } /** * @param value The colorSpaces to set. */ public void setColorSpaces(Map<String,PDColorSpace> value) { streamResourcesStack.peek().colorSpaces = value; } /** * @return Returns the fonts. */ public Map<String,PDFont> getFonts() { return streamResourcesStack.peek().fonts; } /** * @param value The fonts to set. */ public void setFonts(Map<String,PDFont> value) { streamResourcesStack.peek().fonts = value; } /** * @return Returns the graphicsStack. */ public Stack<PDGraphicsState> getGraphicsStack() { return graphicsStack; } /** * @param value The graphicsStack to set. */ public void setGraphicsStack(Stack<PDGraphicsState> value) { graphicsStack = value; } /** * @return Returns the graphicsState. */ public PDGraphicsState getGraphicsState() { return graphicsState; } /** * @param value The graphicsState to set. */ public void setGraphicsState(PDGraphicsState value) { graphicsState = value; } /** * @return Returns the graphicsStates. */ public Map<String,PDExtendedGraphicsState> getGraphicsStates() { return streamResourcesStack.peek().graphicsStates; } /** * @param value The graphicsStates to set. */ public void setGraphicsStates(Map<String,PDExtendedGraphicsState> value) { ((StreamResources) streamResourcesStack.peek()).graphicsStates = value; } /** * @return Returns the textLineMatrix. */ public Matrix getTextLineMatrix() { return textLineMatrix; } /** * @param value The textLineMatrix to set. */ public void setTextLineMatrix(Matrix value) { textLineMatrix = value; } /** * @return Returns the textMatrix. */ public Matrix getTextMatrix() { return textMatrix; } /** * @param value The textMatrix to set. */ public void setTextMatrix(Matrix value) { textMatrix = value; } /** * @return Returns the resources. */ public PDResources getResources() { return streamResourcesStack.peek().resources; } /** * Get the current page that is being processed. * * @return The page being processed. */ public PDPage getCurrentPage() { return page; } /** * Get the total number of valid characters in the doc * that could be decoded in processEncodedText(). * @return The number of valid characters. */ public int getValidCharCnt() { return validCharCnt; } /** * Get the total number of characters in the doc * (including ones that could not be mapped). * @return The number of characters. */ public int getTotalCharCnt() { return totalCharCnt; } }