/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.pdfbox.util; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Properties; import java.util.Stack; import org.apache.pdfbox.cos.COSDictionary; import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent; import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObject; /** * This is an stream engine to extract the marked content of a pdf. * @author koch * @version $Revision$ */ public class PDFMarkedContentExtractor extends PDFStreamEngine { private boolean suppressDuplicateOverlappingText = true; private List<PDMarkedContent> markedContents = new ArrayList<PDMarkedContent>(); private Stack<PDMarkedContent> currentMarkedContents = new Stack<PDMarkedContent>(); private Map<String, List<TextPosition>> characterListMapping = new HashMap<String, List<TextPosition>>(); /** * encoding that text will be written in (or null). */ protected String outputEncoding; /** * The normalizer is used to remove text ligatures/presentation forms * and to correct the direction of right to left text, such as Arabic and Hebrew. */ private TextNormalize normalize = null; /** * Instantiate a new PDFTextStripper object. This object will load * properties from PDFMarkedContentExtractor.properties and will not * do anything special to convert the text to a more encoding-specific * output. * * @throws IOException If there is an error loading the properties. */ public PDFMarkedContentExtractor() throws IOException { super( ResourceLoader.loadProperties( "org/apache/pdfbox/resources/PDFMarkedContentExtractor.properties", true ) ); this.outputEncoding = null; this.normalize = new TextNormalize(this.outputEncoding); } /** * Instantiate a new PDFTextStripper object. Loading all of the operator mappings * from the properties object that is passed in. Does not convert the text * to more encoding-specific output. * * @param props The properties containing the mapping of operators to PDFOperator * classes. * * @throws IOException If there is an error reading the properties. */ public PDFMarkedContentExtractor( Properties props ) throws IOException { super( props ); this.outputEncoding = null; this.normalize = new TextNormalize(this.outputEncoding); } /** * Instantiate a new PDFTextStripper object. This object will load * properties from PDFMarkedContentExtractor.properties and will apply * encoding-specific conversions to the output text. * * @param encoding The encoding that the output will be written in. * @throws IOException If there is an error reading the properties. */ public PDFMarkedContentExtractor( String encoding ) throws IOException { super( ResourceLoader.loadProperties( "org/apache/pdfbox/resources/PDFMarkedContentExtractor.properties", true )); this.outputEncoding = encoding; this.normalize = new TextNormalize(this.outputEncoding); } /** * This will determine of two floating point numbers are within a specified variance. * * @param first The first number to compare to. * @param second The second number to compare to. * @param variance The allowed variance. */ private boolean within( float first, float second, float variance ) { return second > first - variance && second < first + variance; } public void beginMarkedContentSequence(COSName tag, COSDictionary properties) { PDMarkedContent markedContent = PDMarkedContent.create(tag, properties); if (this.currentMarkedContents.isEmpty()) { this.markedContents.add(markedContent); } else { PDMarkedContent currentMarkedContent = this.currentMarkedContents.peek(); if (currentMarkedContent != null) { currentMarkedContent.addMarkedContent(markedContent); } } this.currentMarkedContents.push(markedContent); } public void endMarkedContentSequence() { if (!this.currentMarkedContents.isEmpty()) { this.currentMarkedContents.pop(); } } public void xobject(PDXObject xobject) { if (!this.currentMarkedContents.isEmpty()) { this.currentMarkedContents.peek().addXObject(xobject); } } /** * This will process a TextPosition object and add the * text to the list of characters on a page. It takes care of * overlapping text. * * @param text The text to process. */ protected void processTextPosition( TextPosition text ) { boolean showCharacter = true; if( this.suppressDuplicateOverlappingText ) { showCharacter = false; String textCharacter = text.getCharacter(); float textX = text.getX(); float textY = text.getY(); List<TextPosition> sameTextCharacters = this.characterListMapping.get( textCharacter ); if( sameTextCharacters == null ) { sameTextCharacters = new ArrayList<TextPosition>(); this.characterListMapping.put( textCharacter, sameTextCharacters ); } // RDD - Here we compute the value that represents the end of the rendered // text. This value is used to determine whether subsequent text rendered // on the same line overwrites the current text. // // We subtract any positive padding to handle cases where extreme amounts // of padding are applied, then backed off (not sure why this is done, but there // are cases where the padding is on the order of 10x the character width, and // the TJ just backs up to compensate after each character). Also, we subtract // an amount to allow for kerning (a percentage of the width of the last // character). // boolean suppressCharacter = false; float tolerance = (text.getWidth()/textCharacter.length())/3.0f; for( int i=0; i<sameTextCharacters.size() && textCharacter != null; i++ ) { TextPosition character = (TextPosition)sameTextCharacters.get( i ); String charCharacter = character.getCharacter(); float charX = character.getX(); float charY = character.getY(); //only want to suppress if( charCharacter != null && //charCharacter.equals( textCharacter ) && within( charX, textX, tolerance ) && within( charY, textY, tolerance ) ) { suppressCharacter = true; } } if( !suppressCharacter ) { sameTextCharacters.add( text ); showCharacter = true; } } if( showCharacter ) { List<TextPosition> textList = new ArrayList<TextPosition>(); /* In the wild, some PDF encoded documents put diacritics (accents on * top of characters) into a separate Tj element. When displaying them * graphically, the two chunks get overlayed. With text output though, * we need to do the overlay. This code recombines the diacritic with * its associated character if the two are consecutive. */ if(textList.isEmpty()) { textList.add(text); } else { /* test if we overlap the previous entry. * Note that we are making an assumption that we need to only look back * one TextPosition to find what we are overlapping. * This may not always be true. */ TextPosition previousTextPosition = (TextPosition)textList.get(textList.size()-1); if(text.isDiacritic() && previousTextPosition.contains(text)) { previousTextPosition.mergeDiacritic(text, this.normalize); } /* If the previous TextPosition was the diacritic, merge it into this * one and remove it from the list. */ else if(previousTextPosition.isDiacritic() && text.contains(previousTextPosition)) { text.mergeDiacritic(previousTextPosition, this.normalize); textList.remove(textList.size()-1); textList.add(text); } else { textList.add(text); } } if (!this.currentMarkedContents.isEmpty()) { this.currentMarkedContents.peek().addText(text); } } } public List<PDMarkedContent> getMarkedContents() { return this.markedContents; } }