/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.pdfparser;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.persistence.util.COSObjectKey;

/**
 * This class collects all XRef/trailer objects and creates the correct
 * xref/trailer information after all objects are read, using startxref
 * and 'Prev' information (unused XRef/trailer objects are discarded).
 *
 * In case of a missing startxref or a wrong startxref pointer, all
 * XRef/trailer objects are used to create the xref table / trailer dictionary
 * in the order of their byte positions.
 *
 * For each new xref object/XRef stream the method {@link #nextXrefObj(long)}
 * must be called with its start byte position. All following calls to
 * {@link #setXRef(COSObjectKey, long)} or {@link #setTrailer(COSDictionary)}
 * will add the data for this byte position.
 *
 * After all objects are parsed the startxref position must be provided
 * using {@link #setStartxref(long)}. This is used to build the chain of
 * active xref/trailer objects used for creating the document trailer and xref table.
 *
 * @author Timo Böhme (timo.boehme at ontochem.com)
 */
public class XrefTrailerResolver
{

    /**
     * A class which represents a xref/trailer object.
     *
     * Declared static because it never accesses the enclosing resolver;
     * this avoids a hidden reference to the outer instance in every object.
     */
    private static class XrefTrailerObj
    {
        /** Trailer dictionary of this xref section; stays <code>null</code> until one is set. */
        private COSDictionary trailer = null;

        /** Maps object keys to their byte offsets as listed in this xref section. */
        private final Map<COSObjectKey, Long> xrefTable = new HashMap<COSObjectKey, Long>();

        /**
         * Default constructor.
         */
        private XrefTrailerObj()
        {
        }
    }

    /** Log instance. */
    private static final Log LOG = LogFactory.getLog( XrefTrailerResolver.class );

    /** All collected xref/trailer objects, keyed by the byte position they start at. */
    private final Map<Long, XrefTrailerObj> bytePosToXrefMap = new HashMap<Long, XrefTrailerObj>();

    /** The xref/trailer object currently being filled; <code>null</code> before the first {@link #nextXrefObj(long)}. */
    private XrefTrailerObj curXrefTrailerObj = null;

    /** The merged result; <code>null</code> until {@link #setStartxref(long)} was called. */
    private XrefTrailerObj resolvedXrefTrailer = null;

    /**
     * Signals that a new XRef object (table or stream) starts.
     * An object previously registered for the same byte position is replaced.
     *
     * @param startBytePos the offset to start at
     */
    public void nextXrefObj( final long startBytePos )
    {
        curXrefTrailerObj = new XrefTrailerObj();
        bytePosToXrefMap.put( startBytePos, curXrefTrailerObj );
    }

    /**
     * Populate XRef HashMap of current XRef object.
     * Will add an Xreftable entry that maps ObjectKeys to byte offsets in the file.
     * If no XRef object was started yet, a warning is logged and the entry is dropped.
     *
     * @param objKey The objkey, with id and gen numbers
     * @param offset The byte offset in this file
     */
    public void setXRef( COSObjectKey objKey, long offset )
    {
        if ( curXrefTrailerObj == null )
        {
            // should not happen...
            LOG.warn( "Cannot add XRef entry for '" + objKey.getNumber() + "' because XRef start was not signalled." );
            return;
        }
        curXrefTrailerObj.xrefTable.put( objKey, offset );
    }

    /**
     * Adds trailer information for current XRef object.
     * If no XRef object was started yet, a warning is logged and the trailer is dropped.
     *
     * @param trailer the current document trailer dictionary
     */
    public void setTrailer( COSDictionary trailer )
    {
        if ( curXrefTrailerObj == null )
        {
            // should not happen...
            LOG.warn( "Cannot add trailer because XRef start was not signalled." );
            return;
        }
        curXrefTrailerObj.trailer = trailer;
    }

    /**
     * Sets the byte position of the first XRef
     * (has to be called after very last startxref was read).
     * This is used to resolve the chain of active XRef/trailer objects.
     *
     * In case no XRef object is registered at the given position, a
     * warning is logged and all XRef/trailer objects are combined
     * in byte position order.
     * Thus for incomplete PDF documents with missing
     * startxref one could call this method with parameter value -1.
     *
     * Only the first call has an effect; later calls log a warning and return.
     *
     * @param startxrefBytePosValue byte position of the XRef object the
     *                              last startxref points to
     */
    public void setStartxref( long startxrefBytePosValue )
    {
        if ( resolvedXrefTrailer != null )
        {
            LOG.warn( "Method must be called only ones with last startxref value." );
            return;
        }

        resolvedXrefTrailer = new XrefTrailerObj();
        resolvedXrefTrailer.trailer = new COSDictionary();

        XrefTrailerObj curObj = bytePosToXrefMap.get( startxrefBytePosValue );
        // byte positions of the XRef objects to merge, in merge order
        List<Long> xrefSeqBytePos = new ArrayList<Long>();

        if ( curObj == null )
        {
            // no XRef at given position
            LOG.warn( "Did not found XRef object at specified startxref position " + startxrefBytePosValue );

            // use all objects in byte position order (last entries overwrite previous ones)
            xrefSeqBytePos.addAll( bytePosToXrefMap.keySet() );
            Collections.sort( xrefSeqBytePos );
        }
        else
        {
            // found starting Xref object
            // add this and follow chain defined by 'Prev' keys
            xrefSeqBytePos.add( startxrefBytePosValue );
            while ( curObj.trailer != null )
            {
                long prevBytePos = curObj.trailer.getLong( COSName.PREV, -1L );
                if ( prevBytePos == -1 )
                {
                    // no 'Prev' entry: end of chain
                    break;
                }

                curObj = bytePosToXrefMap.get( prevBytePos );
                if ( curObj == null )
                {
                    LOG.warn( "Did not found XRef object pointed to by 'Prev' key at position " + prevBytePos );
                    break;
                }
                xrefSeqBytePos.add( prevBytePos );

                // sanity check to prevent infinite loops:
                // the chain can never be longer than the number of known XRef objects
                if ( xrefSeqBytePos.size() >= bytePosToXrefMap.size() )
                {
                    break;
                }
            }
            // have to reverse order so that later XRefs will overwrite previous ones
            Collections.reverse( xrefSeqBytePos );
        }

        // merge used and sorted XRef/trailer
        for ( Long bPos : xrefSeqBytePos )
        {
            curObj = bytePosToXrefMap.get( bPos );
            if ( curObj.trailer != null )
            {
                resolvedXrefTrailer.trailer.addAll( curObj.trailer );
            }
            resolvedXrefTrailer.xrefTable.putAll( curObj.xrefTable );
        }
    }

    /**
     * Gets the resolved trailer. Might return <code>null</code> in case
     * {@link #setStartxref(long)} was not called before.
     *
     * @return the trailer if available
     */
    public COSDictionary getTrailer()
    {
        return ( resolvedXrefTrailer == null ) ? null : resolvedXrefTrailer.trailer;
    }

    /**
     * Gets the resolved xref table. Might return <code>null</code> in case
     * {@link #setStartxref(long)} was not called before.
     *
     * @return the xrefTable if available
     */
    public Map<COSObjectKey, Long> getXrefTable()
    {
        return ( resolvedXrefTrailer == null ) ? null : resolvedXrefTrailer.xrefTable;
    }
}