/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.pdfbox.pdfparser; import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSDocument; import org.apache.pdfbox.cos.COSObject; import org.apache.pdfbox.cos.COSStream; /** * This will parse a PDF 1.5 object stream and extract all of the objects from the stream. * * @author Ben Litchfield * */ public class PDFObjectStreamParser extends BaseParser { /** * Log instance. */ private static final Log LOG = LogFactory.getLog(PDFObjectStreamParser.class); private List<COSObject> streamObjects = null; private final COSStream stream; /** * Constructor. * * @param stream The stream to parse. * @param document The document for the current parsing. * @throws IOException If there is an error initializing the stream. */ public PDFObjectStreamParser(COSStream stream, COSDocument document) throws IOException { super(new InputStreamSource(stream.createInputStream())); this.stream = stream; this.document = document; } /** * This will parse the tokens in the stream. This will close the * stream when it is finished parsing. * * @throws IOException If there is an error while parsing the stream. */ public void parse() throws IOException { try { //need to first parse the header. int numberOfObjects = stream.getInt( "N" ); List<Long> objectNumbers = new ArrayList<>( numberOfObjects ); streamObjects = new ArrayList<>( numberOfObjects ); for( int i=0; i<numberOfObjects; i++ ) { long objectNumber = readObjectNumber(); // skip offset readLong(); objectNumbers.add( objectNumber); } COSObject object; COSBase cosObject; int objectCounter = 0; while( (cosObject = parseDirObject()) != null ) { object = new COSObject(cosObject); object.setGenerationNumber(0); if (objectCounter >= objectNumbers.size()) { LOG.error("/ObjStm (object stream) has more objects than /N " + numberOfObjects); break; } object.setObjectNumber( objectNumbers.get( objectCounter) ); streamObjects.add( object ); if(LOG.isDebugEnabled()) { LOG.debug( "parsed=" + object ); } // According to the spec objects within an object stream shall not be enclosed // by obj/endobj tags, but there are some pdfs in the wild using those tags // skip endobject marker if present if (!seqSource.isEOF() && seqSource.peek() == 'e') { readLine(); } objectCounter++; } } finally { seqSource.close(); } } /** * This will get the objects that were parsed from the stream. * * @return All of the objects in the stream. */ public List<COSObject> getObjects() { return streamObjects; } }