/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.pdfbox.pdfparser; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.NoSuchElementException; import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSBoolean; import org.apache.pdfbox.cos.COSDictionary; import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.cos.COSNull; import org.apache.pdfbox.cos.COSNumber; import org.apache.pdfbox.cos.COSObject; import org.apache.pdfbox.cos.COSStream; import org.apache.pdfbox.io.RandomAccess; import org.apache.pdfbox.pdmodel.common.PDStream; import org.apache.pdfbox.util.ImageParameters; import org.apache.pdfbox.util.PDFOperator; /** * This will parse a PDF byte stream and extract operands and such. * * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a> * @version $Revision: 1.32 $ */ public class PDFStreamParser extends BaseParser { private List<Object> streamObjects = new ArrayList<Object>( 100 ); private RandomAccess file; private PDFOperator lastBIToken = null; /** * Constructor that takes a stream to parse. * * @since Apache PDFBox 1.3.0 * @param stream The stream to read data from. * @param raf The random access file. * @param forceParcing flag to skip malformed or otherwise unparseable * input where possible * @throws IOException If there is an error reading from the stream. */ public PDFStreamParser( InputStream stream, RandomAccess raf, boolean forceParsing) throws IOException { super(stream, forceParsing); file = raf; } /** * Constructor that takes a stream to parse. * * @param stream The stream to read data from. * @param raf The random access file. * * @throws IOException If there is an error reading from the stream. */ public PDFStreamParser(InputStream stream, RandomAccess raf) throws IOException { this(stream, raf, FORCE_PARSING); } /** * Constructor. * * @param stream The stream to parse. * * @throws IOException If there is an error initializing the stream. */ public PDFStreamParser( PDStream stream ) throws IOException { this( stream.createInputStream(), stream.getStream().getScratchFile() ); } /** * Constructor. * * @since Apache PDFBox 1.3.0 * @param stream The stream to parse. * @param forceParcing flag to skip malformed or otherwise unparseable * input where possible * @throws IOException If there is an error initializing the stream. */ public PDFStreamParser(COSStream stream, boolean forceParsing) throws IOException { this(stream.getUnfilteredStream(), stream.getScratchFile(), forceParsing); } /** * Constructor. * * @param stream The stream to parse. * * @throws IOException If there is an error initializing the stream. */ public PDFStreamParser( COSStream stream ) throws IOException { this( stream.getUnfilteredStream(), stream.getScratchFile() ); } /** * This will parse the tokens in the stream. This will close the * stream when it is finished parsing. * * @throws IOException If there is an error while parsing the stream. */ public void parse() throws IOException { try { Object token = null; while( (token = parseNextToken()) != null ) { streamObjects.add( token ); //logger().fine( "parsed=" + token ); } } finally { pdfSource.close(); } } /** * This will get the tokens that were parsed from the stream. * * @return All of the tokens in the stream. */ public List<Object> getTokens() { return streamObjects; } public void close() throws IOException { pdfSource.close(); } /** * This will get an iterator which can be used to parse the stream * one token after the other. * * @return an iterator to get one token after the other */ public Iterator<Object> getTokenIterator() { return new Iterator<Object>() { private Object token; private void tryNext() { try { if (token == null) { token = parseNextToken(); } } catch (IOException e) { throw new RuntimeException(e); } } /** {@inheritDoc} */ public boolean hasNext() { tryNext(); return token != null; } /** {@inheritDoc} */ public Object next() { tryNext(); Object tmp = token; if (tmp == null) { throw new NoSuchElementException(); } token = null; return tmp; } /** {@inheritDoc} */ public void remove() { throw new UnsupportedOperationException(); } }; } /** * This will parse the next token in the stream. * * @return The next token in the stream or null if there are no more tokens in the stream. * * @throws IOException If an io error occurs while parsing the stream. */ private Object parseNextToken() throws IOException { Object retval = null; skipSpaces(); int nextByte = pdfSource.peek(); if( ((byte)nextByte) == -1 ) { return null; } char c = (char)nextByte; switch(c) { case '<': { int leftBracket = pdfSource.read();//pull off first left bracket c = (char)pdfSource.peek(); //check for second left bracket pdfSource.unread( leftBracket ); //put back first bracket if(c == '<') { COSDictionary pod = parseCOSDictionary(); skipSpaces(); if((char)pdfSource.peek() == 's') { retval = parseCOSStream( pod, file ); } else { retval = pod; } } else { retval = parseCOSString(); } break; } case '[': // array { retval = parseCOSArray(); break; } case '(': // string retval = parseCOSString(); break; case '/': // name retval = parseCOSName(); break; case 'n': // null { String nullString = readString(); if( nullString.equals( "null") ) { retval = COSNull.NULL; } else { retval = PDFOperator.getOperator( nullString ); } break; } case 't': case 'f': { String next = readString(); if( next.equals( "true" ) ) { retval = COSBoolean.TRUE; break; } else if( next.equals( "false" ) ) { retval = COSBoolean.FALSE; } else { retval = PDFOperator.getOperator( next ); } break; } case 'R': { String line = readString(); if( line.equals( "R" ) ) { retval = new COSObject( null ); } else { retval = PDFOperator.getOperator( line ); } break; } case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case '-': case '+': case '.': { /* We will be filling buf with the rest of the number. Only * allow 1 "." and "-" and "+" at start of number. */ StringBuffer buf = new StringBuffer(); buf.append( c ); pdfSource.read(); boolean dotNotRead = (c != '.'); while( Character.isDigit(( c = (char)pdfSource.peek()) ) || (dotNotRead && (c == '.')) ) { buf.append( c ); pdfSource.read(); if (dotNotRead && (c == '.')) { dotNotRead = false; } } retval = COSNumber.get( buf.toString() ); break; } case 'B': { String next = readString(); retval = PDFOperator.getOperator( next ); if( next.equals( "BI" ) ) { lastBIToken = (PDFOperator)retval; COSDictionary imageParams = new COSDictionary(); lastBIToken.setImageParameters( new ImageParameters( imageParams ) ); Object nextToken = null; while( (nextToken = parseNextToken()) instanceof COSName ) { Object value = parseNextToken(); imageParams.setItem( (COSName)nextToken, (COSBase)value ); } //final token will be the image data, maybe?? PDFOperator imageData = (PDFOperator)nextToken; lastBIToken.setImageData( imageData.getImageData() ); } break; } case 'I': { //ImageParameters imageParams = lastBIToken.getImageParameters(); //int expectedBytes = (int)Math.ceil(imageParams.getHeight() * imageParams.getWidth() * // (imageParams.getBitsPerComponent()/8) ); //Special case for ID operator String id = "" + (char)pdfSource.read() + (char)pdfSource.read(); if( !id.equals( "ID" ) ) { throw new IOException( "Error: Expected operator 'ID' actual='" + id + "'" ); } ByteArrayOutputStream imageData = new ByteArrayOutputStream(); //boolean foundEnd = false; if( this.isWhitespace() ) { //pull off the whitespace character pdfSource.read(); } int twoBytesAgo = 0; int lastByte = pdfSource.read(); int currentByte = pdfSource.read(); int count = 0; //PDF spec is kinda unclear about this. Should a whitespace //always appear before EI? Not sure, I found a PDF //(UnderstandingWebSphereClassLoaders.pdf) which has EI as part //of the image data and will stop parsing prematurely if there is //not a check for <whitespace>EI<whitespace>. while( !(isWhitespace( twoBytesAgo ) && lastByte == 'E' && currentByte == 'I' && isWhitespace() //&& //amyuni2_05d__pdf1_3_acro4x.pdf has image data that //is compressed, so expectedBytes is useless here. //count >= expectedBytes ) && !pdfSource.isEOF() ) { imageData.write( lastByte ); twoBytesAgo = lastByte; lastByte = currentByte; currentByte = pdfSource.read(); count++; } pdfSource.unread( 'I' ); //unread the EI operator pdfSource.unread( 'E' ); retval = PDFOperator.getOperator( "ID" ); ((PDFOperator)retval).setImageData( imageData.toByteArray() ); break; } case ']': { // some ']' around without its previous '[' // this means a PDF is somewhat corrupt but we will continue to parse. pdfSource.read(); retval = COSNull.NULL; // must be a better solution than null... break; } default: { //we must be an operator String operator = readOperator(); if( operator.trim().length() == 0 ) { //we have a corrupt stream, stop reading here retval = null; } else { retval = PDFOperator.getOperator( operator ); } } } return retval; } /** * This will read an operator from the stream. * * @return The operator that was read from the stream. * * @throws IOException If there is an error reading from the stream. */ protected String readOperator() throws IOException { skipSpaces(); //average string size is around 2 and the normal string buffer size is //about 16 so lets save some space. StringBuffer buffer = new StringBuffer(4); int nextChar = pdfSource.peek(); while( nextChar != -1 && // EOF !isWhitespace(nextChar) && !isClosing(nextChar) && nextChar != '[' && nextChar != '<' && nextChar != '(' && nextChar != '/' && (nextChar < '0' || nextChar > '9' ) ) { char currentChar = (char)pdfSource.read(); nextChar = pdfSource.peek(); buffer.append( currentChar ); // Type3 Glyph description has operators with a number in the name if (currentChar == 'd' && (nextChar == '0' || nextChar == '1') ) { buffer.append( (char)pdfSource.read() ); nextChar = pdfSource.peek(); } } return buffer.toString(); } }