/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pdfbox.pdfparser;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.IOException;
import java.io.OutputStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.io.PushBackInputStream;
import org.apache.pdfbox.io.RandomAccess;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSBoolean;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.cos.COSInteger;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSNull;
import org.apache.pdfbox.cos.COSNumber;
import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.persistence.util.COSObjectKey;
/**
* This class is used to contain parsing logic that will be used by both the
* PDFParser and the COSStreamParser.
*
* @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
* @version $Revision: 1.61 $
*/
public abstract class BaseParser
{
/**
* Log instance.
*/
private static final Log LOG = LogFactory.getLog(BaseParser.class);
// Character codes of the lower-case letters that spell "endstream" and
// "endobj". They are used to build the ENDSTREAM / ENDOBJ byte arrays and
// for byte-wise keyword matching in parseCOSDictionary and readUntilEndStream.
private static final int E = 'e';
private static final int N = 'n';
private static final int D = 'd';
private static final int S = 's';
private static final int T = 't';
private static final int R = 'r';
private static final int A = 'a';
private static final int M = 'm';
private static final int O = 'o';
private static final int B = 'b';
private static final int J = 'j';
// Size of, and storage for, the chunk buffer that readUntilEndStream fills
// from pdfSource while it scans for the end of the stream data.
private final int strmBufLen = 2048;
private final byte[] strmBuf = new byte[ strmBufLen ];
/**
* The bytes of the keyword "endstream", used for byte-wise comparisons.
*/
public static final byte[] ENDSTREAM =
new byte[] { E, N, D, S, T, R, E, A, M };
/**
* The bytes of the keyword "endobj", used for byte-wise comparisons.
*/
public static final byte[] ENDOBJ =
new byte[] { E, N, D, O, B, J };
/**
* The token "def"; parseCOSDictionary skips it when it follows a dictionary value.
*/
public static final String DEF = "def";
/**
* The keyword "endobj" as a string, used for token comparisons.
*/
private static final String ENDOBJ_STRING = "endobj";
/**
* The keyword "endstream" as a string, used for token comparisons.
*/
private static final String ENDSTREAM_STRING = "endstream";
/**
* The keyword "stream" as a string, used for token comparisons.
*/
private static final String STREAM_STRING = "stream";
/**
* The boolean literal "true", used for token comparisons.
*/
private static final String TRUE = "true";
/**
* The boolean literal "false", used for token comparisons.
*/
private static final String FALSE = "false";
/**
* The literal "null", used for token comparisons.
*/
private static final String NULL = "null";
/**
* Default value of the {@link #forceParsing} flag, read once from the
* system property <code>org.apache.pdfbox.forceParsing</code>.
*/
protected static final boolean FORCE_PARSING =
Boolean.getBoolean("org.apache.pdfbox.forceParsing");
/**
* This is the stream that will be read from. It is set by the constructors
* that take an input; the no-argument constructor leaves it unset.
*/
protected PushBackInputStream pdfSource;
/**
* This is the document that will be parsed; see {@link #setDocument(COSDocument)}.
*/
protected COSDocument document;
/**
* Flag to skip malformed or otherwise unparseable input where possible.
*/
protected final boolean forceParsing;
/**
 * Creates a parser without an input source. Only the
 * {@link #forceParsing} flag is initialised, using {@link #FORCE_PARSING}.
 */
public BaseParser()
{
    forceParsing = FORCE_PARSING;
}
/**
* Constructor.
*
* @since Apache PDFBox 1.3.0
* @param input The input stream to read the data from.
* @param forceParsingValue flag to skip malformed or otherwise unparseable
* input where possible
* @throws IOException If there is an error reading the input stream.
*/
public BaseParser(InputStream input, boolean forceParsingValue)
throws IOException
{
this.pdfSource = new PushBackInputStream(
new BufferedInputStream(input, 16384), 4096);
this.forceParsing = forceParsingValue;
}
/**
 * Creates a parser that reads from the given stream, using
 * {@link #FORCE_PARSING} as the value of the force-parsing flag.
 *
 * @param input The input stream to read the data from.
 * @throws IOException If there is an error reading the input stream.
 */
public BaseParser(InputStream input) throws IOException
{
    this(input, FORCE_PARSING);
}
/**
 * Creates a parser that reads from an in-memory byte array, using
 * {@link #FORCE_PARSING} as the value of the force-parsing flag.
 *
 * @param input The array to read the data from.
 * @throws IOException If there is an error reading the byte data.
 */
protected BaseParser(byte[] input) throws IOException
{
    this(new ByteArrayInputStream(input));
}
/**
 * Sets the document that parsed objects are registered with and
 * looked up in.
 *
 * @param doc The current document.
 */
public void setDocument( COSDocument doc )
{
    this.document = doc;
}
// Tells whether ch is one of 0-9, a-f or A-F.
private static boolean isHexDigit(char ch)
{
    // Explicit range checks are used on purpose: a lookup such as
    // HEXDIGITS.indexOf(ch) != -1 can lead to problems with certain
    // versions of the IBM JIT compiler (and is slower anyway).
    if (ch >= '0' && ch <= '9')
    {
        return true;
    }
    if (ch >= 'a' && ch <= 'f')
    {
        return true;
    }
    return ch >= 'A' && ch <= 'F';
}
/**
 * This will parse the value of a PDF dictionary entry.
 *
 * The value is either a direct object or an indirect reference of the form
 * <code>objNr genNr R</code>. A reference is assumed when the first parsed
 * object is followed by a digit; it is resolved through the object pool of
 * the current document.
 *
 * @return The parsed value; may be <code>null</code> if no direct object
 *         could be parsed.
 *
 * @throws IOException If there is an error parsing the value, including a
 *         malformed indirect reference.
 */
private COSBase parseCOSDictionaryValue() throws IOException
{
    COSBase number = parseDirObject();
    skipSpaces();
    int next = pdfSource.peek();
    if( next < '0' || next > '9' )
    {
        // no generation number follows, so this is a plain direct object
        return number;
    }
    COSBase generationNumber = parseDirObject();
    skipSpaces();
    int r = pdfSource.read();
    if( r != 'R' )
    {
        throw new IOException( "expected='R' actual='" + (char)r + "' " + pdfSource );
    }
    // Malformed files may place something other than an integer here
    // (a name, a real number, nothing at all); report that as a parse
    // error instead of failing with a ClassCastException/NullPointerException.
    if( !(number instanceof COSInteger) )
    {
        throw new IOException( "expected object number actual='" + number + "' " + pdfSource );
    }
    if( !(generationNumber instanceof COSInteger) )
    {
        throw new IOException( "expected generation number actual='" + generationNumber + "' " + pdfSource );
    }
    COSObjectKey key = new COSObjectKey(((COSInteger) number).intValue(),
            ((COSInteger) generationNumber).intValue());
    return document.getObjectFromPool(key);
}
/**
 * This will parse a PDF dictionary.
 *
 * The source must be positioned at the opening <code>&lt;&lt;</code>.
 * Entries whose key does not start with '/' are treated as garbage and
 * skipped until the next key, the end of the dictionary, an
 * "endstream"/"endobj" keyword or the end of the input.
 *
 * @return The parsed dictionary.
 *
 * @throws IOException If there is an error reading the stream.
 */
protected COSDictionary parseCOSDictionary() throws IOException
{
    char c = (char)pdfSource.read();
    if( c != '<')
    {
        throw new IOException( "expected='<' actual='" + c + "'" );
    }
    c = (char)pdfSource.read();
    if( c != '<')
    {
        throw new IOException( "expected='<' actual='" + c + "' " + pdfSource );
    }
    skipSpaces();
    COSDictionary obj = new COSDictionary();
    boolean done = false;
    while( !done )
    {
        skipSpaces();
        c = (char)pdfSource.peek();
        if( c == '>')
        {
            done = true;
        }
        else if( c != '/' )
        {
            //an invalid dictionary, we are expecting
            //the key, read until we can recover
            LOG.warn("Invalid dictionary, found: '" + c + "' but expected: '/'");
            if( skipInvalidDictionaryContent() )
            {
                // end of input or end of the enclosing object reached
                return obj;
            }
        }
        else
        {
            COSName key = parseCOSName();
            COSBase value = parseCOSDictionaryValue();
            skipSpaces();
            if( ((char)pdfSource.peek()) == 'd' )
            {
                //if the next token is 'def' then we are parsing a cmap stream
                //and want to ignore it, otherwise the token is put back.
                String potentialDEF = readString();
                if( !potentialDEF.equals( DEF ) )
                {
                    pdfSource.unread( potentialDEF.getBytes("ISO-8859-1") );
                }
                else
                {
                    skipSpaces();
                }
            }
            if( value == null )
            {
                LOG.warn("Bad Dictionary Declaration " + pdfSource );
            }
            else
            {
                obj.setItem( key, value );
            }
        }
    }
    char ch = (char)pdfSource.read();
    if( ch != '>' )
    {
        throw new IOException( "expected='>' actual='" + ch + "'" );
    }
    ch = (char)pdfSource.read();
    if( ch != '>' )
    {
        throw new IOException( "expected='>' actual='" + ch + "'" );
    }
    return obj;
}

/**
 * Skips unparseable dictionary content until a point where parsing can
 * resume or has to stop.
 *
 * Bytes are consumed until a '/' or '>' is found (which is pushed back),
 * the keyword "endstream" or "endobj" has been consumed, or the end of the
 * input is reached. A byte that breaks a partial keyword match is examined
 * again instead of being dropped, so a '/' or '>' directly following a
 * stray 'e' is not lost.
 *
 * @return <code>true</code> if dictionary parsing must stop (keyword
 *         consumed or end of input), <code>false</code> if the source is
 *         positioned at a '/' or '>'.
 *
 * @throws IOException If there is an error reading the stream.
 */
private boolean skipInvalidDictionaryContent() throws IOException
{
    int read = pdfSource.read();
    while( read != -1 && read != '/' && read != '>' )
    {
        if( read != E )
        {
            read = pdfSource.read();
            continue;
        }
        // possible start of "endstream" or "endobj"; after every step
        // 'read' holds the most recently consumed byte so that a mismatch
        // is re-evaluated by the loop condition
        read = pdfSource.read();
        if( read != N )
        {
            continue;
        }
        read = pdfSource.read();
        if( read != D )
        {
            continue;
        }
        read = pdfSource.read();
        final byte[] keyword;
        if( read == S )
        {
            keyword = ENDSTREAM;
        }
        else if( read == O )
        {
            keyword = ENDOBJ;
        }
        else
        {
            continue;
        }
        // "ends" or "endo" matched so far, compare the remaining bytes
        int matched = 4;
        while( matched < keyword.length )
        {
            read = pdfSource.read();
            if( read != keyword[matched] )
            {
                break;
            }
            matched++;
        }
        if( matched == keyword.length )
        {
            // we're done reading this object!
            return true;
        }
    }
    if( read == -1 )
    {
        return true;
    }
    pdfSource.unread( read );
    return false;
}
/**
 * This will read a COSStream from the input stream.
 *
 * The source must be positioned at the "stream" keyword. The stream data
 * is copied into the filtered stream of a new COSStream until an
 * "endstream" (or, for broken files, "endobj") keyword is found.
 *
 * @param dic The dictionary that goes with this stream.
 * @param file The file to write the stream to when reading.
 *
 * @return The parsed pdf stream.
 *
 * @throws IOException If there is an error reading the stream.
 */
protected COSStream parseCOSStream( COSDictionary dic, RandomAccess file ) throws IOException
{
    COSStream stream = new COSStream( dic, file );
    OutputStream out = null;
    try
    {
        String streamString = readString();
        if (!streamString.equals(STREAM_STRING))
        {
            throw new IOException("expected='stream' actual='" + streamString + "'");
        }
        //PDF Ref 3.2.7 A stream must be followed by either
        //a CRLF or LF but nothing else.
        int whitespace = pdfSource.read();
        //see brother_scan_cover.pdf, it adds whitespaces
        //after the stream but before the start of the
        //data, so just read those first
        while (whitespace == 0x20)
        {
            whitespace = pdfSource.read();
        }
        if( whitespace == 0x0D )
        {
            whitespace = pdfSource.read();
            // A lone CR is invalid according to the spec but it happens in
            // the real world, so the byte after it belongs to the data.
            // Never push back the EOF marker: unread(-1) would inject a
            // bogus 0xFF byte into the source.
            if( whitespace != 0x0A && whitespace != -1 )
            {
                pdfSource.unread( whitespace );
            }
        }
        else if( whitespace != 0x0A && whitespace != -1 )
        {
            //we are in an error.
            //but again we will do a lenient parsing and just assume that everything
            //is fine
            pdfSource.unread( whitespace );
        }
        /*This needs to be dic.getItem because when we are parsing, the underlying object
         * might still be null.
         */
        COSBase streamLength = dic.getItem(COSName.LENGTH);
        out = stream.createFilteredStream( streamLength );
        readUntilEndStream(out);
        skipSpaces();
        String endStream = readString();
        if (!endStream.equals(ENDSTREAM_STRING))
        {
            /*
             * Sometimes stream objects don't have an endstream tag so readUntilEndStream(out)
             * also can stop on endobj tags. If that's the case we need to make sure to unread
             * the endobj so parseObject() can handle that case normally.
             */
            if (endStream.startsWith(ENDOBJ_STRING))
            {
                pdfSource.unread(endStream.getBytes("ISO-8859-1"));
            }
            /*
             * Some PDF files don't contain a new line after endstream so we
             * need to make sure that the next object number is getting read separately
             * and not part of the endstream keyword. Ex. Some files would have "endstream8"
             * instead of "endstream"
             */
            else if(endStream.startsWith(ENDSTREAM_STRING))
            {
                String extra = endStream.substring(ENDSTREAM_STRING.length());
                pdfSource.unread(extra.getBytes("ISO-8859-1"));
            }
            else
            {
                /*
                 * If for some reason we get something else here, Read until we find the next
                 * "endstream"
                 */
                readUntilEndStream( out );
                endStream = readString();
                if( !endStream.equals( ENDSTREAM_STRING ) )
                {
                    throw new IOException("expected='endstream' actual='" + endStream + "' " + pdfSource);
                }
            }
        }
    }
    finally
    {
        if( out != null )
        {
            out.close();
        }
    }
    return stream;
}
/**
* This method will read through the current stream object until
* we find the keyword "endstream" meaning we're at the end of this
* object. Some pdf files, however, forget to write some endstream tags
* and just close off objects with an "endobj" tag so we have to handle
* this case as well.
*
* All bytes in front of the keyword are written to <code>out</code>; the
* keyword itself and everything buffered behind it are pushed back into
* the source. If the source ends before a keyword is found, a trailing
* partial keyword match is not written to <code>out</code>.
*
* This method is optimized using buffered IO and reduced number of
* byte compare operations.
*
* @param out stream we write out to.
*
* @throws IOException If reading from the source or writing to
* <code>out</code> fails.
*/
private void readUntilEndStream( final OutputStream out ) throws IOException
{
int bufSize;
// number of leading keyword bytes matched so far; survives buffer refills
int charMatchCount = 0;
// keyword currently being matched: ENDSTREAM by default, ENDOBJ after "endo" was seen
byte[] keyw = ENDSTREAM;
final int quickTestOffset = 5; // last character position of shortest keyword ('endobj')
// read next chunk into buffer; already matched chars are added to beginning of buffer
while ( ( bufSize = pdfSource.read( strmBuf, charMatchCount, strmBufLen - charMatchCount ) ) > 0 )
{
// bufSize now counts the carried-over matched bytes plus the freshly read ones
bufSize += charMatchCount;
// scanning starts behind the carried-over bytes
int bIdx = charMatchCount;
int quickTestIdx;
// iterate over buffer, trying to find keyword match
for ( int maxQuicktestIdx = bufSize - quickTestOffset; bIdx < bufSize; bIdx++ )
{
// reduce compare operations by first test last character we would have to
// match if current one matches; if it is not a character from keywords
// we can move behind the test character;
// this shortcut is inspired by Boyer–Moore string search algorithm
// and can reduce parsing time by approx. 20%
if ( ( charMatchCount == 0 ) &&
( ( quickTestIdx = bIdx + quickTestOffset ) < maxQuicktestIdx ) )
{
final byte ch = strmBuf[quickTestIdx];
// all keyword letters lie between 'a' and 't'
if ( ( ch > 't' ) || ( ch < 'a' ) )
{
// last character we would have to match if current character would match
// is not a character from keywords -> jump behind and start over
bIdx = quickTestIdx;
continue;
}
}
final byte ch = strmBuf[bIdx]; // could be negative - but we only compare to ASCII
if ( ch == keyw[ charMatchCount ] )
{
if ( ++charMatchCount == keyw.length )
{
// match found
bIdx++;
break;
}
}
else
{
// "end" matched and the 4th byte is 'o' instead of 's'
if ( ( charMatchCount == 3 ) && ( ch == ENDOBJ[ charMatchCount ] ) )
{
// maybe ENDSTREAM is missing but we could have ENDOBJ
keyw = ENDOBJ;
charMatchCount++;
}
else
{
// no match; incrementing match start by 1 would be dumb since we already know matched chars
// depending on current char read we may already have beginning of a new match:
// 'e': first char matched;
// 'n': if we are at match position idx 7 we already read 'e' thus 2 chars matched
// for each other char we have to start matching first keyword char beginning with next
// read position
charMatchCount = ( ch == E ) ? 1 : ( ( ch == N ) && ( charMatchCount == 7 ) ) ? 2 : 0;
// search again for 'endstream'
keyw = ENDSTREAM;
}
}
} // for
// everything in front of the (partial) keyword match is stream content
int contentBytes = Math.max( 0, bIdx - charMatchCount );
// write buffer content until first matched char to output stream
if ( contentBytes > 0 )
{
out.write( strmBuf, 0, contentBytes );
}
if ( charMatchCount == keyw.length )
{
// keyword matched; unread matched keyword (endstream/endobj) and following buffered content
pdfSource.unread( strmBuf, contentBytes, bufSize - contentBytes );
break;
}
else
{
// copy matched chars at start of buffer
System.arraycopy( keyw, 0, strmBuf, 0, charMatchCount );
}
} // while
}
/**
 * Works around literal strings whose parentheses are unbalanced because of
 * a bug in the producing application.
 *
 * Two broken forms have been seen in the wild:
 * <pre>
 *   /Title ( (5)
 *   /Creator ...
 * </pre>
 * (seen in "Bull River Rules and Regulations.pdf", written by sspdflib) and
 * <pre>
 *   /Title (c:\)
 *   /Producer ...
 * </pre>
 * In both cases the closing parenthesis is followed by a line break and
 * the '/' of the next dictionary key. If the next bytes are
 * CR LF '/' or CR '/', the string is therefore assumed to end here and the
 * brace counter is reset to zero. The inspected bytes are always pushed
 * back.
 *
 * @param bracesParameter the number of braces currently open.
 *
 * @return the corrected value of the brace counter
 * @throws IOException If there is an error reading from the stream.
 */
private int checkForMissingCloseParen(final int bracesParameter) throws IOException
{
    final byte[] lookahead = new byte[3];
    final int count = pdfSource.read(lookahead);
    boolean forceClose = false;
    // the heuristic is only applied when three bytes could be inspected
    if (count == 3 && lookahead[0] == 0x0d)
    {
        // carriage return, new line, slash
        final boolean crLfSlash = lookahead[1] == 0x0a && lookahead[2] == 0x2f;
        // carriage return directly followed by a slash
        final boolean crSlash = lookahead[1] == 0x2f;
        forceClose = crLfSlash || crSlash;
    }
    if (count > 0)
    {
        pdfSource.unread( lookahead, 0, count );
    }
    return forceClose ? 0 : bracesParameter;
}
/**
* This will parse a PDF string.
*
* The source must be positioned at the opening '(' of a literal string or
* the opening '&lt;' of a hex string; hex strings are delegated to
* parseCOSHexString(). For literal strings nested parentheses are counted
* and backslash escapes (\n, \r, \t, \b, \f, \(, \), \\, octal codes and
* line continuations) are decoded. An unknown escape is kept literally,
* including the backslash.
*
* @return The parsed PDF string.
*
* @throws IOException If there is an error reading from the stream.
*/
protected COSString parseCOSString() throws IOException
{
char nextChar = (char)pdfSource.read();
COSString retval = new COSString();
char openBrace;
char closeBrace;
if( nextChar == '(' )
{
openBrace = '(';
closeBrace = ')';
}
else if( nextChar == '<' )
{
return parseCOSHexString();
}
else
{
throw new IOException( "parseCOSString string should start with '(' or '<' and not '" +
nextChar + "' " + pdfSource );
}
//This is the number of braces currently open; the string ends when it drops to 0
//
int braces = 1;
int c = pdfSource.read();
while( braces > 0 && c != -1)
{
char ch = (char)c;
// look-ahead byte already consumed by a branch below; -2 means none was consumed
int nextc = -2; // not yet read
if(ch == closeBrace)
{
braces--;
// may force the counter to 0 if the string looks unterminated
braces = checkForMissingCloseParen(braces);
if( braces != 0 )
{
// a nested closing brace is part of the string content
retval.append( ch );
}
}
else if( ch == openBrace )
{
braces++;
retval.append( ch );
}
else if( ch == '\\' )
{
//patched by ram
char next = (char)pdfSource.read();
switch(next)
{
case 'n':
retval.append( '\n' );
break;
case 'r':
retval.append( '\r' );
break;
case 't':
retval.append( '\t' );
break;
case 'b':
retval.append( '\b' );
break;
case 'f':
retval.append( '\f' );
break;
case ')':
// PDFBox 276 /Title (c:\)
// the "escaped" brace may really be the end of the string; in that case
// the backslash is kept as content instead of the brace
braces = checkForMissingCloseParen(braces);
if( braces != 0 )
{
retval.append( next );
}
else
{
retval.append('\\');
}
break;
case '(':
case '\\':
retval.append( next );
break;
case 10:
case 13:
//this is a break in the line so ignore it and the newline and continue
c = pdfSource.read();
while( isEOL(c) && c != -1)
{
c = pdfSource.read();
}
// the first byte behind the line break is handled by the next iteration
nextc = c;
break;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
{
// octal character code made of one to three digits
StringBuffer octal = new StringBuffer();
octal.append( next );
c = pdfSource.read();
char digit = (char)c;
if( digit >= '0' && digit <= '7' )
{
octal.append( digit );
c = pdfSource.read();
digit = (char)c;
if( digit >= '0' && digit <= '7' )
{
octal.append( digit );
}
else
{
// not part of the octal code, process it in the next iteration
nextc = c;
}
}
else
{
// not part of the octal code, process it in the next iteration
nextc = c;
}
int character = 0;
try
{
character = Integer.parseInt( octal.toString(), 8 );
}
catch( NumberFormatException e )
{
throw new IOException( "Error: Expected octal character, actual='" + octal + "'" );
}
retval.append( character );
break;
}
default:
{
retval.append( '\\' );
retval.append( next );
//another problem with PDF's, sometimes the \ doesn't really
//mean escape like the PDF spec says it does, sometimes is should be literal
//which is what we will assume here.
//throw new IOException( "Unexpected break sequence '" + next + "' " + pdfSource );
}
}
}
else
{
retval.append( ch );
}
// continue with the look-ahead byte if one was consumed, otherwise read the next byte
if (nextc != -2)
{
c = nextc;
}
else
{
c = pdfSource.read();
}
}
// the byte read after the closing brace does not belong to the string
if (c != -1)
{
pdfSource.unread(c);
}
return retval;
}
/**
 * Parses a PDF hex string with fail-fast semantics: parsing stops with an
 * exception as soon as a character is found that is not allowed in a hex
 * string. This is needed to detect malformed input and to be able to skip
 * to the start of the next object.
 *
 * The opening '&lt;' must already have been read.
 *
 * @return The parsed PDF string.
 *
 * @throws IOException If there is an error reading from the stream.
 */
private final COSString parseCOSHexString() throws IOException
{
    final StringBuilder hexDigits = new StringBuilder();
    // consume bytes up to and including the closing '>'
    for( int c = pdfSource.read(); c != '>'; c = pdfSource.read() )
    {
        if( isHexDigit( (char)c ) )
        {
            hexDigits.append( (char) c );
            continue;
        }
        if( c < 0 )
        {
            throw new IOException( "Missing closing bracket for hex string. Reached EOS." );
        }
        switch( c )
        {
            case ' ':
            case '\n':
            case '\t':
            case '\r':
            case '\b':
            case '\f':
                // whitespace between the digits is ignored
                break;
            default:
                // character is neither a hex char nor end of string not EOS nor whitespace
                throw new IOException( "Not allowed character in hex string; char code: " + c );
        }
    }
    return COSString.createFromHexString( hexDigits.toString(), forceParsing );
}
/**
 * This will parse a PDF array object.
 *
 * The source must be positioned at the opening '['. Indirect references
 * (<code>objNr genNr R</code>) are resolved through the object pool of the
 * current document; broken references and unparseable elements are skipped
 * with a warning. Parsing also stops when an "endobj" or "endstream"
 * keyword is found inside the array.
 *
 * @return The parsed PDF array.
 *
 * @throws IOException If there is an error parsing the stream.
 */
protected COSArray parseCOSArray() throws IOException
{
    char ch = (char)pdfSource.read();
    if( ch != '[')
    {
        throw new IOException( "expected='[' actual='" + ch + "'" );
    }
    COSArray po = new COSArray();
    COSBase pbo = null;
    skipSpaces();
    int i = 0;
    while( ((i = pdfSource.peek()) > 0) && ((char)i != ']') )
    {
        pbo = parseDirObject();
        if( pbo instanceof COSObject )
        {
            // An 'R' was read: the two preceding elements must be the object and
            // generation number. We have to check if the expected values are there
            // or not (PDFBOX-385); the size checks keep a reference with fewer than
            // two preceding elements (e.g. "[R]" or "[1 R]") from accessing index -1.
            if (po.size() > 0 && po.get(po.size()-1) instanceof COSInteger)
            {
                COSInteger genNumber = (COSInteger)po.remove( po.size() -1 );
                if (po.size() > 0 && po.get(po.size()-1) instanceof COSInteger)
                {
                    COSInteger number = (COSInteger)po.remove( po.size() -1 );
                    COSObjectKey key = new COSObjectKey(number.intValue(), genNumber.intValue());
                    pbo = document.getObjectFromPool(key);
                }
                else
                {
                    // the object reference is somehow wrong
                    pbo = null;
                }
            }
            else
            {
                pbo = null;
            }
        }
        if( pbo != null )
        {
            po.add( pbo );
        }
        else
        {
            //it could be a bad object in the array which is just skipped
            LOG.warn("Corrupt object reference" );
            // This could also be an "endobj" or "endstream" which means we can assume that
            // the array has ended.
            String isThisTheEnd = readString();
            pdfSource.unread(isThisTheEnd.getBytes("ISO-8859-1"));
            if(ENDOBJ_STRING.equals(isThisTheEnd) || ENDSTREAM_STRING.equals(isThisTheEnd))
            {
                return po;
            }
        }
        skipSpaces();
    }
    pdfSource.read(); //read ']'
    skipSpaces();
    return po;
}
/**
 * Determine if a character terminates a PDF name.
 *
 * Besides the delimiter and whitespace characters, the end-of-file marker
 * is treated as a terminator. Callers pass the result of a stream read
 * narrowed to <code>char</code>, so EOF (-1) arrives here as
 * <code>(char)-1</code>.
 *
 * @param ch The character
 * @return <code>true</code> if the character terminates a PDF name, otherwise <code>false</code>.
 */
protected boolean isEndOfName(char ch)
{
    return (ch == ' ' || ch == 13 || ch == 10 || ch == 9 || ch == '>' || ch == '<'
        || ch == '[' || ch =='/' || ch ==']' || ch ==')' || ch =='(' ||
        // EOF: a char is never equal to the int -1, so compare against the
        // narrowed value instead
        ch == (char)-1
    );
}
/**
 * This will parse a PDF name from the stream.
 *
 * The source must be positioned at the leading '/'. The name ends at the
 * first character for which {@link #isEndOfName(char)} is true or at the
 * end of the input; the terminating character is pushed back.
 *
 * @return The parsed PDF name.
 *
 * @throws IOException If there is an error reading from the stream.
 */
protected COSName parseCOSName() throws IOException
{
    int c = pdfSource.read();
    if( (char)c != '/')
    {
        throw new IOException("expected='/' actual='" + (char)c + "'-" + c + " " + pdfSource );
    }
    // build the name
    StringBuilder buffer = new StringBuilder();
    c = pdfSource.read();
    while( c != -1 )
    {
        char ch = (char)c;
        if(ch == '#')
        {
            // keep the raw int values so that EOF (-1) stays distinguishable
            // from real data
            int ch1 = pdfSource.read();
            int ch2 = pdfSource.read();
            // Prior to PDF v1.2, the # was not a special character. Also,
            // it has been observed that various PDF tools do not follow the
            // spec with respect to the # escape, even though they report
            // PDF versions of 1.2 or later. The solution here is that we
            // interpret the # as an escape only when it is followed by two
            // valid hex digits.
            //
            if (isHexDigit((char)ch1) && isHexDigit((char)ch2))
            {
                int code = (Character.digit((char)ch1, 16) << 4) | Character.digit((char)ch2, 16);
                buffer.append( (char) code );
                c = pdfSource.read();
            }
            else
            {
                // literal '#': put the second byte back (unless the input
                // ended) and continue with the first one
                if (ch2 != -1)
                {
                    pdfSource.unread(ch2);
                }
                c = ch1;
                buffer.append( ch );
            }
        }
        else if (isEndOfName(ch))
        {
            break;
        }
        else
        {
            buffer.append( ch );
            c = pdfSource.read();
        }
    }
    if (c != -1)
    {
        pdfSource.unread(c);
    }
    return COSName.getPDFName( buffer.toString() );
}
/**
 * This will parse a boolean object from the stream.
 *
 * The next character decides which literal is expected: 't' for "true",
 * 'f' for "false". Exactly as many bytes as the literal is long are read.
 *
 * @return The parsed boolean object.
 *
 * @throws IOException If an IO error occurs during parsing or the input
 *         is not a boolean literal.
 */
protected COSBoolean parseBoolean() throws IOException
{
    COSBoolean retval = null;
    char c = (char)pdfSource.peek();
    if( c == 't' )
    {
        String trueString = new String( pdfSource.readFully( 4 ), "ISO-8859-1" );
        if( !trueString.equals( TRUE ) )
        {
            throw new IOException( "Error parsing boolean: expected='true' actual='" + trueString + "'" );
        }
        else
        {
            retval = COSBoolean.TRUE;
        }
    }
    else if( c == 'f' )
    {
        String falseString = new String( pdfSource.readFully( 5 ), "ISO-8859-1" );
        if( !falseString.equals( FALSE ) )
        {
            // report the literal that was actually expected in this branch
            throw new IOException( "Error parsing boolean: expected='false' actual='" + falseString + "'" );
        }
        else
        {
            retval = COSBoolean.FALSE;
        }
    }
    else
    {
        throw new IOException( "Error parsing boolean expected='t or f' actual='" + c + "'" );
    }
    return retval;
}
/**
 * Parses the next direct object from the stream. The kind of object is
 * chosen from the first character after any leading whitespace and
 * comments: dictionary or hex string ('&lt;'), array ('['), literal string
 * ('('), name ('/'), null ('n'), booleans ('t'/'f'), the 'R' of an
 * indirect reference, or a number.
 *
 * Any other token is consumed and ignored, except that "endobj" and
 * "endstream" are pushed back so that the caller will see them.
 *
 * @return The parsed object, or <code>null</code> at the end of the input
 *         or when an unknown token was skipped.
 *
 * @throws IOException If there is an error during parsing.
 */
protected COSBase parseDirObject() throws IOException
{
    skipSpaces();
    final char c = (char)pdfSource.peek();
    switch( c )
    {
        case '<':
        {
            // look at the character behind the first bracket to tell a
            // dictionary ("<<") from a hex string ("<")
            int leftBracket = pdfSource.read();
            char second = (char)pdfSource.peek();
            pdfSource.unread( leftBracket );
            if( second != '<' )
            {
                return parseCOSString();
            }
            COSDictionary dictionary = parseCOSDictionary();
            skipSpaces();
            return dictionary;
        }
        case '[':
            return parseCOSArray();
        case '(':
            return parseCOSString();
        case '/':
            return parseCOSName();
        case 'n':
        {
            String nullString = readString();
            if( !nullString.equals( NULL) )
            {
                throw new IOException("Expected='null' actual='" + nullString + "'");
            }
            return COSNull.NULL;
        }
        case 't':
        {
            String trueString = new String( pdfSource.readFully(4), "ISO-8859-1" );
            if( !trueString.equals( TRUE ) )
            {
                throw new IOException( "expected true actual='" + trueString + "' " + pdfSource );
            }
            return COSBoolean.TRUE;
        }
        case 'f':
        {
            String falseString = new String( pdfSource.readFully(5), "ISO-8859-1" );
            if( !falseString.equals( FALSE ) )
            {
                throw new IOException( "expected false actual='" + falseString + "' " + pdfSource );
            }
            return COSBoolean.FALSE;
        }
        case 'R':
            // reference marker; the caller combines it with the preceding numbers
            pdfSource.read();
            return new COSObject(null);
        case (char)-1:
            // end of input
            return null;
        default:
            break;
    }

    if( Character.isDigit(c) || c == '-' || c == '+' || c == '.')
    {
        // collect every character that can be part of a number
        StringBuilder number = new StringBuilder();
        int ic = pdfSource.read();
        while( true )
        {
            char part = (char)ic;
            boolean numeric = Character.isDigit( part ) ||
                    part == '-' ||
                    part == '+' ||
                    part == '.' ||
                    part == 'E' ||
                    part == 'e';
            if( !numeric )
            {
                break;
            }
            number.append( part );
            ic = pdfSource.read();
        }
        if( ic != -1 )
        {
            pdfSource.unread( ic );
        }
        return COSNumber.get( number.toString() );
    }

    //This is not suppose to happen, but we will allow for it
    //so we are more compatible with POS writers that don't
    //follow the spec
    String badString = readString();
    if( badString == null || badString.length() == 0 )
    {
        int peek = pdfSource.peek();
        // we can end up in an infinite loop otherwise
        throw new IOException( "Unknown dir object c='" + c +
                "' cInt=" + (int)c + " peek='" + (char)peek + "' peekInt=" + peek + " " + pdfSource.getOffset() );
    }
    // if it's an endstream/endobj, we want to put it back so the caller will see it
    if(ENDOBJ_STRING.equals(badString) || ENDSTREAM_STRING.equals(badString))
    {
        pdfSource.unread(badString.getBytes("ISO-8859-1"));
    }
    return null;
}
/**
 * Reads the next token from the stream. Leading whitespace and comments
 * are skipped; the token ends at the first character that terminates a
 * name, at a closing ']' or at the end of the input. The terminating
 * character is pushed back.
 *
 * @return The string that was read from the stream.
 *
 * @throws IOException If there is an error reading from the stream.
 */
protected String readString() throws IOException
{
    skipSpaces();
    StringBuilder token = new StringBuilder();
    int c;
    for( c = pdfSource.read();
         !isEndOfName((char)c) && !isClosing(c) && c != -1;
         c = pdfSource.read() )
    {
        token.append( (char)c );
    }
    if (c != -1)
    {
        pdfSource.unread(c);
    }
    return token.toString();
}
/**
 * Reads the given string from the stream. Leading whitespace is skipped,
 * then the input is compared character by character with the expected
 * string. End-of-line characters following the matched text are consumed
 * as well.
 *
 * On the first mismatch the characters read so far (including the
 * mismatching one) are pushed back and an exception is thrown.
 *
 * @param theString The next expected string in the stream.
 *
 * @return The characters that were read and matched.
 *
 * @throws IOException If there is an error reading from the stream or theString does not match what was read.
 */
protected String readExpectedString( String theString ) throws IOException
{
    int c = pdfSource.read();
    while( isWhitespace(c) && c != -1)
    {
        c = pdfSource.read();
    }
    final int expectedLength = theString.length();
    StringBuilder matched = new StringBuilder( expectedLength );
    for( int index = 0; !isEOL(c) && c != -1 && index < expectedLength; index++ )
    {
        char next = (char)c;
        matched.append( next );
        if( theString.charAt( index ) != next )
        {
            pdfSource.unread(matched.toString().getBytes("ISO-8859-1"));
            throw new IOException( "Error: Expected to read '" + theString +
                    "' instead started reading '" +matched.toString() + "'" );
        }
        c = pdfSource.read();
    }
    // swallow the line break behind the matched text
    while( isEOL(c) && c != -1 )
    {
        c = pdfSource.read();
    }
    if (c != -1)
    {
        pdfSource.unread(c);
    }
    return matched.toString();
}
/**
 * Reads the next token from the stream, limited to a maximum length.
 * Leading whitespace and comments are skipped; the token ends at
 * whitespace, at ']', '[', '&lt;', '(' or '/', at the end of the input or
 * when the maximum length is reached. The character that ended the token
 * is pushed back.
 *
 * @param length The length to stop reading at.
 *
 * @return The string that was read from the stream of length 0 to length.
 *
 * @throws IOException If there is an error reading from the stream.
 */
protected String readString( int length ) throws IOException
{
    skipSpaces();
    int c = pdfSource.read();
    //average string size is around 2 and the normal string buffer size is
    //about 16 so lets save some space.
    StringBuilder token = new StringBuilder(length);
    while( true )
    {
        boolean stop = isWhitespace(c) || isClosing(c) || c == -1 || token.length() >= length ||
                c == '[' ||
                c == '<' ||
                c == '(' ||
                c == '/';
        if( stop )
        {
            break;
        }
        token.append( (char)c );
        c = pdfSource.read();
    }
    if (c != -1)
    {
        pdfSource.unread(c);
    }
    return token.toString();
}
/**
 * Tells whether the next character in the stream closes a PDF array.
 * The character is only peeked at, not consumed.
 *
 * @return true if the next byte is ']', false otherwise.
 *
 * @throws IOException If an IO error occurs.
 */
protected boolean isClosing() throws IOException
{
    final int upcoming = pdfSource.peek();
    return isClosing(upcoming);
}
/**
 * Tells whether the given character closes a PDF array.
 *
 * @param c The character to check
 * @return true if the character is ']', false otherwise.
 */
protected boolean isClosing(int c)
{
    return ']' == c;
}
/**
 * Reads bytes up to the first end of line marker, which is consumed but
 * not returned.
 * Note: if you later unread the results of this function, you'll
 * need to add a newline character to the end of the string.
 *
 * @return The characters between the current position and the end of the line.
 *
 * @throws IOException If there is an error reading from the stream or the
 *         stream is already at its end.
 */
protected String readLine() throws IOException
{
    if (pdfSource.isEOF())
    {
        throw new IOException( "Error: End-of-File, expected line");
    }
    final StringBuilder line = new StringBuilder( 11 );
    for( int c = pdfSource.read(); c != -1 && !isEOL(c); c = pdfSource.read() )
    {
        line.append( (char)c );
    }
    return line.toString();
}
/**
 * Tells whether the next byte in the stream is an end of line byte.
 * The byte is only peeked at, not consumed.
 *
 * @return true if the next byte is 0x0A or 0x0D.
 *
 * @throws IOException If there is an error reading from the stream.
 */
protected boolean isEOL() throws IOException
{
    final int upcoming = pdfSource.peek();
    return isEOL(upcoming);
}
/**
 * Tells whether the given byte is an end of line byte.
 *
 * @param c The character to check against end of line
 * @return true if the byte is 0x0A (line feed) or 0x0D (carriage return).
 */
protected boolean isEOL(int c)
{
    return c == '\n' || c == '\r';
}
/**
 * Tells whether the next byte in the stream is whitespace.
 * The byte is only peeked at, not consumed.
 *
 * @return true if the next byte in the stream is a whitespace character.
 *
 * @throws IOException If there is an error reading from the stream.
 */
protected boolean isWhitespace() throws IOException
{
    final int upcoming = pdfSource.peek();
    return isWhitespace( upcoming );
}
/**
 * Tells whether the given byte is a whitespace character. The values are
 * specified in table 1 (page 12) of ISO 32000-1:2008.
 *
 * @param c The character to check against whitespace
 * @return true if the byte is a whitespace character.
 */
protected boolean isWhitespace( int c )
{
    switch( c )
    {
        case 0:  // null
        case 9:  // horizontal tab
        case 10: // line feed
        case 12: // form feed
        case 13: // carriage return
        case 32: // space
            return true;
        default:
            return false;
    }
}
/**
 * Skips all whitespace and comments at the current position. A comment
 * starts with '%' and runs to the end of the line. The first byte that is
 * neither whitespace nor part of a comment is pushed back.
 *
 * @throws IOException If there is an error reading from the stream.
 */
protected void skipSpaces() throws IOException
{
    int c = pdfSource.read();
    while( true )
    {
        if( c == 37 ) //37 is the % character, a comment
        {
            // skip past the comment section; the end of line byte itself
            // is handled as whitespace by the next iteration
            do
            {
                c = pdfSource.read();
            }
            while( !isEOL(c) && c != -1 );
        }
        else if( c == 0 || c == 9 || c == 12 || c == 10
                || c == 13 || c == 32 )
        {
            // same test as isWhitespace(c), inlined because it is faster
            c = pdfSource.read();
        }
        else
        {
            break;
        }
    }
    if (c != -1)
    {
        pdfSource.unread(c);
    }
}
/**
 * Reads an integer from the stream. Leading whitespace and comments are
 * skipped; the number ends at a space, line feed, carriage return, '&lt;',
 * a null byte or the end of the input. The terminating byte is pushed
 * back.
 *
 * If the collected characters are not a valid integer they are pushed
 * back as well before the exception is thrown.
 *
 * @return The integer that was read from the stream.
 *
 * @throws IOException If there is an error reading from the stream or the
 *         input is not an integer.
 */
protected int readInt() throws IOException
{
    skipSpaces();
    final StringBuilder digits = new StringBuilder();
    int lastByte;
    boolean reading = true;
    do
    {
        lastByte = pdfSource.read();
        switch( lastByte )
        {
            case 32:
            case 10:
            case 13:
            case 60: //see sourceforge bug 1714707
            case 0:  //See sourceforge bug 853328
            case -1:
                reading = false;
                break;
            default:
                digits.append( (char)lastByte );
                break;
        }
    }
    while( reading );
    if( lastByte != -1 )
    {
        pdfSource.unread( lastByte );
    }
    try
    {
        return Integer.parseInt( digits.toString() );
    }
    catch( NumberFormatException e )
    {
        pdfSource.unread(digits.toString().getBytes("ISO-8859-1"));
        throw new IOException( "Error: Expected an integer type, actual='" + digits + "'" );
    }
}
}