BaseParser.java example

Explorer
PdfBox-Android-master
- library
  - src
- sample
  - src
    - main
      - java
        com
        tom_roush
        pdfbox
        sample
        MainActivity.java
package com.tom_roush.pdfbox.pdfparser;

import android.util.Log;

import com.tom_roush.pdfbox.cos.COSArray;
import com.tom_roush.pdfbox.cos.COSBase;
import com.tom_roush.pdfbox.cos.COSBoolean;
import com.tom_roush.pdfbox.cos.COSDictionary;
import com.tom_roush.pdfbox.cos.COSDocument;
import com.tom_roush.pdfbox.cos.COSInteger;
import com.tom_roush.pdfbox.cos.COSName;
import com.tom_roush.pdfbox.cos.COSNull;
import com.tom_roush.pdfbox.cos.COSNumber;
import com.tom_roush.pdfbox.cos.COSObject;
import com.tom_roush.pdfbox.cos.COSObjectKey;
import com.tom_roush.pdfbox.cos.COSString;
import com.tom_roush.pdfbox.util.Charsets;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.Arrays;

import static com.tom_roush.pdfbox.util.Charsets.ISO_8859_1;

/**
 * This class is used to contain parsing logic that will be used by both the
 * PDFParser and the COSStreamParser.
 *
 * @author Ben Litchfield
 */
public abstract class BaseParser
{
	// NOTE(review): not referenced in the visible part of this class; presumably an upper
	// bound used to sanity-check parsed object numbers - confirm against its usages.
	private static final long OBJECT_NUMBER_THRESHOLD = 10000000000L;

	// NOTE(review): not referenced in the visible part of this class; presumably an upper
	// bound used to sanity-check parsed generation numbers - confirm against its usages.
	private static final long GENERATION_NUMBER_THRESHOLD = 65535;

    // Single-letter constants holding the character codes of the corresponding lower-case
    // letters. readUntilEndOfCOSDictionary() compares bytes read from the source against
    // them to recognise the keywords "endstream" and "endobj".
    protected static final int E = 'e';
    protected static final int N = 'n';
    protected static final int D = 'd';

    protected static final int S = 's';
    protected static final int T = 't';
    protected static final int R = 'r';
    protected static final int A = 'a';
    protected static final int M = 'm';

    protected static final int O = 'o';
    protected static final int B = 'b';
    protected static final int J = 'j';

	/**
	 * This is a string constant that will be used for comparisons.
	 */
	public static final String DEF = "def";
	/**
	 * This is a string constant that will be used for comparisons.
	 */
	protected static final String ENDOBJ_STRING = "endobj";
	/**
	 * This is a string constant that will be used for comparisons.
	 */
	protected static final String ENDSTREAM_STRING = "endstream";
	/**
	 * This is a string constant that will be used for comparisons.
	 */
	protected static final String STREAM_STRING = "stream";
	/**
	 * This is a string constant that will be used for comparisons.
	 */
	private static final String TRUE = "true";
	/**
	 * This is a string constant that will be used for comparisons.
	 */
	private static final String FALSE = "false";
	/**
	 * This is a string constant that will be used for comparisons.
	 */
	private static final String NULL = "null";

	/**
	 * ASCII code for line feed.
	 */
	protected static final byte ASCII_LF = 10;
	/**
	 * ASCII code for carriage return.
	 */
	protected static final byte ASCII_CR = 13;
	private static final byte ASCII_ZERO = 48;
	private static final byte ASCII_NINE = 57;
	private static final byte ASCII_SPACE = 32;

	/**
	 * This is the stream that will be read from.
	 */
    protected final SequentialSource seqSource;

	/**
	 * This is the document that will be parsed.
	 */
	protected COSDocument document;

    /**
     * Creates a parser that reads from the given sequential source.
     *
     * @param pdfSource the source to read from; it is stored in {@link #seqSource}.
     */
    public BaseParser(SequentialSource pdfSource)
    {
        this.seqSource = pdfSource;
    }

	/**
	 * Tells whether the given character can be part of a hexadecimal number.
	 *
	 * @param ch the character to test.
	 * @return true if isDigit accepts the character or it is one of a-f / A-F.
	 */
	private static boolean isHexDigit(char ch)
	{
		if (isDigit(ch))
		{
			return true;
		}
		boolean lowerCaseHex = ch >= 'a' && ch <= 'f';
		boolean upperCaseHex = ch >= 'A' && ch <= 'F';
		return lowerCaseHex || upperCaseHex;
	}

	/**
	 * This will parse the value of a PDF dictionary entry.
	 *
	 * A direct object is parsed first. If no digit follows it (after skipping spaces) that
	 * object is the value. Otherwise a second object is parsed, an 'R' is expected, and the
	 * two objects are treated as object number and generation number of an indirect
	 * reference that is resolved through {@link #getObjectFromPool(COSObjectKey)}.
	 *
	 * @return The parsed value, or the pooled object for an indirect reference.
	 *
	 * @throws IOException If there is an error parsing the value, or if either part of an
	 * indirect reference is not an integer.
	 */
	private COSBase parseCOSDictionaryValue() throws IOException
	{
		long numOffset = seqSource.getPosition();
		COSBase number = parseDirObject();
		skipSpaces();
		if (!isDigit())
		{
			// nothing that looks like a generation number follows: plain direct value
			return number;
		}
		long genOffset = seqSource.getPosition();
		COSBase generationNumber = parseDirObject();
		skipSpaces();
		readExpectedChar('R');
		if (!(number instanceof COSInteger))
		{
			throw new IOException("expected number, actual=" + number + " at offset " + numOffset);
		}
		if (!(generationNumber instanceof COSInteger))
		{
			// report the offending generation number, not the (valid) object number
			throw new IOException(
				"expected number, actual=" + generationNumber + " at offset " + genOffset);
		}
		COSObjectKey key = new COSObjectKey(((COSInteger) number).longValue(),
			((COSInteger) generationNumber).intValue());
		return getObjectFromPool(key);
	}

	/**
	 * Looks up the object belonging to the given key in the document's object pool.
	 *
	 * @param key the key of the referenced object.
	 * @return whatever the document's pool returns for the key.
	 * @throws IOException if no document is set, in which case the reference cannot be resolved.
	 */
	private COSBase getObjectFromPool(COSObjectKey key) throws IOException
	{
		if (document != null)
		{
			return document.getObjectFromPool(key);
		}
		String message = "object reference " + key + " at offset " + seqSource.getPosition()
			+ " in content stream";
		throw new IOException(message);
	}

	/**
	 * Parses a PDF dictionary, i.e. everything from the opening "&lt;&lt;" up to and
	 * including the closing "&gt;&gt;".
	 *
	 * When something other than a name or the closing bracket is encountered, a warning is
	 * logged and the parser tries to resynchronise; if that is not possible the entries
	 * collected so far are returned.
	 *
	 * @return The parsed dictionary.
	 *
	 * @throws IOException If there is an error reading the stream.
	 */
	protected COSDictionary parseCOSDictionary() throws IOException
	{
		readExpectedChar('<');
		readExpectedChar('<');
		skipSpaces();
		COSDictionary dictionary = new COSDictionary();
		while (true)
		{
			skipSpaces();
			char next = (char) seqSource.peek();
			if (next == '>')
			{
				// closing bracket reached, leave the loop and consume it below
				break;
			}
			if (next == '/')
			{
				parseCOSDictionaryNameValuePair(dictionary);
				continue;
			}
			// neither a /Name nor the end of the dictionary: try to recover
			Log.w("PdfBox-Android", "Invalid dictionary, found: '" + next + "' but expected: '/'");
			if (readUntilEndOfCOSDictionary())
			{
				// recovery failed, hand back what has been parsed so far
				return dictionary;
			}
		}
		readExpectedChar('>');
		readExpectedChar('>');
		return dictionary;
	}

	/**
	 * Keep reading until the end of the dictionary object or the file has been hit, or until a '/'
	 * or a '&gt;' has been found.
	 *
	 * The end of the object is recognised by the keywords "endstream" or "endobj". Bytes that are
	 * consumed while testing for those keywords are not pushed back when the keyword does not
	 * match. When the method stops at a '/' or '&gt;', that byte is pushed back so the caller
	 * sees it again.
	 *
	 * @return true if the end of the object or the file has been found, false if not, i.e. that the
	 * caller can continue to parse the dictionary at the current position.
	 * @throws IOException if there is a reading error.
	 */
	private boolean readUntilEndOfCOSDictionary() throws IOException
	{
        int c = seqSource.read();
        while (c != -1 && c != '/' && c != '>')
        {
            // in addition to stopping when we find / or >, we also want
            // to stop when we find endstream or endobj.
            if (c == E)
            {
                c = seqSource.read();
                if (c == N)
                {
                    c = seqSource.read();
                    if (c == D)
                    {
                        // "end" has been seen; c now holds the byte that follows it
                        c = seqSource.read();
                        // the && chain short-circuits, so reading stops at the first mismatch
                        boolean isStream = c == S && seqSource.read() == T && seqSource.read() == R
                            && seqSource.read() == E && seqSource.read() == A &&
                            seqSource.read() == M;
                        // only tested when "endstream" did not match
                        boolean isObj =
                            !isStream && c == O && seqSource.read() == B && seqSource.read() == J;
                        if (isStream || isObj)
                        {
                            // we're done reading this object!
                            return true;
                        }
                    }
                }
            }
            // whatever c held after a partial keyword match is discarded here
            c = seqSource.read();
        }
        if (c == -1)
        {
            // end of file
            return true;
        }
        // c is '/' or '>': give it back so the caller can continue from there
        seqSource.unread(c);
        return false;
    }

	/**
	 * Parses one "/Name value" entry and stores it in the given dictionary.
	 *
	 * A "def" token following the value is swallowed; any other token starting with 'd' is
	 * pushed back. Entries whose value could not be parsed are logged and not stored.
	 *
	 * @param obj the dictionary receiving the entry.
	 * @throws IOException if there is a reading error.
	 */
	private void parseCOSDictionaryNameValuePair(COSDictionary obj) throws IOException
	{
		COSName name = parseCOSName();
		COSBase parsedValue = parseCOSDictionaryValue();
		skipSpaces();
		boolean startsWithD = ((char) seqSource.peek()) == 'd';
		if (startsWithD)
		{
			// a 'def' keyword shows up here when parsing a cmap stream; it is ignored,
			// anything else is handed back to the source untouched
			String token = readString();
			if (token.equals(DEF))
			{
				skipSpaces();
			}
			else
			{
				seqSource.unread(token.getBytes(ISO_8859_1));
			}
		}

		if (parsedValue != null)
		{
			parsedValue.setDirect(true);
			obj.setItem(name, parsedValue);
			return;
		}
		Log.w("PdfBox-Android", "Bad Dictionary Declaration " + seqSource);
	}

    /**
     * Consumes the end-of-line marker that follows a keyword such as "stream".
     *
     * PDF Ref 3.2.7: a stream must be followed by either a CRLF or LF but nothing else.
     * Parsing is lenient: leading spaces are skipped, a lone CR is accepted, and any other
     * byte is pushed back so that it is read again by the caller.
     *
     * @throws IOException if there is a reading error.
     */
    protected void skipWhiteSpace() throws IOException
    {
        int whitespace = seqSource.read();

        //see brother_scan_cover.pdf, it adds whitespaces
        //after the stream but before the start of the
        //data, so just read those first
        while (ASCII_SPACE == whitespace)
        {
            whitespace = seqSource.read();
        }

        if (ASCII_CR == whitespace)
        {
            whitespace = seqSource.read();
            //The spec says a CR without LF is invalid but it happens in the real
            //world so we must support it. Never push back the EOF marker (-1).
            if (ASCII_LF != whitespace && whitespace != -1)
            {
                seqSource.unread(whitespace);
            }
        }
        else if (ASCII_LF != whitespace && whitespace != -1)
        {
            //we are in an error.
            //but again we will do a lenient parsing and just assume that everything
            //is fine; the byte belongs to the data, so give it back (unless it is EOF)
            seqSource.unread(whitespace);
        }
    }

	/**
	 * Works around literal strings whose parentheses are unbalanced because of a bug in the
	 * producing software, e.g.
	 *
	 *     /Title ( (5)
	 *     /Creator ...
	 *
	 * or, with an escaped closing parenthesis,
	 *
	 *     /Title (c:\)
	 *     /Producer ...
	 *
	 * The next (up to) three bytes are inspected and pushed back afterwards. If they are
	 * CR LF '/' or start with CR '/', the string is assumed to have ended and the brace
	 * counter is reset to zero. The CR '/' test is not tied to the number of bytes read;
	 * positions that were not filled keep the zero they were initialised with.
	 *
	 * @param bracesParameter the number of braces currently open.
	 *
	 * @return the corrected value of the brace counter
	 * @throws IOException if there is a reading error.
	 */
	private int checkForMissingCloseParen(final int bracesParameter) throws IOException
	{
		byte[] lookahead = new byte[3];
		int amountRead = seqSource.read(lookahead);

		// <end_brace> CR LF '/' : needs all three bytes
		boolean crLfSlash = amountRead == 3
			&& lookahead[0] == ASCII_CR
			&& lookahead[1] == ASCII_LF
			&& lookahead[2] == 0x2f;
		// <end_brace> CR '/' : the variant without a line feed
		boolean crSlash = lookahead[0] == ASCII_CR && lookahead[1] == 0x2f;

		int braces = (crLfSlash || crSlash) ? 0 : bracesParameter;

		if (amountRead > 0)
		{
			// this was only a look-ahead, give the bytes back
			seqSource.unread(Arrays.copyOfRange(lookahead, 0, amountRead));
		}
		return braces;
	}

	/**
	 * This will parse a PDF string.
	 *
	 * A string starting with '(' is read as a literal string: nested parentheses are
	 * balanced, backslash escapes (\n, \r, \t, \b, \f, \(, \), \\, line continuations and
	 * up to three octal digits) are decoded, and for any other escaped character the
	 * backslash is dropped. A string starting with '&lt;' is delegated to the hex string
	 * parser. The byte that follows the string is pushed back.
	 *
	 * @return The parsed PDF string.
	 *
	 * @throws IOException If there is an error reading from the stream or the next byte is
	 * neither '(' nor '&lt;'.
	 */
	protected COSString parseCOSString() throws IOException
	{
		char nextChar = (char) seqSource.read();
		char openBrace;
		char closeBrace;
		if (nextChar == '(')
		{
			openBrace = '(';
			closeBrace = ')';
		}
		else if (nextChar == '<')
		{
			return parseCOSHexString();
		}
		else
		{
			throw new IOException("parseCOSString string should start with '(' or '<' and not '" +
				nextChar + "' " + seqSource);
		}

		ByteArrayOutputStream out = new ByteArrayOutputStream();

		// number of currently open parentheses; the string ends when it drops to zero
		int braces = 1;
		int c = seqSource.read();
		while (braces > 0 && c != -1)
		{
			char ch = (char) c;
			// -2 means "no look-ahead byte available"; any other value is a byte (or EOF)
			// that was already read and must be processed in the next iteration
			int nextc = -2;

			if (ch == closeBrace)
			{
				braces--;
				braces = checkForMissingCloseParen(braces);
				if (braces != 0)
				{
					out.write(ch);
				}
			}
			else if (ch == openBrace)
			{
				braces++;
				out.write(ch);
			}
			else if (ch == '\\')
			{
				int escaped = seqSource.read();
				if (escaped == -1)
				{
					// dangling backslash at end of input: nothing to decode, and the EOF
					// marker must not be written to the output as a byte
					nextc = -1;
				}
				else
				{
					char next = (char) escaped;
					switch (next)
					{
						case 'n':
							out.write('\n');
							break;
						case 'r':
							out.write('\r');
							break;
						case 't':
							out.write('\t');
							break;
						case 'b':
							out.write('\b');
							break;
						case 'f':
							out.write('\f');
							break;
						case ')':
							// PDFBox 276 /Title (c:\)
							braces = checkForMissingCloseParen(braces);
							if (braces != 0)
							{
								out.write(next);
							}
							else
							{
								out.write('\\');
							}
							break;
						case '(':
						case '\\':
							out.write(next);
							break;
						case ASCII_LF:
						case ASCII_CR:
							//this is a break in the line so ignore it and the newline and continue
							c = seqSource.read();
							while (isEOL(c) && c != -1)
							{
								c = seqSource.read();
							}
							nextc = c;
							break;
						case '0':
						case '1':
						case '2':
						case '3':
						case '4':
						case '5':
						case '6':
						case '7':
						{
							// one to three octal digits; a non-octal byte ends the sequence
							// and is kept as look-ahead
							StringBuilder octal = new StringBuilder();
							octal.append(next);
							c = seqSource.read();
							char digit = (char) c;
							if (digit >= '0' && digit <= '7')
							{
								octal.append(digit);
								c = seqSource.read();
								digit = (char) c;
								if (digit >= '0' && digit <= '7')
								{
									octal.append(digit);
								}
								else
								{
									nextc = c;
								}
							}
							else
							{
								nextc = c;
							}

							int character = 0;
							try
							{
								character = Integer.parseInt(octal.toString(), 8);
							}
							catch (NumberFormatException e)
							{
								throw new IOException(
									"Error: Expected octal character, actual='" + octal + "'", e);
							}
							out.write(character);
							break;
						}
						default:
						{
							// dropping the backslash
							// see 7.3.4.2 Literal Strings for further information
							out.write(next);
						}
					}
				}
			}
			else
			{
				out.write(ch);
			}
			if (nextc != -2)
			{
				c = nextc;
			}
			else
			{
				c = seqSource.read();
			}
		}
		if (c != -1)
		{
			seqSource.unread(c);
		}
		return new COSString(out.toByteArray());
	}

	/**
	 * Parses a PDF hex string, stopping early when a character is found that is not
	 * allowed inside one. Failing fast makes it possible to detect malformed input and
	 * continue with whatever follows the closing bracket.
	 *
	 * The opening '&lt;' must already have been consumed by the caller.
	 *
	 * @return The parsed PDF string.
	 *
	 * @throws IOException If there is an error reading from the stream or the closing
	 * bracket is missing.
	 */
	private COSString parseCOSHexString() throws IOException
	{
		final StringBuilder hexDigits = new StringBuilder();
		for (;;)
		{
			int current = seqSource.read();
			if (isHexDigit((char) current))
			{
				hexDigits.append((char) current);
				continue;
			}
			if (current == '>')
			{
				break;
			}
			if (current < 0)
			{
				throw new IOException("Missing closing bracket for hex string. Reached EOS.");
			}
			boolean ignorable = (current == ' ') || (current == '\n')
				|| (current == '\t') || (current == '\r')
				|| (current == '\b') || (current == '\f');
			if (ignorable)
			{
				continue;
			}

			// an invalid character: drop a trailing digit that has no partner
			if (hexDigits.length() % 2 != 0)
			{
				hexDigits.deleteCharAt(hexDigits.length() - 1);
			}

			// skip everything up to the closing bracket
			do
			{
				current = seqSource.read();
			}
			while (current != '>' && current >= 0);

			// a malformed PDF may end before the bracket shows up
			if (current < 0)
			{
				throw new IOException("Missing closing bracket for hex string. Reached EOS.");
			}
			break;
		}
		return COSString.parseHex(hexDigits.toString());
	}

	/**
	 * This will parse a PDF array object.
	 *
	 * Elements are parsed one after another until ']' is found; the loop also ends when the
	 * next byte is 0 or the end of the input is reached. When an 'R' marker is parsed, the
	 * two preceding integers are taken off the array and replaced by the referenced object.
	 * A marker that is not preceded by two integers is logged as a corrupt reference and
	 * skipped. If the array turns out to be followed directly by "endobj" or "endstream",
	 * the array is considered finished.
	 *
	 * @return The parsed PDF array.
	 *
	 * @throws IOException If there is an error parsing the stream.
	 */
	protected COSArray parseCOSArray() throws IOException
	{
		readExpectedChar('[');
		COSArray po = new COSArray();
		COSBase pbo;
		skipSpaces();
		int i;
		while (((i = seqSource.peek()) > 0) && ((char) i != ']'))
		{
			pbo = parseDirObject();
			if (pbo instanceof COSObject)
			{
				// We have to check if the expected values are there or not PDFBOX-385.
				// The size checks keep a malformed array such as "[R]" or "[5 R]" from
				// indexing position -1.
				if (po.size() > 0 && po.get(po.size() - 1) instanceof COSInteger)
				{
					COSInteger genNumber = (COSInteger) po.remove(po.size() - 1);
					if (po.size() > 0 && po.get(po.size() - 1) instanceof COSInteger)
					{
						COSInteger number = (COSInteger) po.remove(po.size() - 1);
						COSObjectKey key = new COSObjectKey(number.longValue(),
							genNumber.intValue());
						pbo = getObjectFromPool(key);
					}
					else
					{
						// the object reference is somehow wrong
						pbo = null;
					}
				}
				else
				{
					pbo = null;
				}
			}
			if (pbo != null)
			{
				po.add(pbo);
			}
			else
			{
				//it could be a bad object in the array which is just skipped
				Log.w("PdfBox-Android",
					"Corrupt object reference at offset " + seqSource.getPosition());

				// This could also be an "endobj" or "endstream" which means we can assume that
				// the array has ended.
				String isThisTheEnd = readString();
				seqSource.unread(isThisTheEnd.getBytes(ISO_8859_1));
				if (ENDOBJ_STRING.equals(isThisTheEnd) || ENDSTREAM_STRING.equals(isThisTheEnd))
				{
					return po;
				}
			}
			skipSpaces();
		}
		//read ']'
		seqSource.read();
		skipSpaces();
		return po;
	}

	/**
	 * Decides whether a character marks the end of a PDF name: space, CR, LF, tab or one
	 * of the characters &gt; &lt; [ / ] ) (.
	 *
	 * @param ch The character
	 * @return true if the character terminates a PDF name, otherwise false.
	 */
	protected boolean isEndOfName(int ch)
	{
		switch (ch)
		{
			case ASCII_SPACE:
			case ASCII_CR:
			case ASCII_LF:
			case 9:
			case '>':
			case '<':
			case '[':
			case '/':
			case ']':
			case ')':
			case '(':
				return true;
			default:
				return false;
		}
	}

	/**
	 * This will parse a PDF name from the stream.
	 *
	 * The leading '/' is consumed, then bytes are collected until a character that ends a
	 * name (see {@link #isEndOfName(int)}) or the end of the input is reached. The
	 * terminating byte is pushed back. A '#' followed by two hex digits is decoded to the
	 * byte they denote; otherwise the '#' is kept literally. The collected bytes are
	 * decoded as UTF-8.
	 *
	 * @return The parsed PDF name.
	 * @throws IOException If there is an error reading from the stream.
	 */
	protected COSName parseCOSName() throws IOException
	{
		readExpectedChar('/');
		ByteArrayOutputStream buffer = new ByteArrayOutputStream();
		int c = seqSource.read();
		while (c != -1)
		{
			int ch = c;
			if (ch == '#')
			{
				// keep the raw int values so that EOF (-1) stays distinguishable from data
				int ch1 = seqSource.read();
				int ch2 = seqSource.read();

				// Prior to PDF v1.2, the # was not a special character.  Also,
				// it has been observed that various PDF tools do not follow the
				// spec with respect to the # escape, even though they report
				// PDF versions of 1.2 or later.  The solution here is that we
				// interpret the # as an escape only when it is followed by two
				// valid hex digits.
				//
				if (ch1 != -1 && ch2 != -1 && isHexDigit((char) ch1) && isHexDigit((char) ch2))
				{
					String hex = "" + (char) ch1 + (char) ch2;
					try
					{
						buffer.write(Integer.parseInt(hex, 16));
					}
					catch (NumberFormatException e)
					{
						throw new IOException("Error: expected hex digit, actual='" + hex + "'", e);
					}
					c = seqSource.read();
				}
				else
				{
					// not an escape: keep the '#', push the second byte back (unless the
					// input ended) and continue with the first one, which may be EOF
					if (ch2 != -1)
					{
						seqSource.unread(ch2);
					}
					c = ch1;
					buffer.write(ch);
				}
			}
			else if (isEndOfName(ch))
			{
				break;
			}
			else
			{
				buffer.write(ch);
				c = seqSource.read();
			}
		}
		if (c != -1)
		{
			seqSource.unread(c);
		}
		String string = new String(buffer.toByteArray(), Charsets.UTF_8);
		return COSName.getPDFName(string);
	}

	/**
	 * This will parse a boolean object from the stream.
	 *
	 * The next byte decides which literal is expected: 't' reads four bytes that must spell
	 * "true", 'f' reads five bytes that must spell "false".
	 *
	 * @return The parsed boolean object.
	 *
	 * @throws IOException If an IO error occurs during parsing or the bytes read are not a
	 * boolean literal.
	 */
	protected COSBoolean parseBoolean() throws IOException
	{
		COSBoolean retval = null;
		char c = (char) seqSource.peek();
		if (c == 't')
		{
			String trueString = new String(seqSource.readFully(4), ISO_8859_1);
			if (!trueString.equals(TRUE))
			{
				throw new IOException("Error parsing boolean: expected='true' actual='" + trueString
					+ "' at offset " + seqSource.getPosition());
			}
			else
			{
				retval = COSBoolean.TRUE;
			}
		}
		else if (c == 'f')
		{
			String falseString = new String(seqSource.readFully(5), ISO_8859_1);
			if (!falseString.equals(FALSE))
			{
				// the literal expected in this branch is 'false', say so in the message
				throw new IOException(
					"Error parsing boolean: expected='false' actual='" + falseString
						+ "' at offset " + seqSource.getPosition());
			}
			else
			{
				retval = COSBoolean.FALSE;
			}
		}
		else
		{
			throw new IOException("Error parsing boolean expected='t or f' actual='" + c
				+ "' at offset " + seqSource.getPosition());
		}
		return retval;
	}

	/**
	 * Parses the next direct object from the stream. The first non-space byte selects what
	 * is parsed: dictionary or hex string ('&lt;'), array ('['), literal string ('('),
	 * name ('/'), null ('n'), booleans ('t' / 'f'), the reference marker 'R', or a number.
	 * Any other token is read and dropped, except that "endobj" / "endstream" are pushed
	 * back for the caller.
	 *
	 * @return The parsed object; null at the end of the input or when an unknown token
	 * was skipped.
	 *
	 * @throws IOException If there is an error during parsing.
	 */
	protected COSBase parseDirObject() throws IOException
	{
		COSBase result = null;

		skipSpaces();
		char c = (char) seqSource.peek();
		switch (c)
		{
			case '<':
			{
				// take the first bracket off, look at what follows, then put it back
				int leftBracket = seqSource.read();
				c = (char) seqSource.peek();
				seqSource.unread(leftBracket);
				if (c == '<')
				{
					result = parseCOSDictionary();
					skipSpaces();
				}
				else
				{
					result = parseCOSString();
				}
				break;
			}
			case '[':
				result = parseCOSArray();
				break;
			case '(':
				result = parseCOSString();
				break;
			case '/':
				result = parseCOSName();
				break;
			case 'n':
				readExpectedString(NULL);
				result = COSNull.NULL;
				break;
			case 't':
			{
				String trueString = new String(seqSource.readFully(4), ISO_8859_1);
				if (!trueString.equals(TRUE))
				{
					throw new IOException("expected true actual='" + trueString + "' " + seqSource +
						"' at offset " + seqSource.getPosition());
				}
				result = COSBoolean.TRUE;
				break;
			}
			case 'f':
			{
				String falseString = new String(seqSource.readFully(5), ISO_8859_1);
				if (!falseString.equals(FALSE))
				{
					throw new IOException(
						"expected false actual='" + falseString + "' " + seqSource +
							"' at offset " + seqSource.getPosition());
				}
				result = COSBoolean.FALSE;
				break;
			}
			case 'R':
				// reference marker: callers combine it with the numbers parsed before
				seqSource.read();
				result = new COSObject(null);
				break;
			case (char) -1:
				// end of input
				return null;
			default:
			{
				boolean startsNumber = Character.isDigit(c) || c == '-' || c == '+' || c == '.';
				if (startsNumber)
				{
					StringBuilder digits = new StringBuilder();
					int ic = seqSource.read();
					c = (char) ic;
					while (Character.isDigit(c) ||
						c == '-' ||
						c == '+' ||
						c == '.' ||
						c == 'E' ||
						c == 'e')
					{
						digits.append(c);
						ic = seqSource.read();
						c = (char) ic;
					}
					if (ic != -1)
					{
						seqSource.unread(ic);
					}
					result = COSNumber.get(digits.toString());
				}
				else
				{
					// Not supposed to happen, but tolerated to stay compatible with
					// writers that do not follow the spec.
					String badString = readString();
					if (badString == null || badString.length() == 0)
					{
						int peek = seqSource.peek();
						// bail out, otherwise the caller could loop forever
						throw new IOException("Unknown dir object c='" + c +
							"' cInt=" + (int) c + " peek='" + (char) peek
							+ "' peekInt=" + peek + " " + seqSource.getPosition());
					}

					// an endstream/endobj is put back so the caller will see it
					if (ENDOBJ_STRING.equals(badString) || ENDSTREAM_STRING.equals(badString))
					{
						seqSource.unread(badString.getBytes(ISO_8859_1));
					}
				}
			}
		}
		return result;
	}

	/**
	 * Skips spaces and then reads characters up to, but not including, the next character
	 * that ends a name (see {@link #isEndOfName(int)}) or the end of the input.
	 *
	 * @return The string that was read from the stream.
	 *
	 * @throws IOException If there is an error reading from the stream.
	 */
	protected String readString() throws IOException
	{
		skipSpaces();
		StringBuilder token = new StringBuilder();
		int c = seqSource.read();
		for (;;)
		{
			boolean terminator = isEndOfName((char) c);
			if (terminator || c == -1)
			{
				break;
			}
			token.append((char) c);
			c = seqSource.read();
		}
		if (c != -1)
		{
			// the terminating character is not part of the string
			seqSource.unread(c);
		}
		return token.toString();
	}

	/**
	 * Reads the given string from the source and fails if something else is found.
	 * Delegates to {@link #readExpectedString(char[], boolean)} with the flag set to false.
	 *
	 * @param expectedString the String value that is expected.
	 * @throws IOException if the String char is not the expected value or if an
	 * I/O error occurs.
	 */
	protected void readExpectedString(String expectedString) throws IOException
	{
		final char[] pattern = expectedString.toCharArray();
		readExpectedString(pattern, false);
	}

	/**
	 * Reads the given pattern from {@link #seqSource}, character by character.
	 *
	 * NOTE: the {@code skipSpaces} argument is not consulted by this implementation;
	 * spaces before and after the pattern are skipped regardless of its value.
	 *
	 * @param expectedString pattern to be skipped
	 * @param skipSpaces intended to control whether surrounding spaces are skipped
	 * (currently without effect, see above)
	 * @throws IOException if pattern could not be read
	 */
	protected final void readExpectedString(final char[] expectedString, boolean skipSpaces)
		throws IOException
	{
		skipSpaces();
		for (int index = 0; index < expectedString.length; index++)
		{
			char expected = expectedString[index];
			int actual = seqSource.read();
			if (actual != expected)
			{
				throw new IOException("Expected string '" + new String(expectedString)
					+ "' but missed at character '" + expected + "' at offset "
					+ seqSource.getPosition());
			}
		}
		skipSpaces();
	}

	/**
	 * Consumes a single character and verifies that it is the one the caller expects.
	 *
	 * @param ec the char value that is expected.
	 * @throws IOException if the read char is not the expected value or if an
	 * I/O error occurs.
	 */
	protected void readExpectedChar(char ec) throws IOException
	{
		char actual = (char) seqSource.read();
		if (actual == ec)
		{
			return;
		}
		throw new IOException(
			"expected='" + ec + "' actual='" + actual + "' at offset " + seqSource.getPosition());
	}

	/**
	 * Skips spaces and then reads at most {@code length} characters. Reading stops early at
	 * whitespace, at a closing ']', at the end of the input, or at one of the characters
	 * '[', '&lt;', '(' and '/'. The character that stopped the reading is pushed back.
	 *
	 * @param length The length to stop reading at.
	 *
	 * @return The string that was read from the stream of length 0 to length.
	 *
	 * @throws IOException If there is an error reading from the stream.
	 */
	protected String readString( int length ) throws IOException
	{
		skipSpaces();

		int c = seqSource.read();

		// size the buffer to the requested maximum instead of the default capacity
		StringBuilder token = new StringBuilder(length);
		while (true)
		{
			if (isWhitespace(c) || isClosing(c) || c == -1)
			{
				break;
			}
			if (token.length() >= length)
			{
				break;
			}
			if (c == '[' || c == '<' || c == '(' || c == '/')
			{
				break;
			}
			token.append((char) c);
			c = seqSource.read();
		}
		if (c != -1)
		{
			seqSource.unread(c);
		}
		return token.toString();
	}

	/**
	 * Peeks at the next byte and tells whether it is the closing bracket of a PDF array.
	 *
	 * @return true if the next byte is ']', false otherwise.
	 *
	 * @throws IOException If an IO error occurs.
	 */
	protected boolean isClosing() throws IOException
	{
		int upcoming = seqSource.peek();
		return isClosing(upcoming);
	}

	/**
	 * Tells whether the given character is the closing bracket of a PDF array.
	 *
	 * @param c The character to check
	 * @return true if the character is ']', false otherwise.
	 */
	protected boolean isClosing(int c)
	{
		final int closingBracket = ']';
		return closingBracket == c;
	}

	/**
	 * This will read bytes until the first end of line marker occurs.
	 * NOTE: The EOL marker may consist of 1 (CR or LF) or 2 (CR and LF) bytes
	 * which is an important detail if one wants to unread the line.
	 *
	 * @return The characters between the current position and the end of the
	 * line; the EOL marker itself is consumed but not included.
	 *
	 * @throws IOException If the source is already at its end or if there is
	 * an error reading from the stream.
	 */
	protected String readLine() throws IOException
	{
		if (seqSource.isEOF())
		{
			throw new IOException("Error: End-of-File, expected line");
		}

		final StringBuilder line = new StringBuilder( 11 );

		// collect everything up to the first CR, LF or the end of the source
		int current = seqSource.read();
		while (current != -1 && !isEOL(current))
		{
			line.append((char) current);
			current = seqSource.read();
		}

		// a CR directly followed by a LF forms one two-byte EOL: consume the LF too
		if (isCR(current) && isLF(seqSource.peek()))
		{
			seqSource.read();
		}
		return line.toString();
	}

	/**
	 * This will tell if the next byte to be read is an end of line byte.
	 * The byte is obtained through the source's peek method.
	 *
	 * @return true if the next byte is a line feed or a carriage return.
	 *
	 * @throws IOException If there is an error reading from the stream.
	 */
	protected boolean isEOL() throws IOException
	{
		final int upcoming = seqSource.peek();
		return isEOL(upcoming);
	}

	/**
	 * This will tell if the given value is an end of line byte.
	 *
	 * @param c The character to check against end of line
	 * @return true if the given value is a line feed or a carriage return.
	 */
	protected boolean isEOL(int c)
	{
		if (isLF(c))
		{
			return true;
		}
		return isCR(c);
	}

	// true when the given value equals the line feed constant
	private boolean isLF(int c)
	{
		return c == ASCII_LF;
	}

	// true when the given value equals the carriage return constant
	private boolean isCR(int c)
	{
		return c == ASCII_CR;
	}

	/**
	 * This will tell if the next byte is whitespace or not.
	 * The byte is obtained through the source's peek method.
	 *
	 * @return true if the next byte in the stream is a whitespace character.
	 *
	 * @throws IOException If there is an error reading from the stream.
	 */
	protected boolean isWhitespace() throws IOException
	{
		final int upcoming = seqSource.peek();
		return isWhitespace(upcoming);
	}

	/**
	 * This will tell if a character is whitespace or not.  These values are
	 * specified in table 1 (page 12) of ISO 32000-1:2008.
	 * @param c The character to check against whitespace
	 * @return true if the character is a whitespace character.
	 */
	protected boolean isWhitespace( int c )
	{
		// space and the two end-of-line bytes
		if (c == ASCII_SPACE || c == ASCII_LF || c == ASCII_CR)
		{
			return true;
		}
		// the remaining whitespace codes: 0, 9 and 12
		return c == '\0' || c == '\t' || c == '\f';
	}

	/**
	 * This will tell if the next byte is a space or not.
	 * The byte is obtained through the source's peek method.
	 *
	 * @return true if the next byte in the stream is a space character.
	 *
	 * @throws IOException If there is an error reading from the stream.
	 */
	protected boolean isSpace() throws IOException
	{
		final int upcoming = seqSource.peek();
		return isSpace(upcoming);
	}

	/**
	 * This will tell if the given value is a space or not.
	 *
	 * @param c The character to check against space
	 * @return true if the given value is the space character.
	 */
	protected boolean isSpace(int c)
	{
		return c == ASCII_SPACE;
	}

	/**
	 * This will tell if the next byte is a digit or not.
	 * The byte is obtained through the source's peek method.
	 *
	 * @return true if the next byte in the stream is a digit.
	 *
	 * @throws IOException If there is an error reading from the stream.
	 */
	protected boolean isDigit() throws IOException
	{
		final int upcoming = seqSource.peek();
		return isDigit(upcoming);
	}

	/**
	 * This will tell if the given value is a digit or not.
	 *
	 * @param c The character to be checked
	 * @return true if the given value lies between the digit zero and the
	 * digit nine, both inclusive.
	 */
	protected static boolean isDigit(int c)
	{
		if (c < ASCII_ZERO)
		{
			return false;
		}
		return c <= ASCII_NINE;
	}

	/**
	 * This will skip all spaces and comments that are present.
	 * A comment starts with '%' and runs up to the next end of line byte (or
	 * the end of the source). The first character that is neither whitespace
	 * nor the start of a comment is handed back to the source via unread.
	 *
	 * @throws IOException If there is an error reading from the stream.
	 */
	protected void skipSpaces() throws IOException
	{
		int next = seqSource.read();
		while (isWhitespace(next) || next == '%')
		{
			if (next != '%')
			{
				// plain whitespace: simply move on to the following byte
				next = seqSource.read();
				continue;
			}
			// '%' opens a comment: discard bytes until an EOL byte or the end of the source
			do
			{
				next = seqSource.read();
			}
			while (!isEOL(next) && next != -1);
		}
		if (next != -1)
		{
			seqSource.unread(next);
		}
	}

	/**
	 * This will read an object number from the stream and throw an
	 * {@link IOException} if the value is negative.
	 *
	 * The value is parsed with {@link #readInt()}, so a token that does not
	 * fit into an {@code int} is already rejected there with an
	 * {@link IOException}. Because an {@code int} can never reach
	 * {@code OBJECT_NUMBER_THRESHOLD} (ten billion), no upper bound check is
	 * needed or possible here; only the sign has to be verified.
	 *
	 * @return the object number being read, never negative.
	 * @throws IOException if the number is negative, is not a valid
	 * {@code int}, or if an I/O error occurs
	 */
	protected int readObjectNumber() throws IOException
	{
		int retval = readInt();
		// an upper bound test against a 10-digit limit would always be false for an int
		if (retval < 0)
		{
			throw new IOException("Object Number '" + retval + "' has more than 10 digits or is negative");
		}
		return retval;
	}

	/**
	 * This will read an integer from the stream and throw an {@link IOException} if the value is negative
	 * or exceeds the maximum object revision (i.e. : bigger than {@link #GENERATION_NUMBER_THRESHOLD})
	 * @return the generation number being read.
	 * @throws IOException if the number is out of range or if an I/O error occurs
	 */
	protected int readGenerationNumber() throws IOException
	{
		final int generation = readInt();
		final boolean inRange = generation >= 0 && generation <= GENERATION_NUMBER_THRESHOLD;
		if (!inRange)
		{
			throw new IOException("Generation Number '" + generation + "' has more than 5 digits");
		}
		return generation;
	}

	/**
	 * This will read an integer from the stream.
	 * Leading spaces and comments are skipped first. If the token that
	 * follows cannot be parsed as an {@code int}, it is handed back to the
	 * source via unread and an {@link IOException} naming the offending token
	 * is thrown.
	 *
	 * @return The integer that was read from the stream.
	 *
	 * @throws IOException If the token is not a valid integer or if there is
	 * an error reading from the stream.
	 */
	protected int readInt() throws IOException
	{
		skipSpaces();

		final String token = readStringNumber().toString();
		try
		{
			return Integer.parseInt( token );
		}
		catch( NumberFormatException e )
		{
			// hand the rejected token back before reporting the position
			seqSource.unread(token.getBytes(ISO_8859_1));
			// include the token itself, as readLong() does, to ease diagnosis
			throw new IOException("Error: Expected an integer type at offset "
				+ seqSource.getPosition() + ", instead got '" + token + "'", e);
		}
	}


	/**
	 * This will read a long from the stream.
	 * Leading spaces and comments are skipped first. If the token that
	 * follows cannot be parsed as a {@code long}, it is handed back to the
	 * source via unread and an {@link IOException} is thrown.
	 *
	 * @return The long that was read from the stream.
	 *
	 * @throws IOException If the token is not a valid long or if there is an
	 * error reading from the stream.
	 */
	protected long readLong() throws IOException
	{
		skipSpaces();

		final String token = readStringNumber().toString();
		try
		{
			return Long.parseLong( token );
		}
		catch( NumberFormatException e )
		{
			// hand the rejected token back before reporting the position
			seqSource.unread(token.getBytes(ISO_8859_1));
			throw new IOException("Error: Expected a long type at offset "
				+ seqSource.getPosition() + ", instead got '" + token + "'", e);
		}
	}

	/**
	 * This method is used to read a token by the {@linkplain #readInt()} method
	 * and the {@linkplain #readLong()} method.
	 * Bytes are collected until a space, LF, CR, '&lt;', '[', '(', a zero byte
	 * or the end of the source is met. The terminating byte is handed back to
	 * the source via unread, unless it is the end-of-source marker.
	 *
	 * @return the token to parse as integer or long by the calling method.
	 * @throws IOException throws by the {@link #seqSource} methods.
	 */
	protected final StringBuilder readStringNumber() throws IOException
	{
		final StringBuilder token = new StringBuilder();
		for (;;)
		{
			final int current = seqSource.read();
			final boolean terminator = current == ASCII_SPACE
				|| current == ASCII_LF
				|| current == ASCII_CR
				|| current == '<' // see sourceforge bug 1714707
				|| current == '[' // PDFBOX-1845
				|| current == '(' // PDFBOX-2579
				|| current == 0 // see sourceforge bug 853328
				|| current == -1;
			if (terminator)
			{
				if (current != -1)
				{
					seqSource.unread(current);
				}
				return token;
			}
			token.append((char) current);
		}
	}
}