PDFStreamParser.java example

Explorer
PdfBox-Android-master
- library
  - src
- sample
  - src
    - main
      - java
        com
        tom_roush
        pdfbox
        sample
        MainActivity.java
package com.tom_roush.pdfbox.pdfparser;

import android.util.Log;

import com.tom_roush.pdfbox.contentstream.PDContentStream;
import com.tom_roush.pdfbox.contentstream.operator.Operator;
import com.tom_roush.pdfbox.cos.COSBase;
import com.tom_roush.pdfbox.cos.COSBoolean;
import com.tom_roush.pdfbox.cos.COSDictionary;
import com.tom_roush.pdfbox.cos.COSName;
import com.tom_roush.pdfbox.cos.COSNull;
import com.tom_roush.pdfbox.cos.COSNumber;
import com.tom_roush.pdfbox.cos.COSObject;
import com.tom_roush.pdfbox.cos.COSStream;
import com.tom_roush.pdfbox.pdmodel.common.PDStream;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * Tokenizes a PDF content stream into operands (COS objects) and
 * {@link Operator}s.
 *
 * <p>Tokens can either be pulled one at a time with {@link #parseNextToken()}
 * or collected in bulk with {@link #parse()} and then read via
 * {@link #getTokens()}. Inline images ({@code BI ... ID ... EI}) are folded
 * into a single {@code BI} operator carrying both the image parameters and
 * the raw image bytes.
 *
 * @author Ben Litchfield
 */
public class PDFStreamParser extends BaseParser
{
    /** Tokens collected by {@link #parse()}, in the order they were read. */
    private final List<Object> streamObjects = new ArrayList<Object>(100);

    /** Number of bytes inspected after a candidate "EI" to decide whether binary data follows. */
    private static final int MAX_BIN_CHAR_TEST_LENGTH = 10;

    /** Scratch buffer reused by {@link #hasNoFollowingBinData(SequentialSource)}. */
    private final byte[] binCharTestArr = new byte[MAX_BIN_CHAR_TEST_LENGTH];

    /**
     * Constructor.
     *
     * @param stream The stream to parse.
     * @throws IOException If there is an error initializing the stream.
     * @deprecated Use {@link PDFStreamParser#PDFStreamParser(PDContentStream)} instead.
     */
    @Deprecated
    public PDFStreamParser(PDStream stream) throws IOException
    {
        super(new InputStreamSource(stream.createInputStream()));
    }

    /**
     * Constructor.
     *
     * @param stream The stream to parse.
     * @throws IOException If there is an error initializing the stream.
     * @deprecated Use {@link PDFStreamParser#PDFStreamParser(PDContentStream)} instead.
     */
    @Deprecated
    public PDFStreamParser(COSStream stream) throws IOException
    {
        super(new InputStreamSource(stream.getUnfilteredStream()));
    }

    /**
     * Constructor.
     *
     * @param contentStream The content stream to parse.
     * @throws IOException If there is an error initializing the stream.
     */
    public PDFStreamParser(PDContentStream contentStream) throws IOException
    {
        super(new InputStreamSource(contentStream.getContents()));
    }

    /**
     * Constructor.
     *
     * @param bytes the bytes to parse.
     * @throws IOException If there is an error initializing the stream.
     */
    public PDFStreamParser(byte[] bytes) throws IOException
    {
        super(new InputStreamSource(new ByteArrayInputStream(bytes)));
    }

    /**
     * Reads tokens until {@link #parseNextToken()} returns {@code null} and
     * appends each of them to the list returned by {@link #getTokens()}.
     *
     * @throws IOException If there is an error while parsing the stream.
     */
    public void parse() throws IOException
    {
        Object token;
        while ((token = parseNextToken()) != null)
        {
            streamObjects.add(token);
        }
    }

    /**
     * This will get the tokens that were parsed from the stream.
     *
     * @return All of the tokens collected so far by {@link #parse()}. This is
     * the parser's own list, not a copy.
     */
    public List<Object> getTokens()
    {
        return streamObjects;
    }

    /**
     * This will parse the next token in the stream.
     *
     * @return The next token in the stream or null if there are no more tokens in the stream.
     * @throws IOException If an io error occurs while parsing the stream.
     */
    public Object parseNextToken() throws IOException
    {
        Object retval;

        skipSpaces();
        int nextByte = seqSource.peek();
        // NOTE(review): because of the narrowing cast, a raw 0xFF byte is
        // treated exactly like end-of-stream (-1) here - confirm this is intended.
        if (((byte) nextByte) == -1)
        {
            return null;
        }
        char c = (char) nextByte;
        switch (c)
        {
            case '<':
            {
                // "<<" starts a dictionary, a single "<" starts a hex string:
                // pull off the first bracket, look at the second, put the first back
                int leftBracket = seqSource.read();
                c = (char) seqSource.peek();
                seqSource.unread(leftBracket);

                if (c == '<')
                {
                    retval = parseCOSDictionary();
                }
                else
                {
                    retval = parseCOSString();
                }
                break;
            }
            case '[':
            {
                // array
                retval = parseCOSArray();
                break;
            }
            case '(':
                // string
                retval = parseCOSString();
                break;
            case '/':
                // name
                retval = parseCOSName();
                break;
            case 'n':
            {
                // "null", or an operator that merely starts with 'n'
                String nullString = readString();
                if (nullString.equals("null"))
                {
                    retval = COSNull.NULL;
                }
                else
                {
                    retval = Operator.getOperator(nullString);
                }
                break;
            }
            case 't':
            case 'f':
            {
                // "true" / "false", or an operator starting with 't' or 'f'
                String next = readString();
                if (next.equals("true"))
                {
                    retval = COSBoolean.TRUE;
                }
                else if (next.equals("false"))
                {
                    retval = COSBoolean.FALSE;
                }
                else
                {
                    retval = Operator.getOperator(next);
                }
                break;
            }
            case 'R':
            {
                String line = readString();
                if (line.equals("R"))
                {
                    retval = new COSObject(null);
                }
                else
                {
                    retval = Operator.getOperator(line);
                }
                break;
            }
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            case '-':
            case '+':
            case '.':
            {
                retval = parseNumberToken(c);
                break;
            }
            case 'B':
            {
                String next = readString();
                Operator beginOp = Operator.getOperator(next);
                if (next.equals("BI"))
                {
                    readInlineImage(beginOp);
                }
                retval = beginOp;
                break;
            }
            case 'I':
            {
                //Special case for ID operator
                retval = parseInlineImageData();
                break;
            }
            case ']':
            {
                // some ']' around without its previous '['
                // this means a PDF is somewhat corrupt but we will continue to parse.
                seqSource.read();

                // must be a better solution than null...
                retval = COSNull.NULL;
                break;
            }
            default:
            {
                //we must be an operator
                String operator = readOperator();
                if (operator.trim().length() == 0)
                {
                    //we have a corrupt stream, stop reading here
                    retval = null;
                }
                else
                {
                    retval = Operator.getOperator(operator);
                }
                break;
            }
        }
        return retval;
    }

    /**
     * Reads a numeric token. The first character has only been peeked by the
     * caller, so it is consumed here. After it, digits are accepted together
     * with at most one '.'; a sign is only possible as the first character.
     *
     * @param first the already peeked first character of the number
     * @return the parsed number
     * @throws IOException if reading fails or the text cannot be turned into a number
     */
    private COSNumber parseNumberToken(char first) throws IOException
    {
        StringBuilder buf = new StringBuilder();
        buf.append(first);
        seqSource.read();

        boolean dotNotRead = first != '.';
        char c;
        while (Character.isDigit(c = (char) seqSource.peek()) || (dotNotRead && c == '.'))
        {
            buf.append(c);
            seqSource.read();

            // a '.' can only get here while dotNotRead is still true
            if (c == '.')
            {
                dotNotRead = false;
            }
        }
        return COSNumber.get(buf.toString());
    }

    /**
     * Reads the remainder of an inline image after its "BI" operator: the
     * name/value pairs up to the terminating operator (normally "ID"), whose
     * image data is then copied onto the "BI" operator.
     *
     * <p>Malformed input (a missing or non-COS value, or no operator after
     * the parameters) is logged and tolerated instead of failing with a
     * {@link ClassCastException} or {@link NullPointerException}.
     *
     * @param beginImageOP the "BI" operator that receives parameters and data
     * @throws IOException if an io error occurs while reading the tokens
     */
    private void readInlineImage(Operator beginImageOP) throws IOException
    {
        COSDictionary imageParams = new COSDictionary();
        beginImageOP.setImageParameters(imageParams);

        Object nextToken;
        while ((nextToken = parseNextToken()) instanceof COSName)
        {
            Object value = parseNextToken();
            if (!(value instanceof COSBase))
            {
                Log.w("PdfBox-Android", "Unexpected token in inline image dictionary");
                break;
            }
            imageParams.setItem((COSName) nextToken, (COSBase) value);
        }

        //final token will be the image data, maybe??
        if (nextToken instanceof Operator)
        {
            beginImageOP.setImageData(((Operator) nextToken).getImageData());
        }
        else
        {
            Log.w("PdfBox-Android", "Inline image parameters are not followed by image data");
        }
    }

    /**
     * Reads an "ID" operator together with the inline image bytes following
     * it, up to (and consuming) the terminating "EI".
     *
     * @return a new "ID" operator holding the image bytes
     * @throws IOException if the next two characters are not "ID" or reading fails
     */
    private Operator parseInlineImageData() throws IOException
    {
        String id = "" + (char) seqSource.read() + (char) seqSource.read();
        if (!id.equals("ID"))
        {
            throw new IOException("Error: Expected operator 'ID' actual='" + id + "'");
        }
        ByteArrayOutputStream imageData = new ByteArrayOutputStream();
        if (isWhitespace())
        {
            //pull off the whitespace character
            seqSource.read();
        }
        int lastByte = seqSource.read();
        int currentByte = seqSource.read();
        // PDF spec is kinda unclear about this. Should a whitespace
        // always appear before EI? Not sure, so that we just read
        // until EI<whitespace>.
        // Be aware not all kind of whitespaces are allowed here. see PDFBOX-1561
        while (!(lastByte == 'E' &&
            currentByte == 'I' &&
            hasNextSpaceOrReturn() &&
            hasNoFollowingBinData(seqSource)) &&
            !seqSource.isEOF())
        {
            imageData.write(lastByte);
            lastByte = currentByte;
            currentByte = seqSource.read();
        }
        // the EI operator isn't unread, as it won't be processed anyway
        Operator idOperator = Operator.getOperator("ID");
        // save the image data to the operator, so that it can be accessed later
        idOperator.setImageData(imageData.toByteArray());
        return idOperator;
    }

    /**
     * Looks up an amount of bytes if they contain only ASCII characters (no
     * control sequences etc.), and that these ASCII characters begin with a
     * sequence of 1-3 non-blank characters between blanks.
     *
     * <p>The inspected bytes are pushed back onto the source afterwards, so
     * the read position is unchanged when this method returns.
     *
     * @param pdfSource the source to look ahead in
     * @return <code>true</code> if next bytes are probably printable ASCII
     * characters starting with a PDF operator, otherwise <code>false</code>
     * @throws IOException if reading from or unreading to the source fails
     */
    private boolean hasNoFollowingBinData(SequentialSource pdfSource) throws IOException
    {
        // as suggested in PDFBOX-1164
        final int readBytes = pdfSource.read(binCharTestArr, 0, MAX_BIN_CHAR_TEST_LENGTH);
        boolean noBinData = true;
        int startOpIdx = -1;
        int endOpIdx = -1;

        if (readBytes > 0)
        {
            for (int bIdx = 0; bIdx < readBytes; bIdx++)
            {
                final byte b = binCharTestArr[bIdx];
                if ((b < 0x09) || ((b > 0x0a) && (b < 0x20) && (b != 0x0d)))
                {
                    // control character or > 0x7f (negative as a signed byte)
                    // -> we have binary data
                    noBinData = false;
                    break;
                }
                final boolean blank = b == 9 || b == 0x20 || b == 0x0a || b == 0x0d;
                // find the start and the end of a PDF operator
                if (startOpIdx == -1 && !blank)
                {
                    startOpIdx = bIdx;
                }
                else if (startOpIdx != -1 && endOpIdx == -1 && blank)
                {
                    endOpIdx = bIdx;
                }
            }

            // only if not close to eof
            if (readBytes == MAX_BIN_CHAR_TEST_LENGTH)
            {
                // an operator still running at the end of the window is
                // measured up to the window end
                if (startOpIdx != -1 && endOpIdx == -1)
                {
                    endOpIdx = MAX_BIN_CHAR_TEST_LENGTH;
                }
                // a PDF operator is 1-3 bytes long
                if (endOpIdx != -1 && startOpIdx != -1 && endOpIdx - startOpIdx > 3)
                {
                    noBinData = false;
                }
            }
            pdfSource.unread(Arrays.copyOfRange(binCharTestArr, 0, readBytes));
        }

        if (!noBinData)
        {
            Log.w("PdfBox-Android", "ignoring 'EI' assumed to be in the middle of inline image");
        }

        return noBinData;
    }

    /**
     * Check whether the output stream ends with 70 ASCII85 data bytes
     * (33..117). This method is to be called when "EI" and then space/LF/CR
     * are detected.
     *
     * <p>NOTE(review): nothing in this class calls this method at present.
     *
     * @param imageData output data stream without the "EI"
     * @return true if this is an ASCII85 line so the "EI" is to be considered
     * part of the data stream, false if not
     */
    private boolean hasPrecedingAscii85Data(ByteArrayOutputStream imageData)
    {
        if (imageData.size() < 70)
        {
            return false;
        }
        byte[] tab = imageData.toByteArray();
        for (int i = tab.length - 1; i >= tab.length - 70; --i)
        {
            if (tab[i] < 33 || tab[i] > 117)
            {
                return false;
            }
        }
        return true;
    }

    /**
     * This will read an operator from the stream. Reading stops at end of
     * stream, whitespace, a closing character, one of '[', '&lt;', '(' or '/',
     * or a digit - except that a '0' or '1' directly after a 'd' is kept as
     * part of the operator.
     *
     * @return The operator that was read from the stream.
     * @throws IOException If there is an error reading from the stream.
     */
    protected String readOperator() throws IOException
    {
        skipSpaces();

        //average string size is around 2 and the normal string buffer size is
        //about 16 so lets save some space.
        StringBuilder buffer = new StringBuilder(4);
        int nextChar = seqSource.peek();
        while (
            nextChar != -1 && // EOF
                !isWhitespace(nextChar) &&
                !isClosing(nextChar) &&
                nextChar != '[' &&
                nextChar != '<' &&
                nextChar != '(' &&
                nextChar != '/' &&
                (nextChar < '0' ||
                    nextChar > '9'))
        {
            char currentChar = (char) seqSource.read();
            nextChar = seqSource.peek();
            buffer.append(currentChar);
            // Type3 Glyph description has operators with a number in the name
            if (currentChar == 'd' && (nextChar == '0' || nextChar == '1'))
            {
                buffer.append((char) seqSource.read());
                nextChar = seqSource.peek();
            }
        }
        return buffer.toString();
    }

    /**
     * Checks if the given character is a space, a line feed or a carriage return.
     *
     * @param c the character code to test
     * @return true if c is 10 (LF), 13 (CR) or 32 (space)
     */
    private boolean isSpaceOrReturn(int c)
    {
        return c == 10 || c == 13 || c == 32;
    }

    /**
     * Checks if the next char is a space or a return.
     *
     * @return true if the next char is a space or a return
     * @throws IOException if something went wrong
     */
    private boolean hasNextSpaceOrReturn() throws IOException
    {
        return isSpaceOrReturn(seqSource.peek());
    }
}