package com.tom_roush.pdfbox.pdfparser; import android.util.Log; import com.tom_roush.pdfbox.contentstream.PDContentStream; import com.tom_roush.pdfbox.contentstream.operator.Operator; import com.tom_roush.pdfbox.cos.COSBase; import com.tom_roush.pdfbox.cos.COSBoolean; import com.tom_roush.pdfbox.cos.COSDictionary; import com.tom_roush.pdfbox.cos.COSName; import com.tom_roush.pdfbox.cos.COSNull; import com.tom_roush.pdfbox.cos.COSNumber; import com.tom_roush.pdfbox.cos.COSObject; import com.tom_roush.pdfbox.cos.COSStream; import com.tom_roush.pdfbox.pdmodel.common.PDStream; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.List; /** * This will parse a PDF byte stream and extract operands and such. * * @author Ben Litchfield */ public class PDFStreamParser extends BaseParser { private final List<Object> streamObjects = new ArrayList<Object>(100); private static final int MAX_BIN_CHAR_TEST_LENGTH = 10; private final byte[] binCharTestArr = new byte[MAX_BIN_CHAR_TEST_LENGTH]; /** * Constructor. * * @param stream The stream to parse. * @throws IOException If there is an error initializing the stream. * @deprecated Use {@link PDFStreamParser#PDFStreamParser(PDContentStream)} instead. */ @Deprecated public PDFStreamParser(PDStream stream) throws IOException { super(new InputStreamSource(stream.createInputStream())); } /** * Constructor. * * @param stream The stream to parse. * @throws IOException If there is an error initializing the stream. * @deprecated Use {@link PDFStreamParser#PDFStreamParser(PDContentStream)} instead. */ @Deprecated public PDFStreamParser(COSStream stream) throws IOException { super(new InputStreamSource(stream.getUnfilteredStream())); } /** * Constructor. * * @param contentStream The content stream to parse. * @throws IOException If there is an error initializing the stream. */ public PDFStreamParser(PDContentStream contentStream) throws IOException { super(new InputStreamSource(contentStream.getContents())); } /** * Constructor. * * @param bytes the bytes to parse. * @throws IOException If there is an error initializing the stream. */ public PDFStreamParser(byte[] bytes) throws IOException { super(new InputStreamSource(new ByteArrayInputStream(bytes))); } /** * This will parse the tokens in the stream. This will close the * stream when it is finished parsing. * * @throws IOException If there is an error while parsing the stream. */ public void parse() throws IOException { Object token; while ((token = parseNextToken()) != null) { streamObjects.add(token); } } /** * This will get the tokens that were parsed from the stream. * * @return All of the tokens in the stream. */ public List<Object> getTokens() { return streamObjects; } /** * This will parse the next token in the stream. * * @return The next token in the stream or null if there are no more tokens in the stream. * @throws IOException If an io error occurs while parsing the stream. */ public Object parseNextToken() throws IOException { Object retval; skipSpaces(); int nextByte = seqSource.peek(); if (((byte) nextByte) == -1) { return null; } char c = (char) nextByte; switch (c) { case '<': { //pull off first left bracket int leftBracket = seqSource.read(); //check for second left bracket c = (char) seqSource.peek(); //put back first bracket seqSource.unread(leftBracket); if (c == '<') { retval = parseCOSDictionary(); } else { retval = parseCOSString(); } break; } case '[': { // array retval = parseCOSArray(); break; } case '(': // string retval = parseCOSString(); break; case '/': // name retval = parseCOSName(); break; case 'n': { // null String nullString = readString(); if (nullString.equals("null")) { retval = COSNull.NULL; } else { retval = Operator.getOperator(nullString); } break; } case 't': case 'f': { String next = readString(); if (next.equals("true")) { retval = COSBoolean.TRUE; break; } else if (next.equals("false")) { retval = COSBoolean.FALSE; } else { retval = Operator.getOperator(next); } break; } case 'R': { String line = readString(); if (line.equals("R")) { retval = new COSObject(null); } else { retval = Operator.getOperator(line); } break; } case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case '-': case '+': case '.': { /* We will be filling buf with the rest of the number. Only * allow 1 "." and "-" and "+" at start of number. */ StringBuffer buf = new StringBuffer(); buf.append(c); seqSource.read(); boolean dotNotRead = c != '.'; while (Character.isDigit(c = (char) seqSource.peek()) || dotNotRead && c == '.') { buf.append(c); seqSource.read(); if (dotNotRead && c == '.') { dotNotRead = false; } } retval = COSNumber.get(buf.toString()); break; } case 'B': { String next = readString(); retval = Operator.getOperator(next); if (next.equals("BI")) { Operator beginImageOP = (Operator) retval; COSDictionary imageParams = new COSDictionary(); beginImageOP.setImageParameters(imageParams); Object nextToken = null; while ((nextToken = parseNextToken()) instanceof COSName) { Object value = parseNextToken(); imageParams.setItem((COSName) nextToken, (COSBase) value); } //final token will be the image data, maybe?? Operator imageData = (Operator) nextToken; beginImageOP.setImageData(imageData.getImageData()); } break; } case 'I': { //Special case for ID operator String id = "" + (char) seqSource.read() + (char) seqSource.read(); if (!id.equals("ID")) { throw new IOException("Error: Expected operator 'ID' actual='" + id + "'"); } ByteArrayOutputStream imageData = new ByteArrayOutputStream(); if (isWhitespace()) { //pull off the whitespace character seqSource.read(); } int lastByte = seqSource.read(); int currentByte = seqSource.read(); // PDF spec is kinda unclear about this. Should a whitespace // always appear before EI? Not sure, so that we just read // until EI<whitespace>. // Be aware not all kind of whitespaces are allowed here. see PDFBOX-1561 while (!(lastByte == 'E' && currentByte == 'I' && hasNextSpaceOrReturn() && hasNoFollowingBinData(seqSource)) && !seqSource.isEOF()) { imageData.write(lastByte); lastByte = currentByte; currentByte = seqSource.read(); } // the EI operator isn't unread, as it won't be processed anyway retval = Operator.getOperator("ID"); // save the image data to the operator, so that it can be accessed later ((Operator) retval).setImageData(imageData.toByteArray()); break; } case ']': { // some ']' around without its previous '[' // this means a PDF is somewhat corrupt but we will continue to parse. seqSource.read(); // must be a better solution than null... retval = COSNull.NULL; break; } default: { //we must be an operator String operator = readOperator(); if (operator.trim().length() == 0) { //we have a corrupt stream, stop reading here retval = null; } else { retval = Operator.getOperator(operator); } } } return retval; } /** * Looks up an amount of bytes if they contain only ASCII characters (no * control sequences etc.), and that these ASCII characters begin with a * sequence of 1-3 non-blank characters between blanks * * @return <code>true</code> if next bytes are probably printable ASCII * characters starting with a PDF operator, otherwise <code>false</code> */ private boolean hasNoFollowingBinData(SequentialSource pdfSource) throws IOException { // as suggested in PDFBOX-1164 final int readBytes = pdfSource.read(binCharTestArr, 0, MAX_BIN_CHAR_TEST_LENGTH); boolean noBinData = true; int startOpIdx = -1; int endOpIdx = -1; if (readBytes > 0) { for (int bIdx = 0; bIdx < readBytes; bIdx++) { final byte b = binCharTestArr[bIdx]; if ((b < 0x09) || ((b > 0x0a) && (b < 0x20) && (b != 0x0d))) { // control character or > 0x7f -> we have binary data noBinData = false; break; } // find the start of a PDF operator if (startOpIdx == -1 && !(b == 9 || b == 0x20 || b == 0x0a || b == 0x0d)) { startOpIdx = bIdx; } else if (startOpIdx != -1 && endOpIdx == -1 && (b == 9 || b == 0x20 || b == 0x0a || b == 0x0d)) { endOpIdx = bIdx; } } // only if not close to eof if (readBytes == MAX_BIN_CHAR_TEST_LENGTH) { // a PDF operator is 1-3 bytes long if (startOpIdx != -1 && endOpIdx == -1) { endOpIdx = MAX_BIN_CHAR_TEST_LENGTH; } if (endOpIdx != -1 && startOpIdx != -1 && endOpIdx - startOpIdx > 3) { noBinData = false; } } pdfSource.unread(Arrays.copyOfRange(binCharTestArr, 0, readBytes)); } if (!noBinData) { Log.w("PdfBox-Android", "ignoring 'EI' assumed to be in the middle of inline image"); } return noBinData; } /** * Check whether the output stream ends with 70 ASCII85 data bytes * (33..117). This method is to be called when "EI" and then space/LF/CR * are detected. * * @param imageData output data stream without the "EI" * @return true if this is an ASCII85 line so the "EI" is to be considered * part of the data stream, false if not */ private boolean hasPrecedingAscii85Data(ByteArrayOutputStream imageData) { if (imageData.size() < 70) { return false; } byte[] tab = imageData.toByteArray(); for (int i = tab.length - 1; i >= tab.length - 70; --i) { if (tab[i] < 33 || tab[i] > 117) { return false; } } return true; } /** * This will read an operator from the stream. * * @return The operator that was read from the stream. * @throws IOException If there is an error reading from the stream. */ protected String readOperator() throws IOException { skipSpaces(); //average string size is around 2 and the normal string buffer size is //about 16 so lets save some space. StringBuffer buffer = new StringBuffer(4); int nextChar = seqSource.peek(); while ( nextChar != -1 && // EOF !isWhitespace(nextChar) && !isClosing(nextChar) && nextChar != '[' && nextChar != '<' && nextChar != '(' && nextChar != '/' && (nextChar < '0' || nextChar > '9')) { char currentChar = (char) seqSource.read(); nextChar = seqSource.peek(); buffer.append(currentChar); // Type3 Glyph description has operators with a number in the name if (currentChar == 'd' && (nextChar == '0' || nextChar == '1')) { buffer.append((char) seqSource.read()); nextChar = seqSource.peek(); } } return buffer.toString(); } private boolean isSpaceOrReturn(int c) { return c == 10 || c == 13 || c == 32; } /** * Checks if the next char is a space or a return. * * @return true if the next char is a space or a return * @throws IOException if something went wrong */ private boolean hasNextSpaceOrReturn() throws IOException { return isSpaceOrReturn(seqSource.peek()); } }