/* * Copyright 2010 adam. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * under the License. */ package org.apache.pdfbox.pdfparser; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Set; import org.apache.pdfbox.cos.COSArray; import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSDictionary; import org.apache.pdfbox.cos.COSDocument; import org.apache.pdfbox.cos.COSFloat; import org.apache.pdfbox.cos.COSInteger; import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.cos.COSNumber; import org.apache.pdfbox.cos.COSObject; import org.apache.pdfbox.cos.COSString; import org.apache.pdfbox.cos.COSUnread; import org.apache.pdfbox.io.RandomAccess; import org.apache.pdfbox.io.RandomAccessFile; import org.apache.pdfbox.pdmodel.ConformingPDDocument; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.common.XrefEntry; import org.apache.pdfbox.persistence.util.COSObjectKey; /** * * @author <a href="adam@apache.org">Adam Nichols</a> */ public class ConformingPDFParser extends BaseParser { protected RandomAccess inputFile; List<XrefEntry> xrefEntries; private long currentOffset; private ConformingPDDocument doc = null; private boolean throwNonConformingException = true; private boolean recursivlyRead = true; /** * Constructor. * * @param input The input stream that contains the PDF document. * * @throws IOException If there is an error initializing the stream. */ public ConformingPDFParser(File inputFile) throws IOException { this.inputFile = new RandomAccessFile(inputFile, "r"); } /** * This will parse the stream and populate the COSDocument object. This will close * the stream when it is done parsing. * * @throws IOException If there is an error reading from the stream or corrupt data * is found. */ public void parse() throws IOException { document = new COSDocument(); doc = new ConformingPDDocument(document); currentOffset = inputFile.length()-1; long xRefTableLocation = parseTrailerInformation(); currentOffset = xRefTableLocation; parseXrefTable(); // now that we read the xref table and put null references in the doc, // we can deference those objects now. boolean oldValue = recursivlyRead; recursivlyRead = false; List<COSObjectKey> keys = doc.getObjectKeysFromPool(); for(COSObjectKey key : keys) { // getObject will put it into the document's object pool for us getObject(key.getNumber(), key.getGeneration()); } recursivlyRead = oldValue; } /** * This will get the document that was parsed. parse() must be called before this is called. * When you are done with this document you must call close() on it to release * resources. * * @return The document that was parsed. * * @throws IOException If there is an error getting the document. */ public COSDocument getDocument() throws IOException { if( document == null ) { throw new IOException( "You must call parse() before calling getDocument()" ); } return document; } /** * This will get the PD document that was parsed. When you are done with * this document you must call close() on it to release resources. * * @return The document at the PD layer. * * @throws IOException If there is an error getting the document. */ public PDDocument getPDDocument() throws IOException { return doc; } private boolean parseXrefTable() throws IOException { String currentLine = readLine(); if(throwNonConformingException) { if(!"xref".equals(currentLine)) throw new AssertionError("xref table not found.\nExpected: xref\nFound: "+currentLine); } int objectNumber = readInt(); int entries = readInt(); xrefEntries = new ArrayList<XrefEntry>(entries); for(int i=0; i<entries; i++) xrefEntries.add(new XrefEntry(objectNumber++, readInt(), readInt(), readLine())); return true; } protected long parseTrailerInformation() throws IOException, NumberFormatException { long xrefLocation = -1; consumeWhitespaceBackwards(); String currentLine = readLineBackwards(); if(throwNonConformingException) { if(!"%%EOF".equals(currentLine)) throw new AssertionError("Invalid EOF marker.\nExpected: %%EOF\nFound: "+currentLine); } xrefLocation = readLongBackwards(); currentLine = readLineBackwards(); if(throwNonConformingException) { if(!"startxref".equals(currentLine)) throw new AssertionError("Invalid trailer.\nExpected: startxref\nFound: "+currentLine); } document.setTrailer(readDictionaryBackwards()); consumeWhitespaceBackwards(); currentLine = readLineBackwards(); if(throwNonConformingException) { if(!"trailer".equals(currentLine)) throw new AssertionError("Invalid trailer.\nExpected: trailer\nFound: "+currentLine); } return xrefLocation; } protected byte readByteBackwards() throws IOException { inputFile.seek(currentOffset); byte singleByte = (byte)inputFile.read(); currentOffset--; return singleByte; } protected byte readByte() throws IOException { inputFile.seek(currentOffset); byte singleByte = (byte)inputFile.read(); currentOffset++; return singleByte; } protected String readBackwardUntilWhitespace() throws IOException { StringBuilder sb = new StringBuilder(); byte singleByte = readByteBackwards(); while(!isWhitespace(singleByte)) { sb.insert(0, (char)singleByte); singleByte = readByteBackwards(); } return sb.toString(); } /** * This will read all bytes (backwards) until a non-whitespace character is * found. To save you an extra read, the non-whitespace character is * returned. If the current character is not whitespace, this method will * just return the current char. * @return the first non-whitespace character found * @throws IOException if there is an error reading from the file */ protected byte consumeWhitespaceBackwards() throws IOException { inputFile.seek(currentOffset); byte singleByte = (byte)inputFile.read(); if(!isWhitespace(singleByte)) return singleByte; // we have some whitespace, let's consume it while(isWhitespace(singleByte)) { singleByte = readByteBackwards(); } // readByteBackwards will decrement the currentOffset to point the byte // before the one just read, so we increment it back to the current byte currentOffset++; return singleByte; } /** * This will read all bytes until a non-whitespace character is * found. To save you an extra read, the non-whitespace character is * returned. If the current character is not whitespace, this method will * just return the current char. * @return the first non-whitespace character found * @throws IOException if there is an error reading from the file */ protected byte consumeWhitespace() throws IOException { inputFile.seek(currentOffset); byte singleByte = (byte)inputFile.read(); if(!isWhitespace(singleByte)) return singleByte; // we have some whitespace, let's consume it while(isWhitespace(singleByte)) { singleByte = readByte(); } // readByte() will increment the currentOffset to point the byte // after the one just read, so we decrement it back to the current byte currentOffset--; return singleByte; } /** * This will consume any whitespace, read in bytes until whitespace is found * again and then parse the characters which have been read as a long. The * current offset will then point at the first whitespace character which * preceeds the number. * @return the parsed number * @throws IOException if there is an error reading from the file * @throws NumberFormatException if the bytes read can not be converted to a number */ protected long readLongBackwards() throws IOException, NumberFormatException { StringBuilder sb = new StringBuilder(); consumeWhitespaceBackwards(); byte singleByte = readByteBackwards(); while(!isWhitespace(singleByte)) { sb.insert(0, (char)singleByte); singleByte = readByteBackwards(); } if(sb.length() == 0) throw new AssertionError("Number not found. Expected number at offset: " + currentOffset); return Long.parseLong(sb.toString()); } @Override protected int readInt() throws IOException { StringBuilder sb = new StringBuilder(); consumeWhitespace(); byte singleByte = readByte(); while(!isWhitespace(singleByte)) { sb.append((char)singleByte); singleByte = readByte(); } if(sb.length() == 0) throw new AssertionError("Number not found. Expected number at offset: " + currentOffset); return Integer.parseInt(sb.toString()); } /** * This will read in a number and return the COS version of the number (be * it a COSInteger or a COSFloat). * @return the COSNumber which was read/parsed * @throws IOException */ protected COSNumber readNumber() throws IOException { StringBuilder sb = new StringBuilder(); consumeWhitespace(); byte singleByte = readByte(); while(!isWhitespace(singleByte)) { sb.append((char)singleByte); singleByte = readByte(); } if(sb.length() == 0) throw new AssertionError("Number not found. Expected number at offset: " + currentOffset); return parseNumber(sb.toString()); } protected COSNumber parseNumber(String number) throws IOException { if(number.matches("^[0-9]+$")) return COSInteger.get(number); return new COSFloat(Float.parseFloat(number)); } protected COSBase processCosObject(String string) throws IOException { if(string != null && string.endsWith(">")) { // string of hex codes return COSString.createFromHexString(string.replaceAll("^<", "").replaceAll(">$", "")); } return null; } protected COSBase readObjectBackwards() throws IOException { COSBase obj = null; consumeWhitespaceBackwards(); String lastSection = readBackwardUntilWhitespace(); if("R".equals(lastSection)) { // indirect reference long gen = readLongBackwards(); long number = readLongBackwards(); // We just put a placeholder in the pool for now, we'll read the data later doc.putObjectInPool(new COSUnread(), number, gen); obj = new COSUnread(number, gen, this); } else if(">>".equals(lastSection)) { // dictionary throw new RuntimeException("Not yet implemented"); } else if(lastSection != null && lastSection.endsWith("]")) { // array COSArray array = new COSArray(); lastSection = lastSection.replaceAll("]$", ""); while(!lastSection.startsWith("[")) { if(lastSection.matches("^\\s*<.*>\\s*$")) // it's a hex string array.add(COSString.createFromHexString(lastSection.replaceAll("^\\s*<", "").replaceAll(">\\s*$", ""))); lastSection = readBackwardUntilWhitespace(); } lastSection = lastSection.replaceAll("^\\[", ""); if(lastSection.matches("^\\s*<.*>\\s*$")) // it's a hex string array.add(COSString.createFromHexString(lastSection.replaceAll("^\\s*<", "").replaceAll(">\\s*$", ""))); obj = array; } else if(lastSection != null && lastSection.endsWith(">")) { // string of hex codes obj = processCosObject(lastSection); } else { // try a number, otherwise fall back on a string try { Long.parseLong(lastSection); obj = COSNumber.get(lastSection); } catch(NumberFormatException e) { throw new RuntimeException("Not yet implemented"); } } return obj; } protected COSName readNameBackwards() throws IOException { String name = readBackwardUntilWhitespace(); name = name.replaceAll("^/", ""); return COSName.getPDFName(name); } public COSBase getObject(long objectNumber, long generation) throws IOException { // we could optionally, check to see if parse() have been called & // throw an exception here, but I don't think that's really necessary XrefEntry entry = xrefEntries.get((int)objectNumber); currentOffset = entry.getByteOffset(); return readObject(objectNumber, generation); } /** * This will read an object from the inputFile at whatever our currentOffset * is. If the object and generation are not the expected values and this * object is set to throw an exception for non-conforming documents, then an * exception will be thrown. * @param objectNumber the object number you expect to read * @param generation the generation you expect this object to be * @return */ public COSBase readObject(long objectNumber, long generation) throws IOException { // when recursivly reading, we always pull the object from the filesystem if(document != null && recursivlyRead) { // check to see if it is in the document cache before hitting the filesystem COSBase obj = doc.getObjectFromPool(objectNumber, generation); if(obj != null) return obj; } int actualObjectNumber = readInt(); if(objectNumber != actualObjectNumber) if(throwNonConformingException) throw new AssertionError("Object numer expected was " + objectNumber + " but actual was " + actualObjectNumber); consumeWhitespace(); int actualGeneration = readInt(); if(generation != actualGeneration) if(throwNonConformingException) throw new AssertionError("Generation expected was " + generation + " but actual was " + actualGeneration); consumeWhitespace(); String obj = readWord(); if(!"obj".equals(obj)) if(throwNonConformingException) throw new AssertionError("Expected keyword 'obj' but found " + obj); // put placeholder object in doc to prevent infinite recursion // e.g. read Root -> dereference object -> read object which has /Parent -> GOTO read Root doc.putObjectInPool(new COSObject(null), objectNumber, generation); COSBase object = readObject(); doc.putObjectInPool(object, objectNumber, generation); return object; } /** * This actually reads the object data. * @return the object which is read * @throws IOException */ protected COSBase readObject() throws IOException { consumeWhitespace(); String string = readWord(); if(string.startsWith("<<")) { // this is a dictionary COSDictionary dictionary = new COSDictionary(); boolean atEndOfDictionary = false; // remove the marker for the beginning of the dictionary string = string.replaceAll("^<<", ""); if("".equals(string) || string.matches("^\\w$")) string = readWord().trim(); while(!atEndOfDictionary) { COSName name = COSName.getPDFName(string); COSBase object = readObject(); dictionary.setItem(name, object); byte singleByte = consumeWhitespace(); if(singleByte == '>') { readByte(); // get rid of the second '>' atEndOfDictionary = true; } if(!atEndOfDictionary) string = readWord().trim(); } return dictionary; } else if(string.startsWith("/")) { // it's a dictionary label. i.e. /Type or /Pages or something similar COSBase name = COSName.getPDFName(string); return name; } else if(string.startsWith("-")) { // it's a negitive number return parseNumber(string); } else if(string.charAt(0) >= '0' && string.charAt(0) <= '9' ) { // it's a COSInt or COSFloat, or a weak reference (i.e. "3 0 R") // we'll have to peek ahead a little to see if it's a reference or not long tempOffset = this.currentOffset; consumeWhitespace(); String tempString = readWord(); if(tempString.matches("^[0-9]+$")) { // it is an int, might be a weak reference... tempString = readWord(); if(!"R".equals(tempString)) { // it's just a number, not a weak reference this.currentOffset = tempOffset; return parseNumber(string); } } else { // it's just a number, not a weak reference this.currentOffset = tempOffset; return parseNumber(string); } // it wasn't a number, so we need to parse the weak-reference this.currentOffset = tempOffset; int number = Integer.parseInt(string); int gen = readInt(); String r = readWord(); if(!"R".equals(r)) if(throwNonConformingException) throw new AssertionError("Expected keyword 'R' but found " + r); if(recursivlyRead) { // seek to the object, read it, seek back to current location long tempLocation = this.currentOffset; this.currentOffset = this.xrefEntries.get(number).getByteOffset(); COSBase returnValue = readObject(number, gen); this.currentOffset = tempLocation; return returnValue; } else { // Put a COSUnknown there as a placeholder COSObject obj = new COSObject(new COSUnread()); obj.setObjectNumber(COSInteger.get(number)); obj.setGenerationNumber(COSInteger.get(gen)); return obj; } } else if(string.startsWith("]")) { // end of an array, just return null if("]".equals(string)) return null; int oldLength = string.length(); this.currentOffset -= oldLength; return null; } else if(string.startsWith("[")) { // array of values // we'll just pay attention to the first part (this is in case there // is no whitespace between the "[" and the first element) int oldLength = string.length(); string = "["; this.currentOffset -= (oldLength - string.length() + 1); COSArray array = new COSArray(); COSBase object = readObject(); while(object != null) { array.add(object); object = readObject(); } return array; } else if(string.startsWith("(")) { // this is a string (not hex encoded), strip off the '(' and read until ')' StringBuilder sb = new StringBuilder(string.substring(1)); byte singleByte = readByte(); while(singleByte != ')') { sb.append((char)singleByte); singleByte = readByte(); } return new COSString(sb.toString()); } else { throw new RuntimeException("Not yet implemented: " + string + " loation=" + this.currentOffset); } } /** * This will read the next string from the stream. * @return The string that was read from the stream. * @throws IOException If there is an error reading from the stream. */ @Override protected String readString() throws IOException { consumeWhitespace(); StringBuilder buffer = new StringBuilder(); int c = pdfSource.read(); while(!isEndOfName((char)c) && !isClosing(c) && c != -1) { buffer.append( (char)c ); c = pdfSource.read(); } if (c != -1) { pdfSource.unread(c); } return buffer.toString(); } protected COSDictionary readDictionaryBackwards() throws IOException { COSDictionary dict = new COSDictionary(); // consume the last two '>' chars which signify the end of the dictionary consumeWhitespaceBackwards(); byte singleByte = readByteBackwards(); if(throwNonConformingException) { if(singleByte != '>') throw new AssertionError(""); } singleByte = readByteBackwards(); if(throwNonConformingException) { if(singleByte != '>') throw new AssertionError(""); } // check to see if we're at the end of the dictionary boolean atEndOfDictionary = false; singleByte = consumeWhitespaceBackwards(); if(singleByte == '<') { inputFile.seek(currentOffset-1); atEndOfDictionary = ((byte)inputFile.read()) == '<'; } COSDictionary backwardsDictionary = new COSDictionary(); // while we're not at the end of the dictionary, read in entries while(!atEndOfDictionary) { COSBase object = readObjectBackwards(); COSName name = readNameBackwards(); backwardsDictionary.setItem(name, object); singleByte = consumeWhitespaceBackwards(); if(singleByte == '<') { inputFile.seek(currentOffset-1); atEndOfDictionary = ((byte)inputFile.read()) == '<'; } } // the dictionaries preserve the order keys were added, as such we shall // add them in the proper order, not the reverse order Set<COSName> backwardsKeys = backwardsDictionary.keySet(); for(int i = backwardsKeys.size()-1; i >=0; i--) dict.setItem((COSName)backwardsKeys.toArray()[i], backwardsDictionary.getItem((COSName)backwardsKeys.toArray()[i])); // consume the last two '<' chars readByteBackwards(); readByteBackwards(); return dict; } /** * This will read a line starting with the byte at offset and going * backwards until it finds a newline. This should only be used if we are * certain that the data will only be text, and not binary data. * * @param offset the location of the file where we should start reading * @return the string which was read * @throws IOException if there was an error reading data from the file */ protected String readLineBackwards() throws IOException { StringBuilder sb = new StringBuilder(); boolean endOfObject = false; do { // first we read the %%EOF marker byte singleByte = readByteBackwards(); if(singleByte == '\n') { // if ther's a preceeding \r, we'll eat that as well inputFile.seek(currentOffset); if((byte)inputFile.read() == '\r') currentOffset--; endOfObject = true; } else if(singleByte == '\r') { endOfObject = true; } else { sb.insert(0, (char)singleByte); } } while(!endOfObject); return sb.toString(); } /** * This will read a line starting with the byte at offset and going * forward until it finds a newline. This should only be used if we are * certain that the data will only be text, and not binary data. * @param offset the location of the file where we should start reading * @return the string which was read * @throws IOException if there was an error reading data from the file */ @Override protected String readLine() throws IOException { StringBuilder sb = new StringBuilder(); boolean endOfLine = false; do { // first we read the %%EOF marker byte singleByte = readByte(); if(singleByte == '\n') { // if ther's a preceeding \r, we'll eat that as well inputFile.seek(currentOffset); if((byte)inputFile.read() == '\r') currentOffset++; endOfLine = true; } else if(singleByte == '\r') { endOfLine = true; } else { sb.append((char)singleByte); } } while(!endOfLine); return sb.toString(); } protected String readWord() throws IOException { StringBuilder sb = new StringBuilder(); boolean stop = true; do { byte singleByte = readByte(); stop = this.isWhitespace(singleByte); // there are some additional characters which indicate the next element/word has begun // ignore the first char we read, b/c the first char is the beginnging of this object, not the next one if(!stop && sb.length() > 0) { stop = singleByte == '/' || singleByte == '[' || singleByte == ']' || (singleByte == '>' && !">".equals(sb.toString())); if(stop) // we're stopping on a non-whitespace char, decrement the this.currentOffset--; // counter so we don't miss this character } if(!stop) sb.append((char)singleByte); } while(!stop); return sb.toString(); } /** * @return the recursivlyRead */ public boolean isRecursivlyRead() { return recursivlyRead; } /** * @param recursivlyRead the recursivlyRead to set */ public void setRecursivlyRead(boolean recursivlyRead) { this.recursivlyRead = recursivlyRead; } }