/* * Copyright 2006-2012 ICEsoft Technologies Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the * License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an "AS * IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either * express or implied. See the License for the specific language * governing permissions and limitations under the License. */ package org.icepdf.core.util; import org.icepdf.core.exceptions.PDFException; import org.icepdf.core.io.ConservativeSizingByteArrayOutputStream; import org.icepdf.core.io.SeekableByteArrayInputStream; import org.icepdf.core.io.SeekableInput; import org.icepdf.core.io.SeekableInputConstrainedWrapper; import org.icepdf.core.pobjects.*; import org.icepdf.core.pobjects.annotations.Annotation; import org.icepdf.core.pobjects.fonts.FontDescriptor; import org.icepdf.core.pobjects.fonts.FontFactory; import org.icepdf.core.pobjects.graphics.TilingPattern; import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.util.Hashtable; import java.util.Stack; import java.util.Vector; import java.util.logging.Logger; import java.util.logging.Level; /** * put your documentation comment here */ public class Parser { private static final Logger logger = Logger.getLogger(Parser.class.toString()); public static final int PARSE_MODE_NORMAL = 0; public static final int PARSE_MODE_OBJECT_STREAM = 1; // InputStream has to support mark(), reset(), and markSupported() // DO NOT close this, since we have two cases: read everything up front, and progressive reads private InputStream reader; boolean lastTokenHString = false; private Stack<Object> stack = new Stack<Object>(); private int parseMode; public Parser(SeekableInput r) { this(r, PARSE_MODE_NORMAL); } public Parser(SeekableInput r, int pm) { reader = r.getInputStream(); parseMode = pm; } public Parser(InputStream r) { this(r, PARSE_MODE_NORMAL); } public Parser(InputStream r, int pm) { reader = new BufferedInputStream(r); parseMode = pm; } /** * Get an object from the pdf input DataInputStream. * * @param library all found objects in the pdf document * @return the next object in the DataInputStream. Null is returned * if there are no more objects left in the DataInputStream or * a I/O error is encountered. * @throws PDFException error getting object from library */ public Object getObject(Library library) throws PDFException { int deepnessCount = 0; boolean inObject = false; // currently parsing tokens in an object boolean complete = false; // flag used for do loop. Object nextToken; Reference objectReference = null; try { do { //while (!complete); // keep track of currently parsed objects reference // get the next token inside the object stream try { nextToken = getToken(); //System.out.println("Parser.getObject() nextToken: " + nextToken); // commented out for performance reasons //Thread.yield(); } catch (IOException e) { // eat it as it is what is expected // if (Debug.ex){ // Debug.ex(e); // } return null; } // check for specific primative object types returned by getToken() if (nextToken instanceof StringObject || nextToken instanceof Name || nextToken instanceof Number) { // Very Important, store the PDF object reference information, // as it is needed when to decrypt an encrypted string. if (nextToken instanceof StringObject) { StringObject tmp = (StringObject) nextToken; tmp.setReference(objectReference); } stack.push(nextToken); } // mark that we have entered a object declaration else if (nextToken.equals("obj")) { // Since we can return objects on "endstream", then we can // leave straggling "endobj", which would deepnessCount--, // even though they're done in a separate method invocation // Hence, "obj" does /deepnessCount = 1/ instead of /deepnessCount++/ deepnessCount = 1; inObject = true; Number generationNumber = (Number) (stack.pop()); Number objectNumber = (Number) (stack.pop()); objectReference = new Reference(objectNumber, generationNumber); } // mark that we have reached the end of the object else if (nextToken.equals("endobj")) { deepnessCount--; //System.out.println("Parser.getObject() endobj objectReference: " + objectReference + " deepnessCount: " + deepnessCount); if (inObject) { // set flag to false, as we are done parsing an Object inObject = false; // return PObject, return addPObject(library, objectReference); // else, we ignore as the endStream token also returns a // PObject. } else return null; } // found endstream object, we will return the PObject containing // the stream as there can be no further tokens. This addresses // an incorrect a syntax error with OpenOffice document where // the endobj tag is missing on some Stream objects. else if (nextToken.equals("endstream")) { deepnessCount--; // do nothing, but don't add it to the stack if (inObject) { inObject = false; // return PObject, return addPObject(library, objectReference); } } // found a stream object, streams are allways defined inside // of a object so we will always have a dictionary (hash) that // has the length and filter definitions in it else if (nextToken.equals("stream")) { //System.out.println("Parser.getObject() stream"); deepnessCount++; // pop dictionary that defines the stream Hashtable streamHash = (Hashtable) stack.pop(); //System.out.println("Parser.getObject() stream streamHash: " + streamHash); // find the length of the stream int streamLength = library.getInt(streamHash, "Length"); //System.out.println("Parser.getObject() stream streamLength: " + streamLength); SeekableInputConstrainedWrapper streamInputWrapper; try { // a stream token's end of line marker can be either: // - a carriage return and a line feed // - just a line feed, and not by a carriage return alone. /* reader.mark(5); byte[] charBuffer = new byte[5]; reader.read(charBuffer); System.out.println("looking at " + objectReference + " " + streamHash); System.out.println("Stream bytes " + charBuffer[0] + " " + charBuffer[1] + " " + charBuffer[2] + " " + charBuffer[3] + " " + charBuffer[4]); reader.reset(); */ // check for carage return and line feed, but reset if // just a carriage return as it is a valid stream byte reader.mark(2); // alway eat a 13,against the spec but we have several examples of this. int curChar = reader.read(); if (curChar == 13) { reader.mark(1); if (reader.read() != 10) { reader.reset(); } } // always eat a 10 else if (curChar == 10) { // eat the stream character } // reset the rest else { reader.reset(); } /* reader.mark(5); charBuffer = new byte[5]; reader.read(charBuffer); System.out.println("Stream bytes " + charBuffer[0] + " " + charBuffer[1] + " " + charBuffer[2] +" " + charBuffer[3] + " " + charBuffer[4]); reader.reset(); */ if (reader instanceof SeekableInput) { SeekableInput streamDataInput = (SeekableInput) reader; long filePositionOfStreamData = streamDataInput.getAbsolutePosition(); long lengthOfStreamData; // If the stream has a length that we can currently use // such as a R that has been parsed or an integer if (streamLength > 0) { lengthOfStreamData = streamLength; streamDataInput.seekRelative(streamLength); // Read any extraneous data coming after the length, but before endstream // long skipped = skipUntilEndstream( null ); lengthOfStreamData += skipUntilEndstream(null); } else { lengthOfStreamData = captureStreamData(null); } streamInputWrapper = new SeekableInputConstrainedWrapper( streamDataInput, filePositionOfStreamData, lengthOfStreamData, false); } else { // reader is just regular InputStream (BufferedInputStream) //System.out.println("Parser.getObject() stream NOT SeekableInput"); ConservativeSizingByteArrayOutputStream out; // If the stream in from a regular InputStream, // then the PDF was probably linearly traversed, // in which case it doesn't matter if they have // specified the stream length, because we can't // trust that anyway //System.out.println("Parser.getObject() stream NOT SeekableInput linear traversal: " + library.isLinearTraversal()); if (!library.isLinearTraversal() && streamLength > 0) { byte[] buffer = new byte[streamLength]; int totalRead = 0; while (totalRead < buffer.length) { int currRead = reader.read(buffer, totalRead, buffer.length - totalRead); //System.out.println("Parser.getObject() stream NOT SeekableInput currRead: " + currRead); //String s = new String(buffer, totalRead, currRead); //System.out.println(s); if (currRead <= 0) break; totalRead += currRead; //System.out.println("Parser.getObject() stream NOT SeekableInput totalRead: " + totalRead); } out = new ConservativeSizingByteArrayOutputStream( buffer, library.memoryManager); // Read any extraneous data coming after the length, but before endstream // long skipped = skipUntilEndstream( out ); skipUntilEndstream(out); } // if stream doesn't have a length, read the stream // until end stream has been found else { //System.out.println("Parser.getObject() stream NOT SeekableInput No trusted streamLength"); out = new ConservativeSizingByteArrayOutputStream( 16 * 1024, library.memoryManager); captureStreamData(out); } int size = out.size(); out.trim(); byte[] buffer = out.relinquishByteArray(); SeekableInput streamDataInput = new SeekableByteArrayInputStream(buffer); long filePositionOfStreamData = 0L; long lengthOfStreamData = size; streamInputWrapper = new SeekableInputConstrainedWrapper( streamDataInput, filePositionOfStreamData, lengthOfStreamData, true); } } catch (IOException e) { return null; } PTrailer trailer = null; // set the stream know objects if possible Stream stream = null; //Hashtable streamHash1 = (Hashtable) stack.pop(); Name type = (Name) library.getObject(streamHash, "Type"); Name subtype = (Name) library.getObject(streamHash, "Subtype"); if (type != null) { // new Tiling Pattern Object, will have a stream. if (type.equals("Pattern")) { stream = new TilingPattern(library, streamHash, streamInputWrapper); } else if (type.equals("XRef")) { stream = new Stream(library, streamHash, streamInputWrapper); stream.init(); InputStream in = stream.getInputStreamForDecodedStreamBytes(); CrossReference xrefStream = new CrossReference(); if (in != null) { try { xrefStream.addXRefStreamEntries(library, streamHash, in); } finally { try { in.close(); } catch (IOException e) { logger.log(Level.FINE, "Error appending stream entries.", e); } } } stream.dispose(false); // XRef dict is both Trailer dict and XRef stream dict. // PTrailer alters its dict, so copy it to keep everything sane Hashtable trailerHash = (Hashtable) streamHash.clone(); trailer = new PTrailer(library, trailerHash, null, xrefStream); } else if (type.equals("ObjStm")) { stream = new ObjectStream(library, streamHash, streamInputWrapper); } } if (subtype != null) { // new form object if (subtype.equals("Form") && !"pattern".equals(type)) { stream = new Form(library, streamHash, streamInputWrapper); } } if (trailer != null) { stack.push(trailer); } else { // finally create a generic stream object which will be parsed // at a later time if (stream == null) { stream = new Stream(library, streamHash, streamInputWrapper); } stack.push(stream); } } // end if (stream) // boolean objects are added to stack else if (nextToken.equals("true")) { stack.push(new Boolean(true)); } else if (nextToken.equals("false")) { stack.push(new Boolean(false)); } // Indirect Reference object found else if (nextToken.equals("R")) { // generationNumber number important for revisions Number generationNumber = (Number) (stack.pop()); Number objectNumber = (Number) (stack.pop()); stack.push(new Reference(objectNumber, generationNumber)); } else if (nextToken.equals("[")) { deepnessCount++; stack.push(nextToken); } // Found an array else if (nextToken.equals("]")) { deepnessCount--; final int searchPosition = stack.search("["); final int size = searchPosition - 1; Vector v = new Vector(size > 0 ? size : 1); if (size > 0) v.setSize(size); if (searchPosition > 0) { for (int i = size-1; i >= 0; i--) { Object obj = stack.pop(); v.set(i, obj); } stack.pop(); // "[" } else { stack.clear(); } stack.push(v); } else if (nextToken.equals("<<")) { //System.out.println("Parser.getObject() << deepnessCount: " + deepnessCount + " -> " + (deepnessCount+1)); deepnessCount++; stack.push(nextToken); } // Found a Dictionary else if (nextToken.equals(">>")) { //System.out.println("Parser.getObject() >> deepnessCount: " + deepnessCount + " -> " + (deepnessCount-1)); deepnessCount--; Hashtable hashTable = new Hashtable(); //System.out.println("Parser.getObject() >> stack.empty: " + stack.isEmpty()); if (!stack.isEmpty()) { Object obj = stack.pop(); // put all of the dictionary definistion into the // the hashTabl while (!((obj instanceof String) && (obj.equals("<<"))) && !stack.isEmpty()) { Object key = stack.pop(); //System.out.println("Parser.getObject() >> key: " + key); //System.out.println("Parser.getObject() >> value: " + obj); hashTable.put(key, obj); if (!stack.isEmpty()) { obj = stack.pop(); } else { break; } } obj = hashTable.get("Type"); //System.out.println("Parser.getObject() >> Type: " + obj); // Process the know first level dictionaries. if (obj != null && obj instanceof Name) { Name n = (Name) obj; //System.out.println("Parser.getObject() >> Name: " + n); if (n.equals("Catalog")) { stack.push(new Catalog(library, hashTable)); } else if (n.equals("Pages")) { stack.push(new PageTree(library, hashTable)); } else if (n.equals("Page")) { stack.push(new Page(library, hashTable)); } else if (n.equals("Font")) { stack.push(FontFactory.getInstance() .getFont(library, hashTable)); } else if (n.equals("FontDescriptor")) { stack.push(new FontDescriptor(library, hashTable)); } else if (n.equals("CMap")) { stack.push(hashTable); } else if (n.equals("Annot")) { stack.push(Annotation.buildAnnotation(library, hashTable)); } else stack.push(hashTable); } // everything else gets pushed onto the stack else { //System.out.println("Parser.getObject() >> Not Name"); stack.push(hashTable); } //System.out.println("Parser.getObject() >> deepnessCount: " + deepnessCount); if (deepnessCount == 0) return stack.pop(); } } // end of if >> (dictionary // // read encryp information // if (startxrefDictionary.containsKey("Encrypt")) { // // // read ID information needed for encryption // Vector fileID = null; // if (startxrefDictionary.containsKey("ID")){ // // get the files identifier vector // fileID = (Vector)startxrefDictionary.get("ID"); // } // // // Try and find encrypt dictionary // Object encrypt = startxrefDictionary.get("Encrypt"); // System.out.println(encrypt.getClass()); // if (encrypt instanceof Reference ){ // Reference encryptReference = (Reference)encrypt; // SecurityManager securityManager = // new SecurityManager (library, // encryptReference, // fileID); // } // else if (encrypt instanceof Dictionary){ // // // } // // // initiate the security manager. // //org.icepdf.core.pobjects.security.SecurityManager.getInstance(); // } else if (nextToken.equals("xref")) { //System.out.println("xref found"); CrossReference xrefTable = new CrossReference(); xrefTable.addXRefTableEntries(this); stack.push(xrefTable); } else if (nextToken.equals("trailer")) { CrossReference xrefTable = null; if (stack.peek() instanceof CrossReference) xrefTable = (CrossReference) stack.pop(); stack.clear(); Hashtable trailerDictionary = (Hashtable) getObject(library); //System.out.println("trailer"); //System.out.println(" trailerDictionary: " + trailerDictionary); //System.out.println(" xref table: " + xrefTable); return new PTrailer(library, trailerDictionary, xrefTable, null); } // comments else if (nextToken instanceof String && ((String) nextToken).startsWith("%")) { // Comment, ignored for now } // everything else gets pushed onto the stack else { stack.push(nextToken); } if (parseMode == PARSE_MODE_OBJECT_STREAM && deepnessCount == 0 && stack.size() > 0) { return stack.pop(); } } while (!complete); } // catch (PDFSecurityException e) { // throw e; // } catch (Exception e) { logger.log(Level.FINE, "Fatal error parsing PDF file stream.", e); return null; } // return the top of the statck return stack.pop(); } /** * Utility Method for getting a PObject from the stack and adding it to the * library. The retrieved PObject has an ObjectReference added to it for * decryption purposes. * * @param library hashtable of all objects in document * @param objectReference PObjet indirect reference data * @return a valid PObject. */ public PObject addPObject(Library library, Reference objectReference) { Object o = stack.pop(); // Add the streams object reference which is needed for // decrypting encrypted streams if (o instanceof Stream) { Stream tmp = (Stream) o; tmp.setPObjectReference(objectReference); } // Add the dictionary object reference which is needed for // decrypting encrypted string contained in the dictionary else if (o instanceof Dictionary) { Dictionary tmp = (Dictionary) o; tmp.setPObjectReference(objectReference); } // the the object to the library library.addObject(o, objectReference); return new PObject(o, objectReference); } /** * Returns the next object found in a content stream. * * @return next object in the input stream * @throws java.io.IOException when the end of the <code>InputStream</code> * has been encountered. */ public Object getStreamObject() throws IOException { Object o = getToken(); if (o instanceof String) { if (o.equals("<<")) { Hashtable h = new Hashtable(); Object o1 = getStreamObject(); while (!o1.equals(">>")) { h.put(o1, getStreamObject()); o1 = getStreamObject(); } o = h; } // arrays are only used for CID mappings, the hex decoding is delayed // as a result using the CID_STREAM flag else if (o.equals("[")) { Vector v = new Vector(); Object o1 = getStreamObject(); while (!o1.equals("]")) { v.addElement(o1); o1 = getStreamObject(); } v.trimToSize(); o = v; } } //System.err.println("GET=" + o + " - " + o.getClass().getName()); return o; } /** * Utility method used to parse a valid pdf token from an DataIinputStream. * Each call to this method return one pdf token. The Reader object is * used to "mark" the location of the last "read". * * @return the next token in the pdf data stream * @throws java.io.IOException if an I/O error occurs. */ public Object getToken() throws IOException { int currentByte; char currentChar; boolean inString = false; // currently parsing a string boolean hexString = false; lastTokenHString = false; // strip all white space characters do { currentByte = reader.read(); // input stream interupted if (currentByte < 0) { throw new IOException(); } currentChar = (char) currentByte; } while (isWhitespace(currentChar)); /** * look the start of different primative pdf objects * ( - strints * [ - arrays * % - comments */ if (currentChar == '(') { // mark that we are currrently processing a string inString = true; } else if (currentChar == ']') { // fount end of an array return "]"; } else if (currentChar == '[') { // fount begining of an array return "["; } else if (currentChar == '%') { // ignore all the characters after a comment token until // we get to the end of the line StringBuilder stringBuffer = new StringBuilder(); do { stringBuffer.append(currentChar); currentByte = reader.read(); if (currentByte < 0) { // Final %%EOF might not have CR LF afterwards if (stringBuffer.length() > 0) return stringBuffer.toString(); throw new IOException(); } currentChar = (char) currentByte; } while (currentChar != 13 && currentChar != 10); // return all the text that is in the comment return stringBuffer.toString(); } // mark this location in the input stream reader.mark(1); // read the next char from the reader char nextChar = (char) reader.read(); // Check for dictionaries, start '<<' and end '>>' if (currentChar == '>' && nextChar == '>') { return ">>"; } if (currentChar == '<') { // if two "<<" then we have a dictionary if (nextChar == '<') { return "<<"; } // Otherwise we have a hex number else { inString = true; hexString = true; } } // return to the previous mark reader.reset(); // store the parsed char in the token buffer. StringBuilder stringBuffer = new StringBuilder(); stringBuffer.append(currentChar); /** * Finally parse the contents of a complex token */ int parenthesisCount = 0; boolean complete = false; // indicates that the current char should be ignored and not added to // the current string. boolean ignoreChar = false; do { // while !complete // if we are not parsing a string mark the location if (!inString) { reader.mark(1); } // PDF-215, try to sniff out missing space between tokens and numbers // in a content stream. The fix only addressed a character followed // by a number. It's legal for a /Name object to have mixed content // so we need to check for / at the start of the string. if ( !(inString || hexString) && currentChar != 'd' && currentChar > 65 && (nextChar >=48 && nextChar <= 57) && stringBuffer.charAt(0) != '/'){ reader.reset(); break; } // get the next byte and corresponding char currentByte = reader.read(); // if ther are no more bytes (-1) then we should return previous // stringBuffer value, otherwise the last grouping of tokens will // be ignored, which is very bad. if (currentByte >= 0) { currentChar = (char) currentByte; } else { return stringBuffer.toString(); } // if we are parsing a token that is a string, (...) if (inString) { if (hexString) { // found the end of a dictionary if (currentChar == '>') { complete = true; stringBuffer.append(currentChar); break; } } else { // look for embedded strings if (currentChar == '(') { parenthesisCount++; } if (currentChar == ')') { if (parenthesisCount == 0) { complete = true; stringBuffer.append(currentChar); break; } else { parenthesisCount--; } } // look for "\" character /** * The escape sequences can be as follows: * \n - line feed (LF) * \r - Carriage return (CR) * \t - Horizontal tab (HT) * \b - backspace (BS) * \f - form feed (FF) * \( - left parenthesis * \) - right parenthesis * \\ - backslash * \ddd - character code ddd (octal) * * Note: (\0053) denotes a string containing two characters, * \005 (Control-E) followed by the digit 3. */ if (currentChar == '\\') { // read next char currentChar = (char) reader.read(); // check for a digit, if so we have an octal // and we need to handle it correctly if (Character.isDigit(currentChar)) { // store the read digits StringBuilder digit = new StringBuilder(); digit.append(currentChar); // octals have a max size of 3 digits, we already // have one, so there can be up 2 more digits. for (int i = 0; i < 2; i++) { // mark the reader incase the next read is not // a digit. reader.mark(1); // read next char currentChar = (char) reader.read(); if (Character.isDigit(currentChar)) { digit.append(currentChar); } else { // back up the reader just incase // thre is only 1 or 2 digits in the octal reader.reset(); break; } } // finally convert digit to a character int charNumber = 0; try { charNumber = Integer.parseInt(digit.toString(), 8); } catch (NumberFormatException e) { logger.log(Level.FINE, "Integer parse error ", e); } // convert the interger from octal to dec. currentChar = (char) charNumber; } // do nothing else if (currentChar == '(' || currentChar == ')' || currentChar == '\\') { } // capture the horizontal tab (HT), tab character is hard // to find, only appears in files with font substitution and // as a result we ahve better luck drawing a space character. else if (currentChar == 't') { currentChar = '\t'; } // capture the carriage return (CR) else if (currentChar == 'r') { currentChar = '\r'; } // capture the line feed (LF) else if (currentChar == 'n') { currentChar = '\n'; } // capture the backspace (BS) else if (currentChar == 'b') { currentChar = '\b'; } // capture the form feed (FF) else if (currentChar == 'f') { currentChar = '\f'; } // ignor CF, which indicate a '\' lone split line token else if (currentChar == 13) { ignoreChar = true; } // otherwise report the file format error else { if (logger.isLoggable(Level.FINE)) { logger.warning("C=" + ((int) currentChar)); } } } } } // if we are not in a string definition we want to break // and return the current token, as white spaces or other elements // would mean that we are on the next token else if (isWhitespace(currentChar)) { // return stringBuffer.toString(); // we need to return the CR LR, as it is need by stream parsing if (currentByte == 13 || currentByte == 10) { reader.reset(); break; } // break on any whitespace else { // return stringBuffer.toString(); break; } } else if (isDelimiter(currentChar)) { // reset the reader so we start on this token on the next parse reader.reset(); break; } // append the current char and keep parsing if needed // IgnoreChar is set by the the line split char '\' if (!ignoreChar) { stringBuffer.append(currentChar); } // reset the ignorChar flag else { ignoreChar = false; } } while (!complete); /** * Return what we found */ // if a hex string decode it as needed if (hexString) { lastTokenHString = true; return new HexStringObject(stringBuffer); } // do a little clean up for any object that may have been missed.. // this mainly for the the document trailer information // a orphaned string if (inString) { return new LiteralStringObject(stringBuffer); } // return a new name else if (stringBuffer.charAt(0) == '/') { return new Name(stringBuffer.deleteCharAt(0)); } // if a number try and parse it else { boolean foundDigit = false; boolean foundDecimal = false; for (int i = stringBuffer.length() - 1; i >= 0; i--) { char curr = stringBuffer.charAt(i); if (curr == '.') foundDecimal = true; else if (curr >= '0' && curr <= '9') foundDigit = true; } // Only bother trying to interpret as a number if contains a digit somewhere, // to reduce NumberFormatExceptions if (foundDigit) { try { if (foundDecimal) return Float.valueOf(stringBuffer.toString()); else { return Integer.valueOf(stringBuffer.toString()); } } catch (NumberFormatException ex) { // Debug.trace("Number format exception " + ex); } } } return stringBuffer.toString(); } public Object getNumberOrStringWithMark(int maxLength) throws IOException { reader.mark(maxLength); StringBuilder sb = new StringBuilder(maxLength); boolean readNonWhitespaceYet = false; boolean foundDigit = false; boolean foundDecimal = false; for (int i = 0; i < maxLength; i++) { int curr = reader.read(); if (curr < 0) break; char currChar = (char) curr; if (isWhitespace(currChar)) { if (readNonWhitespaceYet) break; } else if (isDelimiter(currChar)) { // Number or string has delimiter immediately after it, // which we'll have to unread. // Had hoped it would be whitespace, so wouldn't have to unread reader.reset(); reader.mark(maxLength); for (int j = 0; j < i; j++) reader.read(); readNonWhitespaceYet = true; break; } else { readNonWhitespaceYet = true; if (currChar == '.') foundDecimal = true; else if (currChar >= '0' && curr <= '9') foundDigit = true; sb.append(currChar); } } // Only bother trying to interpret as a number if contains a digit somewhere, // to reduce NumberFormatExceptions if (foundDigit) { try { if (foundDecimal) return Float.valueOf(sb.toString()); else { return Integer.valueOf(sb.toString()); } } catch (NumberFormatException ex) { // Debug.trace("Number format exception " + ex); } } if (sb.length() > 0) return sb.toString(); return null; } public void ungetNumberOrStringWithReset() throws IOException { reader.reset(); } public int getIntSurroundedByWhitespace() { int num = 0; boolean makeNegative = false; boolean readNonWhitespace = false; try { while (true) { int curr = reader.read(); if (curr < 0) break; if (Character.isWhitespace((char) curr)) { if (readNonWhitespace) break; } else if (curr == '-') { makeNegative = true; readNonWhitespace = true; } else if (curr >= '0' && curr <= '9') { num *= 10; num += (curr - '0'); readNonWhitespace = true; } } } catch (IOException e) { logger.log(Level.FINE, "Error detecting int.", e); } if (makeNegative) num = num * -1; return num; } public long getLongSurroundedByWhitespace() { long num = 0L; boolean makeNegative = false; boolean readNonWhitespace = false; try { while (true) { int curr = reader.read(); if (curr < 0) break; if (Character.isWhitespace((char) curr)) { if (readNonWhitespace) break; } else if (curr == '-') { makeNegative = true; readNonWhitespace = true; } else if (curr >= '0' && curr <= '9') { num *= 10L; num += ((long) (curr - '0')); readNonWhitespace = true; } } } catch (IOException e) { logger.log(Level.FINE, "Error detecting long.", e); } if (makeNegative) num = num * -1L; return num; } public char getCharSurroundedByWhitespace() { char alpha = 0; try { while (true) { int curr = reader.read(); if (curr < 0) break; char c = (char) curr; if (!Character.isWhitespace(c)) { alpha = c; break; } } } catch (IOException e) { logger.log(Level.FINE, "Error detecting char.", e); } return alpha; } int hexToInt(String hex) { hex = hex.substring(1, hex.length() - 1).toUpperCase(); return Integer.parseInt(hex, 16 /* radix */); } /** * @param hh */ String hexToString(String hh) { hh = hh.substring(1, hh.length() - 1).toUpperCase(); StringBuilder sb = new StringBuilder(); if (hh.charAt(0) == 'F' && hh.charAt(1) == 'E' && hh.charAt(2) == 'F' && hh.charAt(3) == 'F') { byte b[] = new byte[4]; for (int i = 1; i < hh.length() / 4; i++) { b[0] = (byte) hh.charAt(i * 4); b[1] = (byte) hh.charAt(i * 4 + 1); b[2] = (byte) hh.charAt(i * 4 + 2); b[3] = (byte) hh.charAt(i * 4 + 3); sb.append((char) Integer.parseInt(new String(b), 16)); } } else { byte b[] = new byte[2]; for (int i = 0; i < hh.length() / 2; i++) { try { b[0] = (byte) hh.charAt(i * 2); b[1] = (byte) hh.charAt(i * 2 + 1); sb.append((char) Short.parseShort(new String(b), 16)); } catch (Exception e) { } } } return sb.toString(); } /** * @return true if ate the ending EI delimiter * @throws java.io.IOException */ boolean readLineForInlineImage(OutputStream out) throws IOException { // The encoder might not have put EI on its own line (as it should), // but might just put it right after the data final int STATE_PRE_E = 0; final int STATE_PRE_I = 1; final int STATE_PRE_WHITESPACE = 2; int state = STATE_PRE_E; while (true) { int c = reader.read(); if (c < 0) break; if (state == STATE_PRE_E && c == 'E') { state++; continue; } else if (state == STATE_PRE_I && c == 'I') { state++; continue; } else if (state == STATE_PRE_WHITESPACE && isWhitespace((char) (0xFF & c))) { // It's hard to tell if the EI + whitespace is part of the // image data or not, given that many PDFs are mis-encoded, // and don't give whitespace when necessary. So, instead of // assuming the need for whitespace, we're going to assume // that this is the real EI, and apply a heuristic to prove // ourselves wrong. boolean imageDataFound = isStillInlineImageData(reader, 32); if (imageDataFound) { out.write('E'); out.write('I'); out.write(c); state = STATE_PRE_E; if (c == '\r' || c == '\n') { break; } } else return true; } else { // If we got a fragment of the EI<whitespace> sequence, then we withheld // what we had so far. But if we're here, that fragment was incomplete, // so that was actual embedded data, and not the delimiter, so we have // to write it out. if (state > STATE_PRE_E) out.write('E'); if (state > STATE_PRE_I) out.write('I'); state = STATE_PRE_E; out.write((byte) c); if (c == '\r' || c == '\n') { break; } } } // If the input ends right after the EI, but with no whitespace, // then we're still done if (state == STATE_PRE_WHITESPACE) return true; return false; } /** * @return * @throws java.io.IOException */ byte readByte() throws IOException { //return reader.readByte(); return (byte) reader.read(); } /** * White space characters defined by ' ', '\t', '\r', '\n', '\f' * * @param c */ public static final boolean isWhitespace(char c) { return ((c == ' ') || (c == '\t') || (c == '\r') || (c == '\n') || (c == '\f')); } private static final boolean isDelimiter(char c) { return ((c == '[') || (c == ']') || (c == '(') || (c == ')') || (c == '<') || (c == '>') || (c == '{') || (c == '}') || (c == '/') || (c == '%')); } /** * This is not necessarily an exhaustive list of characters one would * expect in a Content Stream, it's a heuristic for whether the data * might still be part of an inline image, or the lattercontent stream */ private static boolean isExpectedInContentStream(char c) { return ((c >= 'a' && c <= 'Z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || isWhitespace(c) || isDelimiter(c) || (c == '\\') || (c == '\'') || (c == '\"') || (c == '*') || (c == '.')); } /** * We want to be conservative in deciding that we're still in the inline * image, since we haven't found any of these cases before now. */ private static boolean isStillInlineImageData( InputStream reader, int numBytesToCheck) throws IOException { boolean imageDataFound = false; boolean onlyWhitespaceSoFar = true; reader.mark(numBytesToCheck); byte[] toCheck = new byte[numBytesToCheck]; int numReadToCheck = reader.read(toCheck); for (int i = 0; i < numReadToCheck; i++) { char charToCheck = (char) (((int) toCheck[i]) & 0xFF); // If the very first thing we read is a Q or S token boolean typicalTextTokenInContentStream = (charToCheck == 'Q' || charToCheck == 'q' || charToCheck == 'S' || charToCheck == 's'); if (onlyWhitespaceSoFar && typicalTextTokenInContentStream && (i + 1 < numReadToCheck) && isWhitespace((char) (((int) toCheck[i + 1]) & 0xFF))) { break; } if (!isWhitespace(charToCheck)) onlyWhitespaceSoFar = false; // If we find some binary image data if (!isExpectedInContentStream(charToCheck)) { imageDataFound = true; break; } } reader.reset(); return imageDataFound; } /** * @return * @throws java.io.IOException */ String peek2() throws IOException { reader.mark(2); char c[] = new char[2]; c[0] = (char) reader.read(); c[1] = (char) reader.read(); String s = new String(c); reader.reset(); return s; } private long captureStreamData(OutputStream out) throws IOException { long numBytes = 0; while (true) { // read bytes int nextByte = reader.read(); // look to see if we have the ending tag if (nextByte == 'e') { reader.mark(10); if (reader.read() == 'n' && reader.read() == 'd' && reader.read() == 's' && reader.read() == 't' && reader.read() == 'r' && reader.read() == 'e' && reader.read() == 'a' && reader.read() == 'm') { break; } else { reader.reset(); } } else if (nextByte < 0) break; // write the bytes if (out != null) out.write(nextByte); numBytes++; } return numBytes; } private long skipUntilEndstream(OutputStream out) throws IOException { long skipped = 0L; while (true) { reader.mark(10); // read bytes int nextByte = reader.read(); if (nextByte == 'e' && reader.read() == 'n' && reader.read() == 'd' && reader.read() == 's' && reader.read() == 't' && reader.read() == 'r' && reader.read() == 'e' && reader.read() == 'a' && reader.read() == 'm') { reader.reset(); break; } else if (nextByte < 0) break; else { if (nextByte == 0x0A || nextByte == 0x0D || nextByte == 0x20) continue; if (out != null) out.write(nextByte); } skipped++; } return skipped; } }