/* * Copyright 2006-2017 ICEsoft Technologies Canada Corp. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the * License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an "AS * IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either * express or implied. See the License for the specific language * governing permissions and limitations under the License. */ package org.icepdf.core.util; import org.icepdf.core.exceptions.PDFException; import org.icepdf.core.io.*; import org.icepdf.core.pobjects.*; import org.icepdf.core.pobjects.annotations.Annotation; import org.icepdf.core.pobjects.fonts.CMap; import org.icepdf.core.pobjects.fonts.Font; import org.icepdf.core.pobjects.fonts.FontDescriptor; import org.icepdf.core.pobjects.fonts.FontFactory; import org.icepdf.core.pobjects.graphics.TilingPattern; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Stack; import java.util.logging.Level; import java.util.logging.Logger; /** * put your documentation comment here */ public class Parser { private static final Logger logger = Logger.getLogger(Parser.class.toString()); public static final int PARSE_MODE_NORMAL = 0; public static final int PARSE_MODE_OBJECT_STREAM = 1; // InputStream has to support mark(), reset(), and markSupported() // DO NOT close this, since we have two cases: read everything up front, and progressive reads // private BufferedMarkedInputStream reader; private InputStream reader; boolean lastTokenHString = false; private Stack<Object> stack = new Stack<Object>(); private int parseMode; private boolean isTrailer; private int linearTraversalOffset; public Parser(SeekableInput r) { this(r, PARSE_MODE_NORMAL); } public Parser(SeekableInput r, int pm) { // reader = new BufferedMarkedInputStream(r.getInputStream()); reader = r.getInputStream(); parseMode = pm; } public Parser(InputStream r) { this(r, PARSE_MODE_NORMAL); } public Parser(InputStream r, int pm) { reader = new BufferedMarkedInputStream(r); parseMode = pm; } /** * Get an object from the pdf input DataInputStream. * * @param library all found objects in the pdf document * @return the next object in the DataInputStream. Null is returned * if there are no more objects left in the DataInputStream or * a I/O error is encountered. * @throws PDFException error getting object from library */ public Object getObject(Library library) throws PDFException { int deepnessCount = 0; boolean inObject = false; // currently parsing tokens in an object boolean complete = false; // flag used for do loop. Object nextToken; Reference objectReference = null; try { reader.mark(1); // capture the byte offset of this object so we can rebuild // the cross reference entries for lazy loading after CG. if (library.isLinearTraversal() && reader instanceof BufferedMarkedInputStream) { linearTraversalOffset = ((BufferedMarkedInputStream) reader).getMarkedPosition(); } do { //while (!complete); // keep track of currently parsed objects reference // get the next token inside the object stream try { nextToken = getToken(); // commented out for performance reasons //Thread.yield(); } catch (IOException e) { // eat it as it is what is expected // logger.warning("IO reading error."); return null; } // check for specific primative object types returned by getToken() if (nextToken instanceof StringObject || nextToken instanceof Name || nextToken instanceof Number) { // Very Important, store the PDF object reference information, // as it is needed when to decrypt an encrypted string. if (nextToken instanceof StringObject) { StringObject tmp = (StringObject) nextToken; tmp.setReference(objectReference); } stack.push(nextToken); } // mark that we have entered a object declaration else if (nextToken.equals("obj")) { // a rare parsing error is that endobj is missing, so we need // to make sure if an object has been parsed that we don't loose it. if (inObject) { // pop off the object and ref number stack.pop(); stack.pop(); // return the passed over object on the stack. return addPObject(library, objectReference); } // Since we can return objects on "endstream", then we can // leave straggling "endobj", which would deepnessCount--, // even though they're done in a separate method invocation // Hence, "obj" does /deepnessCount = 1/ instead of /deepnessCount++/ deepnessCount = 0; inObject = true; Number generationNumber = (Number) (stack.pop()); Number objectNumber = (Number) (stack.pop()); objectReference = new Reference(objectNumber, generationNumber); } // mark that we have reached the end of the object else if (nextToken.equals("endobj") || nextToken.equals("endobject") || nextToken.equals("enbobj")) { if (inObject) { // set flag to false, as we are done parsing an Object inObject = false; // return PObject, return addPObject(library, objectReference); // else, we ignore as the endStream token also returns a // PObject. } else { // return null; } } // found endstream object, we will return the PObject containing // the stream as there can be no further tokens. This addresses // an incorrect a syntax error with OpenOffice document where // the endobj tag is missing on some Stream objects. else if (nextToken.equals("endstream")) { deepnessCount--; // do nothing, but don't add it to the stack if (inObject) { inObject = false; // return PObject, return addPObject(library, objectReference); } } // found a stream object, streams are allways defined inside // of a object so we will always have a dictionary (hash) that // has the length and filter definitions in it else if (nextToken.equals("stream")) { deepnessCount++; // pop dictionary that defines the stream Object tmp = stack.pop(); HashMap streamHash; if (tmp instanceof Dictionary) { streamHash = ((Dictionary) tmp).getEntries(); } else { streamHash = (HashMap) tmp; } // find the length of the stream int streamLength = library.getInt(streamHash, Dictionary.LENGTH_KEY); SeekableInputConstrainedWrapper streamInputWrapper; try { // a stream token's end of line marker can be either: // - a carriage return and a line feed // - just a line feed, and not by a carriage return alone. // check for carriage return and line feed, but reset if // just a carriage return as it is a valid stream byte reader.mark(2); // alway eat a 13,against the spec but we have several examples of this. int curChar = reader.read(); if (curChar == 13) { reader.mark(1); if (reader.read() != 10) { reader.reset(); } } // always eat a 10 else if (curChar == 10) { // eat the stream character } // reset the rest else { reader.reset(); } if (reader instanceof SeekableInput) { SeekableInput streamDataInput = (SeekableInput) reader; long filePositionOfStreamData = streamDataInput.getAbsolutePosition(); long lengthOfStreamData; // If the stream has a length that we can currently use // such as a R that has been parsed or an integer if (streamLength > 0) { lengthOfStreamData = streamLength; streamDataInput.seekRelative(streamLength); // Read any extraneous data coming after the length, but before endstream lengthOfStreamData += skipUntilEndstream(null); } else { lengthOfStreamData = captureStreamData(null); } streamInputWrapper = new SeekableInputConstrainedWrapper( streamDataInput, filePositionOfStreamData, lengthOfStreamData); } else { // reader is just regular InputStream (BufferedInputStream) // stream NOT SeekableInput ConservativeSizingByteArrayOutputStream out; // If the stream in from a regular InputStream, // then the PDF was probably linearly traversed, // in which case it doesn't matter if they have // specified the stream length, because we can't // trust that anyway if (!library.isLinearTraversal() && streamLength > 0) { byte[] buffer = new byte[streamLength]; int totalRead = 0; while (totalRead < buffer.length) { int currRead = reader.read(buffer, totalRead, buffer.length - totalRead); if (currRead <= 0) break; totalRead += currRead; } out = new ConservativeSizingByteArrayOutputStream( buffer); // Read any extraneous data coming after the length, but before endstream skipUntilEndstream(out); } // if stream doesn't have a length, read the stream // until end stream has been found else { // stream NOT SeekableInput No trusted streamLength"); out = new ConservativeSizingByteArrayOutputStream( 16 * 1024); captureStreamData(out); } int size = out.size(); out.trim(); byte[] buffer = out.relinquishByteArray(); SeekableInput streamDataInput = new SeekableByteArrayInputStream(buffer); long filePositionOfStreamData = 0L; streamInputWrapper = new SeekableInputConstrainedWrapper( streamDataInput, filePositionOfStreamData, size); } } catch (IOException e) { if (logger.isLoggable(Level.FINE)) { logger.log(Level.FINE, "Error getting next object", e); } return null; } PTrailer trailer = null; // set the stream know objects if possible Stream stream = null; Name type = (Name) library.getObject(streamHash, Dictionary.TYPE_KEY); Name subtype = (Name) library.getObject(streamHash, Dictionary.SUBTYPE_KEY); if (type != null) { // found a xref stream which is made up it's own entry format // different then an standard xref table, mainly used to // access cross-reference entries but also to compress xref tables. if (type.equals("XRef")) { stream = new Stream(library, streamHash, streamInputWrapper); stream.init(); InputStream in = stream.getDecodedByteArrayInputStream(); CrossReference xrefStream = new CrossReference(); if (in != null) { try { xrefStream.addXRefStreamEntries(library, streamHash, in); } finally { try { in.close(); } catch (Throwable e) { logger.log(Level.WARNING, "Error appending stream entries.", e); } } } // XRef dict is both Trailer dict and XRef stream dict. // PTrailer alters its dict, so copy it to keep everything sane HashMap trailerHash = (HashMap) streamHash.clone(); trailer = new PTrailer(library, trailerHash, null, xrefStream); } else if (type.equals("ObjStm")) { stream = new ObjectStream(library, streamHash, streamInputWrapper); } else if (type.equals("XObject") && subtype.equals("Image")) { stream = new ImageStream(library, streamHash, streamInputWrapper); } // new Tiling Pattern Object, will have a stream. else if (type.equals("Pattern")) { stream = new TilingPattern(library, streamHash, streamInputWrapper); } } if (stream == null && subtype != null) { // new form object if (subtype.equals("Image")) { stream = new ImageStream(library, streamHash, streamInputWrapper); } else if (subtype.equals("Form") && !"Pattern".equals(type)) { stream = new Form(library, streamHash, streamInputWrapper); } else if (subtype.equals("Form") && "Pattern".equals(type)) { stream = new TilingPattern(library, streamHash, streamInputWrapper); } } if (trailer != null) { stack.push(trailer); } else { // finally create a generic stream object which will be parsed // at a later time if (stream == null) { stream = new Stream(library, streamHash, streamInputWrapper); } stack.push(stream); // forcing a object return just encase the length is wrong // and we don't get to the endstream. return addPObject(library, objectReference); } } // end if (stream) // boolean objects are added to stack else if (nextToken.equals("true")) { stack.push(true); } else if (nextToken.equals("false")) { stack.push(false); } // Indirect Reference object found else if (nextToken.equals("R")) { // generationNumber number important for revisions Number generationNumber = (Number) (stack.pop()); Number objectNumber = (Number) (stack.pop()); stack.push(new Reference(objectNumber, generationNumber)); } else if (nextToken.equals("[")) { deepnessCount++; stack.push(nextToken); } // Found an array else if (nextToken.equals("]")) { deepnessCount--; final int searchPosition = stack.search("["); int size = searchPosition - 1; if (size < 0) { logger.warning("Negative array size, a malformed content " + "stream has likely been encountered."); size = 0; } List<Object> v = new ArrayList<Object>(size); Object[] tmp = new Object[size]; if (searchPosition > 0) { for (int i = size - 1; i >= 0; i--) { tmp[i] = stack.pop(); } // we need a mutable array so copy into an arrayList // so we can't use Arrays.asList(). for (int i = 0; i < size; i++) { v.add(tmp[i]); } stack.pop(); // "[" } else { stack.clear(); } stack.push(v); } else if (nextToken.equals("<<")) { deepnessCount++; stack.push(nextToken); } // Found a Dictionary else if (nextToken.equals(">>")) { deepnessCount--; // check for extra >> which we want to ignore if (!isTrailer && deepnessCount >= 0) { if (!stack.isEmpty()) { HashMap<Object, Object> hashMap = new HashMap<Object, Object>(); Object obj = stack.pop(); // put all of the dictionary definistion into the // the hashTabl while (!((obj instanceof String) && (obj.equals("<<"))) && !stack.isEmpty()) { Object key = stack.pop(); hashMap.put(key, obj); if (!stack.isEmpty()) { obj = stack.pop(); } else { break; } } obj = hashMap.get(Dictionary.TYPE_KEY); if (obj == null) { // PDF-927, incorrect /type def. obj = hashMap.get(new Name("type")); } // Process the know first level dictionaries. if (obj != null && obj instanceof Name) { Name n = (Name) obj; if (n.equals(Catalog.TYPE)) { stack.push(new Catalog(library, hashMap)); } else if (n.equals(PageTree.TYPE)) { stack.push(new PageTree(library, hashMap)); } else if (n.equals(Page.TYPE)) { stack.push(new Page(library, hashMap)); } else if (n.equals(Font.TYPE)) { // do a quick check to make sure we don't have a fontDescriptor // FontFile is specific to font descriptors. boolean fontDescriptor = hashMap.get(FontDescriptor.FONT_FILE) != null || hashMap.get(FontDescriptor.FONT_FILE_2) != null || hashMap.get(FontDescriptor.FONT_FILE_3) != null; if (!fontDescriptor) { stack.push(FontFactory.getInstance() .getFont(library, hashMap)); } else { stack.push(new FontDescriptor(library, hashMap)); } } else if (n.equals(FontDescriptor.TYPE)) { stack.push(new FontDescriptor(library, hashMap)); } else if (n.equals(CMap.TYPE)) { stack.push(hashMap); } else if (n.equals(Annotation.TYPE)) { stack.push(Annotation.buildAnnotation(library, hashMap)); } else if (n.equals(OptionalContentGroup.TYPE)) { stack.push(new OptionalContentGroup(library, hashMap)); } else if (n.equals(OptionalContentMembership.TYPE)) { stack.push(new OptionalContentMembership(library, hashMap)); } else stack.push(hashMap); } // everything else gets pushed onto the stack else { stack.push(hashMap); } } } else if (isTrailer && deepnessCount == 0) { // we have an xref entry HashMap<Object, Object> hashMap = new HashMap<Object, Object>(); Object obj = stack.pop(); // put all of the dictionary definition into the // the new map. while (!((obj instanceof String) && (obj.equals("<<"))) && !stack.isEmpty()) { Object key = stack.pop(); hashMap.put(key, obj); if (!stack.isEmpty()) { obj = stack.pop(); } else { break; } } return hashMap; } } // found traditional XrefTable found in all documents. else if (nextToken.equals("xref")) { // parse out hte traditional CrossReference xrefTable = new CrossReference(); xrefTable.addXRefTableEntries(this); stack.push(xrefTable); } else if (nextToken.equals("trailer")) { CrossReference xrefTable = null; if (stack.peek() instanceof CrossReference) xrefTable = (CrossReference) stack.pop(); stack.clear(); isTrailer = true; HashMap trailerDictionary = (HashMap) getObject(library); isTrailer = false; return new PTrailer(library, trailerDictionary, xrefTable, null); } // comments else if (nextToken instanceof String && ((String) nextToken).startsWith("%")) { // Comment, ignored for now } // corner case for encoder error "endobjxref" else if (nextToken instanceof String && ((String) nextToken).startsWith("endobj")) { if (inObject) { // set flag to false, as we are done parsing an Object inObject = false; // return PObject, return addPObject(library, objectReference); } } // everything else gets pushed onto the stack else { stack.push(nextToken); } if (parseMode == PARSE_MODE_OBJECT_STREAM && deepnessCount == 0 && stack.size() > 0) { return stack.pop(); } } while (!complete); } catch (Exception e) { logger.log(Level.WARNING, "Fatal error parsing PDF file stream.", e); return null; } // return the top of the stack return stack.pop(); } /** * */ public String peek2() throws IOException { reader.mark(2); char c[] = new char[2]; c[0] = (char) reader.read(); c[1] = (char) reader.read(); String s = new String(c); reader.reset(); return s; } /** * @return true if ate the ending EI delimiter * @throws java.io.IOException */ public boolean readLineForInlineImage(OutputStream out) throws IOException { // The encoder might not have put EI on its own line (as it should), // but might just put it right after the data final int STATE_PRE_E = 0; final int STATE_PRE_I = 1; final int STATE_PRE_WHITESPACE = 2; int state = STATE_PRE_E; while (true) { int c = reader.read(); if (c < 0) break; if (state == STATE_PRE_E && c == 'E') { state++; } else if (state == STATE_PRE_I && c == 'I') { state++; } else if (state == STATE_PRE_WHITESPACE && isWhitespace((char) (0xFF & c))) { // It's hard to tell if the EI + whitespace is part of the // image data or not, given that many PDFs are mis-encoded, // and don't give whitespace when necessary. So, instead of // assuming the need for whitespace, we're going to assume // that this is the real EI, and apply a heuristic to prove // ourselves wrong. boolean imageDataFound = isStillInlineImageData(reader, 32); if (imageDataFound) { out.write('E'); out.write('I'); out.write(c); state = STATE_PRE_E; if (c == '\r' || c == '\n') { break; } } else return true; } else { // If we got a fragment of the EI<whitespace> sequence, then we withheld // what we had so far. But if we're here, that fragment was incomplete, // so that was actual embedded data, and not the delimiter, so we have // to write it out. if (state > STATE_PRE_E) out.write('E'); if (state > STATE_PRE_I) out.write('I'); state = STATE_PRE_E; out.write((byte) c); if (c == '\r' || c == '\n') { break; } } } // If the input ends right after the EI, but with no whitespace, // then we're still done return state == STATE_PRE_WHITESPACE; } /** * We want to be conservative in deciding that we're still in the inline * image, since we haven't found any of these cases before now. */ private static boolean isStillInlineImageData( InputStream reader, int numBytesToCheck) throws IOException { boolean imageDataFound = false; boolean onlyWhitespaceSoFar = true; reader.mark(numBytesToCheck); byte[] toCheck = new byte[numBytesToCheck]; int numReadToCheck = reader.read(toCheck); for (int i = 0; i < numReadToCheck; i++) { char charToCheck = (char) (((int) toCheck[i]) & 0xFF); // If the very first thing we read is a Q or S token boolean typicalTextTokenInContentStream = (charToCheck == 'Q' || charToCheck == 'q' || charToCheck == 'S' || charToCheck == 's'); if (onlyWhitespaceSoFar && typicalTextTokenInContentStream && (i + 1 < numReadToCheck) && isWhitespace((char) (((int) toCheck[i + 1]) & 0xFF))) { break; } if (!isWhitespace(charToCheck)) onlyWhitespaceSoFar = false; // If we find some binary image data if (!isExpectedInContentStream(charToCheck)) { imageDataFound = true; break; } } reader.reset(); return imageDataFound; } /** * This is not necessarily an exhaustive list of characters one would * expect in a Content Stream, it's a heuristic for whether the data * might still be part of an inline image, or the lattercontent stream */ private static boolean isExpectedInContentStream(char c) { return ((c >= 'a' && c <= 'Z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || isWhitespace(c) || isDelimiter(c) || (c == '\\') || (c == '\'') || (c == '\"') || (c == '*') || (c == '.')); } /** * Utility Method for getting a PObject from the stack and adding it to the * library. The retrieved PObject has an ObjectReference added to it for * decryption purposes. * * @param library HashMap of all objects in document * @param objectReference PObjet indirect reference data * @return a valid PObject. */ public PObject addPObject(Library library, Reference objectReference) { Object o = stack.pop(); // Add the streams object reference which is needed for // decrypting encrypted streams if (o instanceof Stream) { Stream tmp = (Stream) o; tmp.setPObjectReference(objectReference); } // Add the dictionary object reference which is needed for // decrypting encrypted string contained in the dictionary else if (o instanceof Dictionary) { Dictionary tmp = (Dictionary) o; tmp.setPObjectReference(objectReference); } // the the object to the library library.addObject(o, objectReference); return new PObject(o, objectReference); } /** * Returns the next object found in a content stream. * * @return next object in the input stream * @throws java.io.IOException when the end of the <code>InputStream</code> * has been encountered. */ public Object getStreamObject() throws IOException { Object o = getToken(); if (o instanceof String) { if (o.equals("<<")) { HashMap<Object, Object> h = new HashMap<Object, Object>(); Object o1 = getStreamObject(); while (!o1.equals(">>")) { h.put(o1, getStreamObject()); o1 = getStreamObject(); } o = h; } // arrays are only used for CID mappings, the hex decoding is delayed // as a result using the CID_STREAM flag else if (o.equals("[")) { List<Object> v = new ArrayList<Object>(); Object o1 = getStreamObject(); while (!o1.equals("]")) { v.add(o1); o1 = getStreamObject(); } o = v; } } return o; } /** * Utility method used to parse a valid pdf token from an DataIinputStream. * Each call to this method return one pdf token. The Reader object is * used to "mark" the location of the last "read". * * @return the next token in the pdf data stream * @throws java.io.IOException if an I/O error occurs. */ public Object getToken() throws IOException { int currentByte; char currentChar; boolean inString = false; // currently parsing a string boolean hexString = false; boolean inNumber = false; lastTokenHString = false; // strip all white space characters do { currentByte = reader.read(); // input stream interrupted if (currentByte < 0) { throw new IOException(); } currentChar = (char) currentByte; } while (isWhitespace(currentChar)); /** * look the start of different primitive pdf objects * ( - strints * [ - arrays * % - comments * numbers. */ if (currentChar == '(') { // mark that we are currrently processing a string inString = true; } else if (currentChar == ']') { // fount end of an array return "]"; } else if (currentChar == '[') { // fount begining of an array return "["; } else if (currentChar == '%') { // ignore all the characters after a comment token until // we get to the end of the line StringBuilder stringBuffer = new StringBuilder(); do { stringBuffer.append(currentChar); currentByte = reader.read(); if (currentByte < 0) { // Final %%EOF might not have CR LF afterwards if (stringBuffer.length() > 0) return stringBuffer.toString(); throw new IOException(); } currentChar = (char) currentByte; } while (currentChar != 13 && currentChar != 10); // return all the text that is in the comment return stringBuffer.toString(); } else if ((currentChar >= '0' && currentChar <= '9') || currentChar == '-' || currentChar == '+' || currentChar == '.') { inNumber = true; } // mark this location in the input stream reader.mark(1); // read the next char from the reader char nextChar = (char) reader.read(); // Check for dictionaries, start '<<' and end '>>' if (currentChar == '>' && nextChar == '>') { return ">>"; } if (currentChar == '<') { // if two "<<" then we have a dictionary if (nextChar == '<') { return "<<"; } // Otherwise we have a hex number else { inString = true; hexString = true; } } // return to the previous mark reader.reset(); // store the parsed char in the token buffer. StringBuilder stringBuffer = new StringBuilder(); stringBuffer.append(currentChar); /** * Finally parse the contents of a complex token */ int parenthesisCount = 0; boolean complete = false; // indicates that the current char should be ignored and not added to // the current string. boolean ignoreChar = false; do { // while !complete // if we are not parsing a string mark the location if (!inString) { reader.mark(1); } // get the next byte and corresponding char currentByte = reader.read(); if (currentByte >= 0) { currentChar = (char) currentByte; } else { // if there are no more bytes (-1) then we must have reached the end of this token, // though maybe without appropriate termination of a string object. We'll just treat // them as if they were. break; } // if we are parsing a token that is a string, (...) if (inString) { if (hexString) { // found the end of a dictionary if (currentChar == '>') { stringBuffer.append(currentChar); break; } } else { // look for embedded strings if (currentChar == '(') { parenthesisCount++; } if (currentChar == ')') { if (parenthesisCount == 0) { stringBuffer.append(currentChar); break; } else { parenthesisCount--; } } // look for "\" character /** * The escape sequences can be as follows: * \n - line feed (LF) * \r - Carriage return (CR) * \t - Horizontal tab (HT) * \b - backspace (BS) * \f - form feed (FF) * \( - left parenthesis * \) - right parenthesis * \\ - backslash * \ddd - character code ddd (octal) * * Note: (\0053) denotes a string containing two characters, * \005 (Control-E) followed by the digit 3. */ if (currentChar == '\\') { // read next char currentChar = (char) reader.read(); // check for a digit, if so we have an octal // and we need to handle it correctly if (Character.isDigit(currentChar)) { // store the read digits StringBuilder digit = new StringBuilder(); digit.append(currentChar); // octals have a max size of 3 digits, we already // have one, so there can be up 2 more digits. for (int i = 0; i < 2; i++) { // mark the reader incase the next read is not // a digit. reader.mark(1); // read next char currentChar = (char) reader.read(); if (Character.isDigit(currentChar)) { digit.append(currentChar); } else { // back up the reader just incase // thre is only 1 or 2 digits in the octal reader.reset(); break; } } // finally convert digit to a character int charNumber = 0; try { charNumber = Integer.parseInt(digit.toString(), 8); } catch (NumberFormatException e) { logger.log(Level.FINE, "Integer parse error ", e); } // convert the interger from octal to dec. currentChar = (char) charNumber; } // do nothing else if (currentChar == '(' || currentChar == ')' || currentChar == '\\') { // do nothing } // capture the horizontal tab (HT), tab character is hard // to find, only appears in files with font substitution and // as a result we ahve better luck drawing a space character. else if (currentChar == 't') { currentChar = '\t'; } // capture the carriage return (CR) else if (currentChar == 'r') { currentChar = '\r'; } // capture the line feed (LF) else if (currentChar == 'n') { currentChar = '\n'; } // capture the backspace (BS) else if (currentChar == 'b') { currentChar = '\b'; } // capture the form feed (FF) else if (currentChar == 'f') { currentChar = '\f'; } // ignor CF, which indicate a '\' lone split line token else if (currentChar == 13) { ignoreChar = true; } // otherwise report the file format error else { if (logger.isLoggable(Level.FINE)) { logger.warning("C=" + ((int) currentChar)); } } } } } // if we are not in a string definition we want to break // and return the current token, as white spaces or other elements // would mean that we are on the next token else if (isWhitespace(currentChar)) { // we need to return the CR LR, as it is need by stream parsing if (currentByte == 13 || currentByte == 10) { reader.reset(); break; } // break on any whitespace else { // return stringBuffer.toString(); break; } } else if (isDelimiter(currentChar)) { // reset the reader so we start on this token on the next parse reader.reset(); break; } // append the current char and keep parsing if needed // IgnoreChar is set by the the line split char '\' if (!ignoreChar) { if (inString) { stringBuffer.append(currentChar); } // eat any junk characters else if (currentChar < 128) { stringBuffer.append(currentChar); } } // reset the ignorChar flag else { ignoreChar = false; } } while (!complete); /** * Return what we found */ // if a hex string decode it as needed if (hexString) { lastTokenHString = true; return new HexStringObject(stringBuffer); } // do a little clean up for any object that may have been missed.. // this mainly for the the document trailer information // a orphaned string if (inString) { return new LiteralStringObject(stringBuffer); } // return a new name else if (stringBuffer.charAt(0) == '/') { return new Name(stringBuffer.deleteCharAt(0)); } // if a number try and parse it else if (inNumber) { return getNumber(stringBuffer); } return stringBuffer.toString(); } public Object getNumberOrStringWithMark(int maxLength) throws IOException { reader.mark(maxLength); StringBuilder sb = new StringBuilder(maxLength); boolean readNonWhitespaceYet = false; boolean foundDigit = false; boolean foundDecimal = false; for (int i = 0; i < maxLength; i++) { int curr = reader.read(); if (curr < 0) break; char currChar = (char) curr; if (isWhitespace(currChar)) { if (readNonWhitespaceYet) break; } else if (isDelimiter(currChar)) { // Number or string has delimiter immediately after it, // which we'll have to unread. // Had hoped it would be whitespace, so wouldn't have to unread reader.reset(); reader.mark(maxLength); for (int j = 0; j < i; j++) { reader.read(); } readNonWhitespaceYet = true; break; } else { readNonWhitespaceYet = true; if (currChar == '.') foundDecimal = true; else if (currChar >= '0' && curr <= '9') foundDigit = true; sb.append(currChar); } } // Only bother trying to interpret as a number if contains a digit somewhere, // to reduce NumberFormatExceptions if (foundDigit) { return getNumber(sb); } if (sb.length() > 0) return sb.toString(); return null; } public void ungetNumberOrStringWithReset() throws IOException { reader.reset(); } public int getIntSurroundedByWhitespace() { int num = 0; boolean makeNegative = false; boolean readNonWhitespace = false; try { while (true) { int curr = reader.read(); if (curr < 0) break; if (Character.isWhitespace((char) curr)) { if (readNonWhitespace) break; } else if (curr == '-') { makeNegative = true; readNonWhitespace = true; } else if (curr >= '0' && curr <= '9') { num *= 10; num += (curr - '0'); readNonWhitespace = true; } else { // break as we've hit a none digit and should bail break; } } } catch (IOException e) { logger.log(Level.FINE, "Error detecting int.", e); } if (makeNegative) num = num * -1; return num; } public Number getNumber(StringBuilder value) { int digit = 0; float decimal = 0; float divisor = 10; boolean isDigit; boolean isDecimal = false; byte[] streamBytes = value.toString().getBytes(); int startTokenPos = 0; boolean singed = streamBytes[startTokenPos] == '-'; boolean positive = streamBytes[startTokenPos] == '+'; startTokenPos = singed || positive ? startTokenPos + 1 : startTokenPos; // check for double sign, thanks oracle forms! if (singed && streamBytes[startTokenPos] == '-') { startTokenPos++; } int current; for (int i = startTokenPos, max = streamBytes.length; i < max; i++) { current = streamBytes[i] - 48; isDigit = streamBytes[i] >= 48 && streamBytes[i] <= 57; if (!isDecimal && isDigit) { digit = (digit * 10) + current; } else if (isDecimal && isDigit) { decimal += (current / divisor); divisor *= 10; } else if (streamBytes[i] == 46) { isDecimal = true; } else { // anything else we can assume malformed and should break. break; } } if (singed) { if (isDecimal) { return -(digit + decimal); } else { return -digit; } } else { if (isDecimal) { return digit + decimal; } else { return digit; } } } public long getLongSurroundedByWhitespace() { long num = 0L; boolean makeNegative = false; boolean readNonWhitespace = false; try { while (true) { int curr = reader.read(); if (curr < 0) break; if (Character.isWhitespace((char) curr)) { if (readNonWhitespace) break; } else if (curr == '-') { makeNegative = true; readNonWhitespace = true; } else if (curr >= '0' && curr <= '9') { num *= 10L; num += ((long) (curr - '0')); readNonWhitespace = true; } else { break; } } } catch (IOException e) { logger.log(Level.FINER, "Error detecting long.", e); } if (makeNegative) num = num * -1L; return num; } public int getLinearTraversalOffset() { return linearTraversalOffset; } public char getCharSurroundedByWhitespace() { char alpha = 0; try { while (true) { int curr = reader.read(); if (curr < 0) break; char c = (char) curr; if (!Character.isWhitespace(c)) { alpha = c; break; } } } catch (IOException e) { logger.log(Level.FINE, "Error detecting char.", e); } return alpha; } /** * White space characters defined by ' ', '\t', '\r', '\n', '\f' * * @param c true if character is white space */ public static boolean isWhitespace(char c) { return ((c == ' ') || (c == '\t') || (c == '\r') || (c == '\n') || (c == '\f') || (c == 0)); } private static boolean isDelimiter(char c) { return ((c == '[') || (c == ']') || (c == '(') || (c == ')') || (c == '<') || (c == '>') || (c == '{') || (c == '}') || (c == '/') || (c == '%')); } private long captureStreamData(OutputStream out) throws IOException { long numBytes = 0; while (true) { // read bytes int nextByte = reader.read(); // look to see if we have the ending tag if (nextByte == 'e') { reader.mark(10); if (reader.read() == 'n' && reader.read() == 'd' && reader.read() == 's' && reader.read() == 't' && reader.read() == 'r' && reader.read() == 'e' && reader.read() == 'a' && reader.read() == 'm') { break; } else { reader.reset(); } } else if (nextByte < 0) break; // write the bytes if (out != null) out.write(nextByte); numBytes++; } return numBytes; } private long skipUntilEndstream(OutputStream out) throws IOException { long skipped = 0L; while (true) { reader.mark(10); // read bytes int nextByte = reader.read(); if (nextByte == 'e' && reader.read() == 'n' && reader.read() == 'd' && reader.read() == 's' && reader.read() == 't' && reader.read() == 'r' && reader.read() == 'e' && reader.read() == 'a' && reader.read() == 'm') { reader.reset(); break; } else if (nextByte < 0) break; else { if (nextByte == 0x0A || nextByte == 0x0D || nextByte == 0x20) continue; if (out != null) out.write(nextByte); } skipped++; } return skipped; } private float parseNumber(StringBuilder stringBuilder) { float digit = 0; float divisor = 10; boolean isDigit; boolean isDecimal = false; int startTokenPos = 0; int length = stringBuilder.length(); char[] streamBytes = new char[length]; stringBuilder.getChars(0, length, streamBytes, 0); boolean singed = streamBytes[startTokenPos] == '-'; startTokenPos = singed ? startTokenPos + 1 : startTokenPos; int current; for (int i = startTokenPos; i < length; i++) { current = streamBytes[i] - 48; isDigit = streamBytes[i] >= 48 && streamBytes[i] <= 57; if (!isDecimal && isDigit) { digit = (digit * 10) + current; } else if (isDecimal && isDigit) { digit += (current / divisor); divisor *= 10; } else if (streamBytes[i] == 46) { isDecimal = true; } else { // anything else we can assume malformed and should break. break; } } if (singed) { return -digit; } else { return digit; } } }