/*
* Copyright 2006-2012 ICEsoft Technologies Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the
* License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an "AS
* IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
package org.icepdf.core.util;
import org.icepdf.core.exceptions.PDFException;
import org.icepdf.core.io.ConservativeSizingByteArrayOutputStream;
import org.icepdf.core.io.SeekableByteArrayInputStream;
import org.icepdf.core.io.SeekableInput;
import org.icepdf.core.io.SeekableInputConstrainedWrapper;
import org.icepdf.core.pobjects.*;
import org.icepdf.core.pobjects.annotations.Annotation;
import org.icepdf.core.pobjects.fonts.FontDescriptor;
import org.icepdf.core.pobjects.fonts.FontFactory;
import org.icepdf.core.pobjects.graphics.TilingPattern;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Hashtable;
import java.util.Stack;
import java.util.Vector;
import java.util.logging.Logger;
import java.util.logging.Level;
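/**
* Reads PDF tokens from an input stream and assembles them into objects.
* {@link #getToken()} returns one lexical token per call,
* {@link #getObject(Library)} builds objects from those tokens using an
* internal stack, and {@link #getStreamObject()} reads values found in a
* content stream.
*/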
public class Parser {
// Logger used for parse diagnostics.
// NOTE(review): the name comes from Class.toString(), which yields
// "class org.icepdf.core.util.Parser" rather than the plain class name - confirm this is intended.
private static final Logger logger =
Logger.getLogger(Parser.class.toString());
// Default parse mode.
public static final int PARSE_MODE_NORMAL = 0;
// Object stream parse mode: getObject() returns as soon as a value is on the
// stack while the nesting depth is zero.
public static final int PARSE_MODE_OBJECT_STREAM = 1;
// InputStream has to support mark(), reset(), and markSupported()
// DO NOT close this, since we have two cases: read everything up front, and progressive reads
private InputStream reader;
// Set by getToken(): true when the token it just produced was a hex string.
boolean lastTokenHString = false;
// Working stack on which getObject() collects tokens and assembled objects.
private Stack<Object> stack = new Stack<Object>();
// One of PARSE_MODE_NORMAL or PARSE_MODE_OBJECT_STREAM, fixed at construction.
private int parseMode;
/**
* Creates a parser in {@link #PARSE_MODE_NORMAL} that reads from the given
* seekable input.
*
* @param r seekable input supplying the PDF data
*/
public Parser(SeekableInput r) {
this(r, PARSE_MODE_NORMAL);
}
/**
* Creates a parser that reads from the stream returned by
* <code>r.getInputStream()</code>, using the given parse mode.
*
* @param r seekable input supplying the PDF data
* @param pm parse mode, {@link #PARSE_MODE_NORMAL} or
* {@link #PARSE_MODE_OBJECT_STREAM}
*/
public Parser(SeekableInput r, int pm) {
reader = r.getInputStream();
parseMode = pm;
}
/**
* Creates a parser in {@link #PARSE_MODE_NORMAL} that reads from the given
* input stream.
*
* @param r input stream supplying the PDF data
*/
public Parser(InputStream r) {
this(r, PARSE_MODE_NORMAL);
}
/**
* Creates a parser over the given input stream using the given parse mode.
* The stream is wrapped in a {@link BufferedInputStream}, which supplies the
* mark()/reset() support the parser relies on.
*
* @param r input stream supplying the PDF data
* @param pm parse mode, {@link #PARSE_MODE_NORMAL} or
* {@link #PARSE_MODE_OBJECT_STREAM}
*/
public Parser(InputStream r, int pm) {
reader = new BufferedInputStream(r);
parseMode = pm;
}
/**
* Gets the next object from the PDF input stream.
* <p/>
* Tokens are read with {@link #getToken()} and collected on the internal
* stack until one of the following happens:
* <ul>
* <li>"endobj" or "endstream" is read while inside an "obj" declaration:
* the top of the stack is registered with the library and returned as a
* PObject (see {@link #addPObject(Library, Reference)});</li>
* <li>"&gt;&gt;" closes a dictionary and the nesting depth is back to zero:
* the dictionary (or the object built from it) is returned;</li>
* <li>"trailer" is read: a PTrailer is returned;</li>
* <li>in {@link #PARSE_MODE_OBJECT_STREAM}, a value is on the stack while
* the nesting depth is zero: that value is returned.</li>
* </ul>
*
* @param library all found objects in the pdf document
* @return the next object in the input stream. Null is returned if
* getToken() throws an IOException (for example at end of input), if
* an I/O error occurs while reading stream data, if a stray "endobj"
* is read outside of an object, or if any other exception is raised
* while parsing (it is logged at FINE level).
* @throws PDFException error getting object from library
*/
public Object getObject(Library library) throws PDFException {
// nesting depth of obj/stream/array/dictionary constructs
int deepnessCount = 0;
boolean inObject = false; // currently parsing tokens in an object
// NOTE: never set to true below, so the loop is only left through a
// return statement or an exception.
boolean complete = false; // flag used for do loop.
Object nextToken;
// reference of the object currently being parsed, set when "obj" is read
Reference objectReference = null;
try {
do { //while (!complete);
// get the next token inside the object stream
try {
nextToken = getToken();
}
catch (IOException e) {
// expected once the input is exhausted: nothing more to parse
return null;
}
// check for specific primitive object types returned by getToken()
if (nextToken instanceof StringObject
|| nextToken instanceof Name
|| nextToken instanceof Number) {
// Very important: store the PDF object reference information,
// as it is needed to decrypt an encrypted string.
if (nextToken instanceof StringObject) {
StringObject tmp = (StringObject) nextToken;
tmp.setReference(objectReference);
}
stack.push(nextToken);
}
// mark that we have entered an object declaration
else if (nextToken.equals("obj")) {
// Since we can return objects on "endstream", then we can
// leave straggling "endobj", which would deepnessCount--,
// even though they're done in a separate method invocation
// Hence, "obj" does /deepnessCount = 1/ instead of /deepnessCount++/
deepnessCount = 1;
inObject = true;
// the two numbers preceding "obj" are popped in reverse order:
// generation number first, then object number
Number generationNumber = (Number) (stack.pop());
Number objectNumber = (Number) (stack.pop());
objectReference = new Reference(objectNumber,
generationNumber);
}
// mark that we have reached the end of the object
else if (nextToken.equals("endobj")) {
deepnessCount--;
if (inObject) {
// set flag to false, as we are done parsing an Object
inObject = false;
// return PObject,
return addPObject(library, objectReference);
// else, we ignore as the endstream token also returns a
// PObject.
} else
return null;
}
// found endstream object, we will return the PObject containing
// the stream as there can be no further tokens. This addresses
// a syntax error in OpenOffice documents where
// the endobj tag is missing on some Stream objects.
else if (nextToken.equals("endstream")) {
deepnessCount--;
// do nothing, but don't add it to the stack
if (inObject) {
inObject = false;
// return PObject,
return addPObject(library, objectReference);
}
}
// found a stream object, streams are always defined inside
// of an object so we will always have a dictionary (hash) that
// has the length and filter definitions in it
else if (nextToken.equals("stream")) {
deepnessCount++;
// pop dictionary that defines the stream
Hashtable streamHash = (Hashtable) stack.pop();
// find the length of the stream
int streamLength = library.getInt(streamHash, "Length");
SeekableInputConstrainedWrapper streamInputWrapper;
try {
// a stream token's end of line marker can be either:
// - a carriage return and a line feed
// - just a line feed, and not by a carriage return alone.
// check for carriage return and line feed, but reset if
// just a carriage return as it is a valid stream byte
reader.mark(2);
// always eat a 13, against the spec but we have several examples of this.
int curChar = reader.read();
if (curChar == 13) {
reader.mark(1);
if (reader.read() != 10) {
reader.reset();
}
}
// always eat a 10
else if (curChar == 10) {
// eat the stream character
}
// reset the rest
else {
reader.reset();
}
if (reader instanceof SeekableInput) {
// the stream data is not copied: the wrapper records the
// position and length of the data inside the seekable input
SeekableInput streamDataInput = (SeekableInput) reader;
long filePositionOfStreamData = streamDataInput.getAbsolutePosition();
long lengthOfStreamData;
// If the stream has a length that we can currently use
// such as a R that has been parsed or an integer
if (streamLength > 0) {
lengthOfStreamData = streamLength;
streamDataInput.seekRelative(streamLength);
// Read any extraneous data coming after the length, but before endstream
lengthOfStreamData += skipUntilEndstream(null);
} else {
lengthOfStreamData = captureStreamData(null);
}
streamInputWrapper = new SeekableInputConstrainedWrapper(
streamDataInput, filePositionOfStreamData, lengthOfStreamData, false);
} else { // reader is just regular InputStream (BufferedInputStream)
ConservativeSizingByteArrayOutputStream out;
// If the stream in from a regular InputStream,
// then the PDF was probably linearly traversed,
// in which case it doesn't matter if they have
// specified the stream length, because we can't
// trust that anyway
if (!library.isLinearTraversal() && streamLength > 0) {
byte[] buffer = new byte[streamLength];
int totalRead = 0;
// read() may return fewer bytes than requested, so loop
// until the buffer is full or the input runs out
while (totalRead < buffer.length) {
int currRead = reader.read(buffer, totalRead, buffer.length - totalRead);
if (currRead <= 0)
break;
totalRead += currRead;
}
out = new ConservativeSizingByteArrayOutputStream(
buffer, library.memoryManager);
// Read any extraneous data coming after the length, but before endstream
skipUntilEndstream(out);
}
// if stream doesn't have a length, read the stream
// until end stream has been found
else {
out = new ConservativeSizingByteArrayOutputStream(
16 * 1024, library.memoryManager);
captureStreamData(out);
}
int size = out.size();
out.trim();
byte[] buffer = out.relinquishByteArray();
SeekableInput streamDataInput = new SeekableByteArrayInputStream(buffer);
long filePositionOfStreamData = 0L;
long lengthOfStreamData = size;
streamInputWrapper = new SeekableInputConstrainedWrapper(
streamDataInput, filePositionOfStreamData, lengthOfStreamData, true);
}
}
catch (IOException e) {
// the stream data could not be read: give up on this object
return null;
}
PTrailer trailer = null;
// set the stream known objects if possible
Stream stream = null;
Name type = (Name) library.getObject(streamHash, "Type");
Name subtype = (Name) library.getObject(streamHash, "Subtype");
if (type != null) {
// new Tiling Pattern Object, will have a stream.
if (type.equals("Pattern")) {
stream = new TilingPattern(library, streamHash, streamInputWrapper);
} else if (type.equals("XRef")) {
// cross reference stream: decode its entries right away and
// turn the stream dictionary into a trailer
stream = new Stream(library, streamHash, streamInputWrapper);
stream.init();
InputStream in = stream.getInputStreamForDecodedStreamBytes();
CrossReference xrefStream = new CrossReference();
if (in != null) {
try {
xrefStream.addXRefStreamEntries(library, streamHash, in);
}
finally {
try {
in.close();
}
catch (IOException e) {
logger.log(Level.FINE, "Error appending stream entries.", e);
}
}
}
stream.dispose(false);
// XRef dict is both Trailer dict and XRef stream dict.
// PTrailer alters its dict, so copy it to keep everything sane
Hashtable trailerHash = (Hashtable) streamHash.clone();
trailer = new PTrailer(library, trailerHash, null, xrefStream);
} else if (type.equals("ObjStm")) {
stream = new ObjectStream(library, streamHash, streamInputWrapper);
}
}
if (subtype != null) {
// new form object
// NOTE(review): "pattern".equals(type) compares a String with a Name,
// so it looks like it can never be true and a Form subtype always
// replaces a TilingPattern created above - confirm the intent.
if (subtype.equals("Form") && !"pattern".equals(type)) {
stream = new Form(library, streamHash, streamInputWrapper);
}
}
if (trailer != null) {
stack.push(trailer);
} else {
// finally create a generic stream object which will be parsed
// at a later time
if (stream == null) {
stream = new Stream(library, streamHash, streamInputWrapper);
}
stack.push(stream);
}
}
// end if (stream)
// boolean objects are added to stack
else if (nextToken.equals("true")) {
stack.push(new Boolean(true));
} else if (nextToken.equals("false")) {
stack.push(new Boolean(false));
}
// Indirect Reference object found
else if (nextToken.equals("R")) {
// generationNumber number important for revisions
Number generationNumber = (Number) (stack.pop());
Number objectNumber = (Number) (stack.pop());
stack.push(new Reference(objectNumber,
generationNumber));
} else if (nextToken.equals("[")) {
deepnessCount++;
stack.push(nextToken);
}
// Found an array
else if (nextToken.equals("]")) {
deepnessCount--;
// search() returns the 1-based distance of "[" from the top of the
// stack, or -1 when it is absent; everything above it is array content
final int searchPosition = stack.search("[");
final int size = searchPosition - 1;
Vector v = new Vector(size > 0 ? size : 1);
if (size > 0)
v.setSize(size);
if (searchPosition > 0) {
// pop the elements back to front so they keep their document order
for (int i = size-1; i >= 0; i--) {
Object obj = stack.pop();
v.set(i, obj);
}
stack.pop(); // "["
}
else {
// no matching "[" on the stack: discard what is there
stack.clear();
}
stack.push(v);
} else if (nextToken.equals("<<")) {
deepnessCount++;
stack.push(nextToken);
}
// Found a Dictionary
else if (nextToken.equals(">>")) {
deepnessCount--;
Hashtable hashTable = new Hashtable();
if (!stack.isEmpty()) {
Object obj = stack.pop();
// put all of the dictionary definition into the hashTable:
// entries come off the stack as value first, then key, until
// the "<<" marker is reached or the stack runs out
while (!((obj instanceof String)
&& (obj.equals("<<"))) && !stack.isEmpty()) {
Object key = stack.pop();
hashTable.put(key, obj);
if (!stack.isEmpty()) {
obj = stack.pop();
} else {
break;
}
}
obj = hashTable.get("Type");
// Process the known first level dictionaries.
if (obj != null && obj instanceof Name) {
Name n = (Name) obj;
if (n.equals("Catalog")) {
stack.push(new Catalog(library, hashTable));
} else if (n.equals("Pages")) {
stack.push(new PageTree(library, hashTable));
} else if (n.equals("Page")) {
stack.push(new Page(library, hashTable));
} else if (n.equals("Font")) {
stack.push(FontFactory.getInstance()
.getFont(library, hashTable));
} else if (n.equals("FontDescriptor")) {
stack.push(new FontDescriptor(library, hashTable));
} else if (n.equals("CMap")) {
stack.push(hashTable);
} else if (n.equals("Annot")) {
stack.push(Annotation.buildAnnotation(library, hashTable));
} else
stack.push(hashTable);
}
// everything else gets pushed onto the stack
else {
stack.push(hashTable);
}
// a dictionary closed at depth zero is a complete object on its own
if (deepnessCount == 0)
return stack.pop();
}
}
// end of if >> (dictionary)
else if (nextToken.equals("xref")) {
// the table entries are read by CrossReference directly from this parser
CrossReference xrefTable = new CrossReference();
xrefTable.addXRefTableEntries(this);
stack.push(xrefTable);
} else if (nextToken.equals("trailer")) {
CrossReference xrefTable = null;
// NOTE(review): peek() throws on an empty stack; that exception is
// caught by the outer catch below and turned into a null return.
if (stack.peek() instanceof CrossReference)
xrefTable = (CrossReference) stack.pop();
stack.clear();
// the trailer dictionary follows the keyword; parse it recursively
Hashtable trailerDictionary = (Hashtable) getObject(library);
return new PTrailer(library, trailerDictionary, xrefTable, null);
}
// comments
else if (nextToken instanceof String &&
((String) nextToken).startsWith("%")) {
// Comment, ignored for now
}
// everything else gets pushed onto the stack
else {
stack.push(nextToken);
}
// inside an object stream there are no obj/endobj keywords, so a value
// at nesting depth zero is already a complete object
if (parseMode == PARSE_MODE_OBJECT_STREAM && deepnessCount == 0 && stack.size() > 0) {
return stack.pop();
}
}
while (!complete);
}
catch (Exception e) {
logger.log(Level.FINE, "Fatal error parsing PDF file stream.", e);
return null;
}
// return the top of the stack
return stack.pop();
}
/**
 * Pops the most recently parsed object off the parser stack, registers it
 * with the library under the given reference and wraps it in a PObject.
 * Streams and dictionaries are additionally told their own reference, which
 * is needed later to decrypt their (string) content.
 *
 * @param library         library the popped object is added to
 * @param objectReference indirect reference of the popped object
 * @return a PObject pairing the popped object with its reference
 */
public PObject addPObject(Library library, Reference objectReference) {
    final Object parsed = stack.pop();

    if (parsed instanceof Stream) {
        // reference is required for decrypting encrypted streams
        ((Stream) parsed).setPObjectReference(objectReference);
    } else if (parsed instanceof Dictionary) {
        // reference is required for decrypting strings held by the dictionary
        ((Dictionary) parsed).setPObjectReference(objectReference);
    }

    library.addObject(parsed, objectReference);
    return new PObject(parsed, objectReference);
}
/**
 * Returns the next object found in a content stream. A "&lt;&lt;" token
 * starts a dictionary and a "[" token starts an array; both are read
 * recursively until their closing token and returned as a Hashtable or a
 * Vector respectively. Any other token is returned unchanged.
 *
 * @return next object in the input stream
 * @throws java.io.IOException when the end of the <code>InputStream</code>
 *                             has been encountered.
 */
public Object getStreamObject() throws IOException {
    final Object token = getToken();
    if (!(token instanceof String)) {
        return token;
    }

    if (token.equals("<<")) {
        final Hashtable<Object, Object> dictionary = new Hashtable<Object, Object>();
        // entries arrive as key, value, key, value, ... until ">>"
        for (Object key = getStreamObject(); !key.equals(">>"); key = getStreamObject()) {
            dictionary.put(key, getStreamObject());
        }
        return dictionary;
    }

    // arrays are only used for CID mappings, the hex decoding is delayed
    // as a result using the CID_STREAM flag
    if (token.equals("[")) {
        final Vector<Object> elements = new Vector<Object>();
        for (Object element = getStreamObject(); !element.equals("]"); element = getStreamObject()) {
            elements.addElement(element);
        }
        elements.trimToSize();
        return elements;
    }

    return token;
}
/**
* Parses one PDF token from the input stream. The reader's mark()/reset()
* support is used to look ahead and to push back the byte that ended the
* token.
* <p/>
* The returned value is one of:
* <ul>
* <li>the String "[", "]", "&lt;&lt;" or "&gt;&gt;";</li>
* <li>a String starting with "%" holding the text of a comment;</li>
* <li>a HexStringObject for <code>&lt;...&gt;</code> (lastTokenHString is
* then set to true);</li>
* <li>a LiteralStringObject for <code>(...)</code>, with escape sequences
* already decoded;</li>
* <li>a Name for a token starting with "/";</li>
* <li>a Float or Integer when the token contains a digit and parses as a
* number;</li>
* <li>otherwise the raw token text as a String.</li>
* </ul>
*
* @return the next token in the pdf data stream
* @throws java.io.IOException if an I/O error occurs, or if the end of the
* input is reached before a token starts.
*/
public Object getToken() throws IOException {
int currentByte;
char currentChar;
boolean inString = false; // currently parsing a string
boolean hexString = false;
lastTokenHString = false;
// strip all white space characters
do {
currentByte = reader.read();
// end of input reached before any token started
if (currentByte < 0) {
throw new IOException();
}
currentChar = (char) currentByte;
}
while (isWhitespace(currentChar));
/**
* look for the start of different primitive pdf objects
* ( - strings
* [ - arrays
* % - comments
*/
if (currentChar == '(') {
// mark that we are currently processing a string
inString = true;
} else if (currentChar == ']') {
// found end of an array
return "]";
} else if (currentChar == '[') {
// found beginning of an array
return "[";
} else if (currentChar == '%') {
// collect all the characters of the comment until
// we get to the end of the line
StringBuilder stringBuffer = new StringBuilder();
do {
stringBuffer.append(currentChar);
currentByte = reader.read();
if (currentByte < 0) {
// Final %%EOF might not have CR LF afterwards
if (stringBuffer.length() > 0)
return stringBuffer.toString();
throw new IOException();
}
currentChar = (char) currentByte;
}
while (currentChar != 13 && currentChar != 10);
// return all the text that is in the comment
return stringBuffer.toString();
}
// mark this location in the input stream
reader.mark(1);
// read the next char from the reader, as a one character look-ahead
char nextChar = (char) reader.read();
// Check for dictionaries, start '<<' and end '>>'
if (currentChar == '>' && nextChar == '>') {
return ">>";
}
if (currentChar == '<') {
// if two "<<" then we have a dictionary
if (nextChar == '<') {
return "<<";
}
// Otherwise we have a hex string
else {
inString = true;
hexString = true;
}
}
// return to the previous mark, un-reading the look-ahead char
reader.reset();
// store the parsed char in the token buffer.
StringBuilder stringBuffer = new StringBuilder();
stringBuffer.append(currentChar);
/**
* Finally parse the contents of a complex token
*/
// nesting depth of unescaped parentheses inside a literal string
int parenthesisCount = 0;
boolean complete = false;
// indicates that the current char should be ignored and not added to
// the current string.
boolean ignoreChar = false;
do { // while !complete
// if we are not parsing a string mark the location, so the byte that
// ends the token can be pushed back with reset()
if (!inString) {
reader.mark(1);
}
// PDF-215, try to sniff out missing space between tokens and numbers
// in a content stream. The fix only addressed a character followed
// by a number. It's legal for a /Name object to have mixed content
// so we need to check for / at the start of the string.
// NOTE: nextChar is only ever assigned by the look-ahead above, so it
// always holds the second character of the token.
if ( !(inString || hexString) && currentChar != 'd' &&
currentChar > 65 && (nextChar >=48 && nextChar <= 57) &&
stringBuffer.charAt(0) != '/'){
reader.reset();
break;
}
// get the next byte and corresponding char
currentByte = reader.read();
// if there are no more bytes (-1) then we should return previous
// stringBuffer value, otherwise the last grouping of tokens will
// be ignored, which is very bad.
if (currentByte >= 0) {
currentChar = (char) currentByte;
} else {
return stringBuffer.toString();
}
// if we are parsing a token that is a string, (...) or <...>
if (inString) {
if (hexString) {
// found the end of the hex string
if (currentChar == '>') {
complete = true;
stringBuffer.append(currentChar);
break;
}
} else {
// look for embedded strings
if (currentChar == '(') {
parenthesisCount++;
}
if (currentChar == ')') {
if (parenthesisCount == 0) {
complete = true;
stringBuffer.append(currentChar);
break;
} else {
parenthesisCount--;
}
}
// look for "\" character
/**
* The escape sequences can be as follows:
* \n - line feed (LF)
* \r - Carriage return (CR)
* \t - Horizontal tab (HT)
* \b - backspace (BS)
* \f - form feed (FF)
* \( - left parenthesis
* \) - right parenthesis
* \\ - backslash
* \ddd - character code ddd (octal)
*
* Note: (\0053) denotes a string containing two characters,
* \005 (Control-E) followed by the digit 3.
*/
if (currentChar == '\\') {
// read next char
currentChar = (char) reader.read();
// check for a digit, if so we have an octal
// and we need to handle it correctly
if (Character.isDigit(currentChar)) {
// store the read digits
StringBuilder digit = new StringBuilder();
digit.append(currentChar);
// octals have a max size of 3 digits, we already
// have one, so there can be up to 2 more digits.
for (int i = 0; i < 2; i++) {
// mark the reader in case the next read is not
// a digit.
reader.mark(1);
// read next char
currentChar = (char) reader.read();
if (Character.isDigit(currentChar)) {
digit.append(currentChar);
} else {
// back up the reader just in case
// there are only 1 or 2 digits in the octal
reader.reset();
break;
}
}
// finally convert digit to a character; on a parse
// error the character code stays 0
int charNumber = 0;
try {
charNumber = Integer.parseInt(digit.toString(), 8);
}
catch (NumberFormatException e) {
logger.log(Level.FINE, "Integer parse error ", e);
}
// convert the integer from octal to dec.
currentChar = (char) charNumber;
}
// escaped parenthesis or backslash: keep the character itself
else if (currentChar == '(' || currentChar == ')'
|| currentChar == '\\') {
}
// capture the horizontal tab (HT)
else if (currentChar == 't') {
currentChar = '\t';
}
// capture the carriage return (CR)
else if (currentChar == 'r') {
currentChar = '\r';
}
// capture the line feed (LF)
else if (currentChar == 'n') {
currentChar = '\n';
}
// capture the backspace (BS)
else if (currentChar == 'b') {
currentChar = '\b';
}
// capture the form feed (FF)
else if (currentChar == 'f') {
currentChar = '\f';
}
// ignore a CR (13) following the '\', which marks a string
// that was split over two lines
else if (currentChar == 13) {
ignoreChar = true;
}
// otherwise report the file format error
// NOTE(review): the guard checks FINE but the message is
// logged at WARNING - confirm which level is intended.
else {
if (logger.isLoggable(Level.FINE)) {
logger.warning("C=" + ((int) currentChar));
}
}
}
}
}
// if we are not in a string definition we want to break
// and return the current token, as white spaces or other elements
// would mean that we are on the next token
else if (isWhitespace(currentChar)) {
// we need to push back the CR / LF, as it is needed by stream parsing
if (currentByte == 13 || currentByte == 10) {
reader.reset();
break;
}
// break on any whitespace
else {
break;
}
} else if (isDelimiter(currentChar)) {
// reset the reader so we start on this token on the next parse
reader.reset();
break;
}
// append the current char and keep parsing if needed
// ignoreChar is set by the line split char '\'
if (!ignoreChar) {
stringBuffer.append(currentChar);
}
// reset the ignoreChar flag
else {
ignoreChar = false;
}
}
while (!complete);
/**
* Return what we found
*/
// a hex string is handed over undecoded, including its < > delimiters
if (hexString) {
lastTokenHString = true;
return new HexStringObject(stringBuffer);
}
// do a little clean up for any object that may have been missed..
// this is mainly for the document trailer information
// an orphaned string
if (inString) {
return new LiteralStringObject(stringBuffer);
}
// return a new name, without the leading '/'
else if (stringBuffer.charAt(0) == '/') {
return new Name(stringBuffer.deleteCharAt(0));
}
// if a number try and parse it
else {
boolean foundDigit = false;
boolean foundDecimal = false;
for (int i = stringBuffer.length() - 1; i >= 0; i--) {
char curr = stringBuffer.charAt(i);
if (curr == '.')
foundDecimal = true;
else if (curr >= '0' && curr <= '9')
foundDigit = true;
}
// Only bother trying to interpret as a number if contains a digit somewhere,
// to reduce NumberFormatExceptions
if (foundDigit) {
try {
if (foundDecimal)
return Float.valueOf(stringBuffer.toString());
else {
return Integer.valueOf(stringBuffer.toString());
}
}
catch (NumberFormatException ex) {
// not a number after all: fall through and return the raw text
}
}
}
return stringBuffer.toString();
}
/**
 * Marks the reader, then reads one whitespace- or delimiter-terminated
 * token of at most <code>maxLength</code> bytes. Leading whitespace is
 * skipped. The mark allows the read to be undone with
 * {@link #ungetNumberOrStringWithReset()}.
 *
 * @param maxLength maximum number of bytes to read; also the mark limit
 * @return a Float or Integer when the token contains a digit and parses as
 *         a number, otherwise the token text, or null if no token
 *         characters were read
 * @throws IOException if reading from, or resetting, the reader fails
 */
public Object getNumberOrStringWithMark(int maxLength) throws IOException {
    reader.mark(maxLength);
    final StringBuilder text = new StringBuilder(maxLength);
    boolean tokenStarted = false;
    boolean hasDigit = false;
    boolean hasDecimalPoint = false;

    int consumed = 0;
    while (consumed < maxLength) {
        final int raw = reader.read();
        if (raw < 0) {
            break;
        }
        final char ch = (char) raw;

        if (isWhitespace(ch)) {
            // whitespace ends the token, but is skipped in front of it
            if (tokenStarted) {
                break;
            }
            consumed++;
            continue;
        }

        if (isDelimiter(ch)) {
            // The token has a delimiter immediately after it, which has to
            // be unread: rewind to the mark and re-read everything that
            // came before the delimiter.
            reader.reset();
            reader.mark(maxLength);
            for (int replay = 0; replay < consumed; replay++) {
                reader.read();
            }
            break;
        }

        tokenStarted = true;
        if (ch == '.') {
            hasDecimalPoint = true;
        } else if (ch >= '0' && ch <= '9') {
            hasDigit = true;
        }
        text.append(ch);
        consumed++;
    }

    final String value = text.toString();
    // Only try to interpret the token as a number if it contains a digit
    // somewhere, to reduce NumberFormatExceptions
    if (hasDigit) {
        try {
            if (hasDecimalPoint) {
                return Float.valueOf(value);
            }
            return Integer.valueOf(value);
        } catch (NumberFormatException ignored) {
            // not a number after all: fall through and return the text
        }
    }
    return value.length() > 0 ? value : null;
}
/**
* Resets the reader to its most recent mark, which undoes the read done by
* {@link #getNumberOrStringWithMark(int)} when called right after it.
*
* @throws IOException if the reader cannot be reset
*/
public void ungetNumberOrStringWithReset() throws IOException {
reader.reset();
}
/**
 * Reads an integer from the reader. Leading whitespace is skipped and
 * reading stops at the first whitespace following a digit or '-', or at
 * end of input. A '-' anywhere makes the result negative; characters that
 * are neither whitespace, '-' nor a digit are skipped. An IOException is
 * logged at FINE level and the value accumulated so far is returned.
 *
 * @return the parsed value, 0 if no digits were read
 */
public int getIntSurroundedByWhitespace() {
    int magnitude = 0;
    boolean negative = false;
    boolean started = false;
    try {
        for (int b = reader.read(); b >= 0; b = reader.read()) {
            if (Character.isWhitespace((char) b)) {
                if (started) {
                    break;
                }
            } else if (b == '-') {
                negative = true;
                started = true;
            } else if (b >= '0' && b <= '9') {
                magnitude = magnitude * 10 + (b - '0');
                started = true;
            }
        }
    } catch (IOException e) {
        logger.log(Level.FINE, "Error detecting int.", e);
    }
    return negative ? -magnitude : magnitude;
}
/**
 * Reads a long from the reader. Leading whitespace is skipped and reading
 * stops at the first whitespace following a digit or '-', or at end of
 * input. A '-' anywhere makes the result negative; characters that are
 * neither whitespace, '-' nor a digit are skipped. An IOException is
 * logged at FINE level and the value accumulated so far is returned.
 *
 * @return the parsed value, 0 if no digits were read
 */
public long getLongSurroundedByWhitespace() {
    long magnitude = 0L;
    boolean negative = false;
    boolean started = false;
    try {
        for (int b = reader.read(); b >= 0; b = reader.read()) {
            if (Character.isWhitespace((char) b)) {
                if (started) {
                    break;
                }
            } else if (b == '-') {
                negative = true;
                started = true;
            } else if (b >= '0' && b <= '9') {
                magnitude = magnitude * 10L + (long) (b - '0');
                started = true;
            }
        }
    } catch (IOException e) {
        logger.log(Level.FINE, "Error detecting long.", e);
    }
    return negative ? -magnitude : magnitude;
}
/**
 * Skips whitespace and returns the first non-whitespace character read
 * from the reader. An IOException is logged at FINE level.
 *
 * @return the first non-whitespace character, or the NUL character (0) if
 *         the input ended or an I/O error occurred before one was found
 */
public char getCharSurroundedByWhitespace() {
    try {
        int b;
        while ((b = reader.read()) >= 0) {
            final char candidate = (char) b;
            if (!Character.isWhitespace(candidate)) {
                return candidate;
            }
        }
    } catch (IOException e) {
        logger.log(Level.FINE, "Error detecting char.", e);
    }
    return 0;
}
/**
 * Converts a delimited hex token such as <code>&lt;1F&gt;</code> to an int.
 * The first and last characters of the argument are dropped and the rest
 * is parsed as a base 16 number.
 *
 * @param hex hex digits enclosed in one leading and one trailing character
 * @return the numeric value of the enclosed digits
 */
int hexToInt(String hex) {
    final int lastIndex = hex.length() - 1;
    final String digits = hex.substring(1, lastIndex).toUpperCase();
    return Integer.parseInt(digits, 16);
}
/**
 * Decodes a delimited hex string such as <code>&lt;4142&gt;</code> into
 * text. The first and last characters of the argument are dropped. If the
 * remaining digits start with the byte order mark <code>FEFF</code>, each
 * following group of four digits becomes one character; otherwise each
 * group of two digits becomes one character. Trailing digits that do not
 * fill a complete group are ignored.
 *
 * @param hh hex digits enclosed in one leading and one trailing character
 * @return the decoded text, empty if there are no complete digit groups
 * @throws NumberFormatException if a four digit group following the byte
 *                               order mark is not valid hexadecimal; invalid
 *                               two digit groups are skipped instead
 */
String hexToString(String hh) {
    hh = hh.substring(1, hh.length() - 1).toUpperCase();
    StringBuilder sb = new StringBuilder();
    // startsWith() is safe for payloads shorter than four characters, where
    // probing charAt(0..3) would run past the end of the string
    if (hh.startsWith("FEFF")) {
        // skip the byte order mark itself, then decode 16 bit code units
        for (int i = 4; i + 4 <= hh.length(); i += 4) {
            sb.append((char) Integer.parseInt(hh.substring(i, i + 4), 16));
        }
    } else {
        for (int i = 0; i + 2 <= hh.length(); i += 2) {
            try {
                sb.append((char) Short.parseShort(hh.substring(i, i + 2), 16));
            } catch (NumberFormatException ignored) {
                // best effort: a pair that is not valid hex is left out
            }
        }
    }
    return sb.toString();
}
/**
 * Copies inline image data from the reader to <code>out</code> until the
 * end of the current line, the end of the input, or the closing
 * <code>EI</code> delimiter, whichever comes first. The line terminator is
 * written to <code>out</code> as well.
 * <p/>
 * An "EI" followed by whitespace is taken as the delimiter unless the
 * look-ahead heuristic in isStillInlineImageData() decides that image data
 * is still following, in which case the three bytes are written out as
 * data.
 *
 * @param out receives the image data
 * @return true if ate the ending EI delimiter
 * @throws java.io.IOException if reading or writing fails
 */
boolean readLineForInlineImage(OutputStream out) throws IOException {
    // The encoder might not have put EI on its own line (as it should),
    // but might just put it right after the data
    final int STATE_PRE_E = 0;
    final int STATE_PRE_I = 1;
    final int STATE_PRE_WHITESPACE = 2;
    int state = STATE_PRE_E;
    while (true) {
        int c = reader.read();
        if (c < 0)
            break;
        if (state == STATE_PRE_E && c == 'E') {
            state++;
            continue;
        } else if (state == STATE_PRE_I && c == 'I') {
            state++;
            continue;
        } else if (state == STATE_PRE_WHITESPACE && isWhitespace((char) (0xFF & c))) {
            // It's hard to tell if the EI + whitespace is part of the
            // image data or not, given that many PDFs are mis-encoded,
            // and don't give whitespace when necessary. So, instead of
            // assuming the need for whitespace, we're going to assume
            // that this is the real EI, and apply a heuristic to prove
            // ourselves wrong.
            boolean imageDataFound = isStillInlineImageData(reader, 32);
            if (imageDataFound) {
                out.write('E');
                out.write('I');
                out.write(c);
                state = STATE_PRE_E;
                if (c == '\r' || c == '\n') {
                    break;
                }
            } else
                return true;
        } else {
            // If we got a fragment of the EI<whitespace> sequence, then we withheld
            // what we had so far. But if we're here, that fragment was incomplete,
            // so that was actual embedded data, and not the delimiter, so we have
            // to write it out.
            if (state > STATE_PRE_E)
                out.write('E');
            if (state > STATE_PRE_I)
                out.write('I');
            // The byte that broke the fragment may itself start the real
            // delimiter (data ending in 'E' directly followed by "EI"), so
            // withhold it rather than writing it out as data.
            if (c == 'E') {
                state = STATE_PRE_I;
                continue;
            }
            state = STATE_PRE_E;
            out.write((byte) c);
            if (c == '\r' || c == '\n') {
                break;
            }
        }
    }
    // If the input ends right after the EI, but with no whitespace,
    // then we're still done
    if (state == STATE_PRE_WHITESPACE)
        return true;
    // The input ended on a withheld 'E' that never became "EI": it was
    // image data, so it must not be lost.
    if (state == STATE_PRE_I)
        out.write('E');
    return false;
}
/**
 * Reads the next byte from the reader.
 *
 * @return the byte read; the end-of-stream value -1 returned by the reader
 *         is narrowed like any other value and so comes back as (byte) -1
 * @throws java.io.IOException if an I/O error occurs
 */
byte readByte() throws IOException {
    final int value = reader.read();
    return (byte) value;
}
/**
 * Tells whether a character is one of the white space characters
 * ' ', '\t', '\r', '\n' or '\f'.
 *
 * @param c character to test
 * @return true if <code>c</code> is one of the characters listed above
 */
public static final boolean isWhitespace(char c) {
    switch (c) {
        case ' ':
        case '\t':
        case '\r':
        case '\n':
        case '\f':
            return true;
        default:
            return false;
    }
}
/**
 * Tells whether a character is one of the delimiter characters
 * [ ] ( ) &lt; &gt; { } / %.
 *
 * @param c character to test
 * @return true if <code>c</code> is one of the characters listed above
 */
private static final boolean isDelimiter(char c) {
    return "[]()<>{}/%".indexOf(c) >= 0;
}
/**
 * This is not necessarily an exhaustive list of characters one would
 * expect in a Content Stream, it's a heuristic for whether the data
 * might still be part of an inline image, or the content stream that
 * follows it.
 *
 * @param c character to test
 * @return true for ASCII letters and digits, whitespace, delimiters and
 *         the characters \ ' " * and .
 */
private static boolean isExpectedInContentStream(char c) {
    // the lower case range has to end at 'z': with an upper bound of 'Z'
    // the range is empty and every lower case operator looks like binary data
    return ((c >= 'a' && c <= 'z') ||
            (c >= 'A' && c <= 'Z') ||
            (c >= '0' && c <= '9') ||
            isWhitespace(c) ||
            isDelimiter(c) ||
            (c == '\\') ||
            (c == '\'') ||
            (c == '\"') ||
            (c == '*') ||
            (c == '.'));
}
/**
 * Looks ahead in the reader, without consuming anything, to judge whether
 * inline image data is still following. We want to be conservative in
 * deciding that we're still in the inline image, since we haven't found
 * any of these cases before now.
 * <p/>
 * The look-ahead stops with a negative answer when the first thing after
 * optional whitespace is a Q, q, S or s followed by whitespace, and with a
 * positive answer at the first character that is not expected in a
 * content stream.
 *
 * @param reader          stream to look ahead in; must support mark/reset
 * @param numBytesToCheck maximum number of bytes to inspect
 * @return true if a character that is not expected in a content stream
 *         was found
 * @throws IOException if reading or resetting the reader fails
 */
private static boolean isStillInlineImageData(
        InputStream reader, int numBytesToCheck)
        throws IOException {
    reader.mark(numBytesToCheck);
    final byte[] lookahead = new byte[numBytesToCheck];
    final int available = reader.read(lookahead);

    boolean binaryDataSeen = false;
    boolean leadingWhitespaceOnly = true;
    int index = 0;
    while (index < available && !binaryDataSeen) {
        final char current = (char) (lookahead[index] & 0xFF);

        // A Q or S token as the very first thing, followed by whitespace,
        // means the content stream has resumed.
        if (leadingWhitespaceOnly
                && "QqSs".indexOf(current) >= 0
                && index + 1 < available
                && isWhitespace((char) (lookahead[index + 1] & 0xFF))) {
            break;
        }

        if (!isWhitespace(current)) {
            leadingWhitespaceOnly = false;
        }
        // anything unexpected in a content stream counts as binary image data
        binaryDataSeen = !isExpectedInContentStream(current);
        index++;
    }

    reader.reset();
    return binaryDataSeen;
}
/**
 * Returns the next two characters from the reader without consuming them.
 *
 * @return a two character String holding the next two values read, each
 *         narrowed to a char
 * @throws java.io.IOException if reading or resetting the reader fails
 */
String peek2() throws IOException {
    reader.mark(2);
    final char first = (char) reader.read();
    final char second = (char) reader.read();
    final String peeked = new String(new char[]{first, second});
    reader.reset();
    return peeked;
}
/**
 * Reads bytes from the reader up to the "endstream" keyword or the end of
 * the input. The keyword itself is consumed but neither written nor
 * counted.
 *
 * @param out receives the bytes read; may be null to only count them
 * @return number of bytes read before the keyword or end of input
 * @throws IOException if reading or writing fails
 */
private long captureStreamData(OutputStream out) throws IOException {
    final String keywordTail = "ndstream";
    long captured = 0;
    for (int current = reader.read(); current >= 0; current = reader.read()) {
        if (current == 'e') {
            // possible start of the keyword: compare the following bytes,
            // stopping at the first mismatch
            reader.mark(10);
            int matched = 0;
            while (matched < keywordTail.length()
                    && reader.read() == keywordTail.charAt(matched)) {
                matched++;
            }
            if (matched == keywordTail.length()) {
                break;
            }
            // false alarm: go back to just after the 'e', which is data
            reader.reset();
        }
        if (out != null) {
            out.write(current);
        }
        captured++;
    }
    return captured;
}
/**
 * Advances the reader to the "endstream" keyword, or to the end of the
 * input, handling any extraneous data found on the way. The keyword itself
 * is left unread. Line feeds, carriage returns and spaces are dropped: they
 * are neither written nor counted. Every other byte is written to
 * <code>out</code> and counted.
 *
 * @param out receives the extraneous non-whitespace bytes; may be null
 * @return number of non-whitespace bytes found before the keyword
 * @throws IOException if reading, resetting or writing fails
 */
private long skipUntilEndstream(OutputStream out) throws IOException {
    long skipped = 0L;
    while (true) {
        reader.mark(10);
        int nextByte = reader.read();
        if (nextByte < 0)
            break;
        if (nextByte == 'e') {
            boolean atEndstream =
                    reader.read() == 'n' &&
                    reader.read() == 'd' &&
                    reader.read() == 's' &&
                    reader.read() == 't' &&
                    reader.read() == 'r' &&
                    reader.read() == 'e' &&
                    reader.read() == 'a' &&
                    reader.read() == 'm';
            // Always rewind: either to leave the keyword unread, or so that
            // the bytes consumed by a failed comparison are not lost.
            reader.reset();
            if (atEndstream)
                break;
            // false alarm: consume only the 'e' again and treat it as data
            reader.read();
        }
        if (nextByte == 0x0A || nextByte == 0x0D || nextByte == 0x20)
            continue;
        if (out != null)
            out.write(nextByte);
        skipped++;
    }
    return skipped;
}
}