ConformingPDFParser.java example

Explorer

PDF-to-unusual-HTML-master
- PDF-to-unusual-HTML
  - src
    - com
      - neumino
        pdftounusualhtml
        ConvertPdf.java
        InterruptTimerTask.java
        Jdbc.java
        Line.java
        Page.java
        Pdf2Json.java
        ProcessTimeout.java
        Structure.java
        Word.java
    - org
      - apache
        pdfbox
        ConvertColorspace.java
        Decrypt.java
        Encrypt.java
        ExportFDF.java
        ExportXFDF.java
        ExtractImages.java
        ExtractText.java
        ImportFDF.java
        ImportXFDF.java
        Overlay.java
        PDFBox.java
        PDFDebugger.java
        PDFMerger.java
        PDFReader.java
        PDFSplit.java
        PDFToImage.java
        PdfDecompressor.java
        PrintPDF.java
        TextToPDF.java
        Version.java
        WriteDecodedDoc.java
        cos
        COSArray.java
        COSBase.java
        COSBoolean.java
        COSDictionary.java
        COSDictionaryLateBinding.java
        COSDocument.java
        COSFloat.java
        COSInteger.java
        COSName.java
        COSNull.java
        COSNumber.java
        COSObject.java
        COSStream.java
        COSString.java
        COSUnread.java
        ICOSVisitor.java
        encoding
        AFMEncoding.java
        DictionaryEncoding.java
        Encoding.java
        EncodingManager.java
        MacRomanEncoding.java
        PdfDocEncoding.java
        StandardEncoding.java
        Type1Encoding.java
        WinAnsiEncoding.java
        conversion
        CJKConverter.java
        CJKEncoding.java
        CMapSubstitution.java
        EncodingConversionManager.java
        EncodingConverter.java
        encryption
        ARCFour.java
        DocumentEncryption.java
        PDFEncryption.java
        examples
        AbstractExample.java
        fdf
        PrintFields.java
        SetField.java
        pdmodel
        AddImageToPDF.java
        AddJavascript.java
        AddMessageToEachPage.java
        AddMetadataFromDocInfo.java
        Annotation.java
        CreateBlankPDF.java
        CreateBookmarks.java
        CreateLandscapePDF.java
        EmbeddedFiles.java
        ExtractMetadata.java
        GoToSecondBookmarkOnOpen.java
        HelloWorld.java
        HelloWorldTTF.java
        HelloWorldType1AfmPfb.java
        ImageToPDF.java
        PrintBookmarks.java
        PrintDocumentMetaData.java
        PrintURLs.java
        RemoveFirstPage.java
        ReplaceString.java
        ReplaceURLs.java
        RubberStamp.java
        RubberStampWithImage.java
        ShowColorBoxes.java
        UsingTextMatrix.java
        persistence
        CopyDoc.java
        signature
        ShowSignature.java
        util
        ExtractTextByArea.java
        PrintImageLocations.java
        PrintTextLocations.java
        RemoveAllText.java
        exceptions
        COSVisitorException.java
        CryptographyException.java
        InvalidPasswordException.java
        OutlineNotLocalException.java
        SignatureException.java
        WrappedException.java
        WrappedIOException.java
        filter
        ASCII85Filter.java
        ASCIIHexFilter.java
        CCITTFaxDecodeFilter.java
        CryptFilter.java
        DCTFilter.java
        Filter.java
        FilterManager.java
        FlateFilter.java
        IdentityFilter.java
        JBIG2Filter.java
        JPXFilter.java
        LZWDictionary.java
        LZWFilter.java
        LZWNode.java
        RunLengthDecodeFilter.java
        TIFFFaxDecoder.java
        io
        ASCII85InputStream.java
        ASCII85OutputStream.java
        ByteArrayPushBackInputStream.java
        FastByteArrayOutputStream.java
        NBitInputStream.java
        NBitOutputStream.java
        PushBackInputStream.java
        RandomAccess.java
        RandomAccessBuffer.java
        RandomAccessFile.java
        RandomAccessFileInputStream.java
        RandomAccessFileOutputStream.java
        pdfparser
        BaseParser.java
        ConformingPDFParser.java
        PDFObjectStreamParser.java
        PDFParser.java
        PDFStreamParser.java
        PDFXrefStreamParser.java
        VisualSignatureParser.java
        XrefTrailerResolver.java
        pdfviewer
        ArrayEntry.java
        MapEntry.java
        PDFPagePanel.java
        PDFTreeCellRenderer.java
        PDFTreeModel.java
        PageDrawer.java
        PageWrapper.java
        ReaderBottomPanel.java
        pdfwriter
        COSFilterInputStream.java
        COSStandardOutputStream.java
        COSWriter.java
        COSWriterXRefEntry.java
        ContentStreamWriter.java
        pdmodel
        ConformingPDDocument.java
        PDDestinationNameTreeNode.java
        PDDocument.java
        PDDocumentCatalog.java
        PDDocumentInformation.java
        PDDocumentNameDictionary.java
        PDEmbeddedFilesNameTreeNode.java
        PDJavascriptNameTreeNode.java
        PDPage.java
        PDPageNode.java
        PDPageable.java
        PDResources.java
        common
        COSArrayList.java
        COSDictionaryMap.java
        COSObjectable.java
        COSStreamArray.java
        DualCOSObjectable.java
        PDDestinationOrAction.java
        PDDictionaryWrapper.java
        PDMatrix.java
        PDMemoryStream.java
        PDMetadata.java
        PDNameTreeNode.java
        PDNamedTextStream.java
        PDNumberTreeNode.java
        PDObjectStream.java
        PDPageLabelRange.java
        PDPageLabels.java
        PDRange.java
        PDRectangle.java
        PDStream.java
        PDTextStream.java
        PDTypedDictionaryWrapper.java
        XrefEntry.java
        filespecification
        PDComplexFileSpecification.java
        PDEmbeddedFile.java
        PDFileSpecification.java
        PDSimpleFileSpecification.java
        function
        PDFunction.java
        PDFunctionType0.java
        PDFunctionType2.java
        PDFunctionType3.java
        PDFunctionType4.java
        documentinterchange
        logicalstructure
        PDAttributeObject.java
        PDDefaultAttributeObject.java
        PDMarkInfo.java
        PDMarkedContentReference.java
        PDObjectReference.java
        PDStructureElement.java
        PDStructureNode.java
        PDStructureTreeRoot.java
        PDUserAttributeObject.java
        PDUserProperty.java
        Revisions.java
        markedcontent
        PDMarkedContent.java
        prepress
        PDBoxStyle.java
        taggedpdf
        PDArtifactMarkedContent.java
        PDExportFormatAttributeObject.java
        PDFourColours.java
        PDLayoutAttributeObject.java
        PDListAttributeObject.java
        PDPrintFieldAttributeObject.java
        PDStandardAttributeObject.java
        PDTableAttributeObject.java
        StandardStructureTypes.java
        edit
        PDPageContentStream.java
        encryption
        AccessPermission.java
        BadSecurityHandlerException.java
        DecryptionMaterial.java
        PDCryptFilterDictionary.java
        PDEncryptionDictionary.java
        PDEncryptionManager.java
        PDStandardEncryption.java
        ProtectionPolicy.java
        PublicKeyDecryptionMaterial.java
        PublicKeyProtectionPolicy.java
        PublicKeyRecipient.java
        PublicKeySecurityHandler.java
        SecurityHandler.java
        SecurityHandlersManager.java
        StandardDecryptionMaterial.java
        StandardProtectionPolicy.java
        StandardSecurityHandler.java
        fdf
        FDFAnnotation.java
        FDFAnnotationCaret.java
        FDFAnnotationCircle.java
        FDFAnnotationFileAttachment.java
        FDFAnnotationFreeText.java
        FDFAnnotationHighlight.java
        FDFAnnotationInk.java
        FDFAnnotationLine.java
        FDFAnnotationPolygon.java
        FDFAnnotationPolyline.java
        FDFAnnotationSound.java
        FDFAnnotationSquare.java
        FDFAnnotationSquiggly.java
        FDFAnnotationStamp.java
        FDFAnnotationStrikeOut.java
        FDFAnnotationText.java
        FDFAnnotationUnderline.java
        FDFCatalog.java
        FDFDictionary.java
        FDFDocument.java
        FDFField.java
        FDFIconFit.java
        FDFJavaScript.java
        FDFNamedPageReference.java
        FDFOptionElement.java
        FDFPage.java
        FDFPageInfo.java
        FDFTemplate.java
        font
        FontManager.java
        PDCIDFont.java
        PDCIDFontType0Font.java
        PDCIDFontType2Font.java
        PDFont.java
        PDFontDescriptor.java
        PDFontDescriptorAFM.java
        PDFontDescriptorDictionary.java
        PDFontFactory.java
        PDMMType1Font.java
        PDSimpleFont.java
        PDTrueTypeFont.java
        PDType0Font.java
        PDType1AfmPfbFont.java
        PDType1CFont.java
        PDType1Font.java
        PDType3Font.java
        Type3StreamParser.java
        graphics
        PDExtendedGraphicsState.java
        PDFontSetting.java
        PDGraphicsState.java
        PDLineDashPattern.java
        PDShading.java
        color
        ColorSpaceCMYK.java
        ColorSpaceCalRGB.java
        PDCalGray.java
        PDCalRGB.java
        PDColorSpace.java
        PDColorSpaceFactory.java
        PDColorState.java
        PDDeviceCMYK.java
        PDDeviceGray.java
        PDDeviceN.java
        PDDeviceNAttributes.java
        PDDeviceRGB.java
        PDGamma.java
        PDICCBased.java
        PDIndexed.java
        PDLab.java
        PDPattern.java
        PDSeparation.java
        PDTristimulus.java
        optionalcontent
        PDOptionalContentGroup.java
        PDOptionalContentProperties.java
        predictor
        Average.java
        None.java
        Optimum.java
        Paeth.java
        PredictorAlgorithm.java
        Sub.java
        Up.java
        xobject
        CompositeImage.java
        PDCcitt.java
        PDInlinedImage.java
        PDJpeg.java
        PDPixelMap.java
        PDXObject.java
        PDXObjectForm.java
        PDXObjectImage.java
        interactive
        action
        PDActionFactory.java
        PDAdditionalActions.java
        PDAnnotationAdditionalActions.java
        PDDocumentCatalogAdditionalActions.java
        PDFormFieldAdditionalActions.java
        PDPageAdditionalActions.java
        type
        PDAction.java
        PDActionGoTo.java
        PDActionJavaScript.java
        PDActionLaunch.java
        PDActionRemoteGoTo.java
        PDActionURI.java
        PDURIDictionary.java
        PDWindowsLaunchParams.java
        annotation
        PDAnnotation.java
        PDAnnotationFileAttachment.java
        PDAnnotationLine.java
        PDAnnotationLink.java
        PDAnnotationMarkup.java
        PDAnnotationPopup.java
        PDAnnotationRubberStamp.java
        PDAnnotationSquareCircle.java
        PDAnnotationText.java
        PDAnnotationTextMarkup.java
        PDAnnotationUnknown.java
        PDAnnotationWidget.java
        PDAppearanceCharacteristicsDictionary.java
        PDAppearanceDictionary.java
        PDAppearanceStream.java
        PDBorderEffectDictionary.java
        PDBorderStyleDictionary.java
        PDExternalDataDictionary.java
        digitalsignature
        PDSignature.java
        SignatureInterface.java
        SignatureOptions.java
        documentnavigation
        destination
        PDDestination.java
        PDNamedDestination.java
        PDPageDestination.java
        PDPageFitDestination.java
        PDPageFitHeightDestination.java
        PDPageFitRectangleDestination.java
        PDPageFitWidthDestination.java
        PDPageXYZDestination.java
        outline
        PDDocumentOutline.java
        PDOutlineItem.java
        PDOutlineNode.java
        form
        PDAcroForm.java
        PDAppearance.java
        PDCheckbox.java
        PDChoiceButton.java
        PDChoiceField.java
        PDField.java
        PDFieldFactory.java
        PDPushButton.java
        PDRadioCollection.java
        PDSignature.java
        PDSignatureField.java
        PDTextbox.java
        PDUnknownField.java
        PDVariableText.java
        PDXFA.java
        measurement
        PDMeasureDictionary.java
        PDNumberFormatDictionary.java
        PDRectlinearMeasureDictionary.java
        PDViewportDictionary.java
        pagenavigation
        PDThread.java
        PDThreadBead.java
        viewerpreferences
        PDViewerPreferences.java
        markedcontent
        PDPropertyList.java
        text
        PDTextState.java
        persistence
        util
        COSHEXTable.java
        COSObjectKey.java
        util
        BitFlagHelper.java
        DateConverter.java
        ErrorLogger.java
        ExtensionFileFilter.java
        ICU4JImpl.java
        ImageParameters.java
        LayerUtility.java
        MapUtil.java
        Matrix.java
        PDFCloneUtility.java
        PDFHighlighter.java
        PDFImageWriter.java
        PDFMarkedContentExtractor.java
        PDFMergerUtility.java
        PDFOperator.java
        PDFStreamEngine.java
        PDFText2HTML.java
        PDFTextStripper.java
        PDFTextStripperByArea.java
        PageExtractor.java
        PositionWrapper.java
        ResourceLoader.java
        Splitter.java
        StringUtil.java
        TextNormalize.java
        TextPosition.java
        TextPositionComparator.java
        XMLUtil.java
        operator
        BeginMarkedContentSequence.java
        BeginMarkedContentSequenceWithProperties.java
        BeginText.java
        CloseAndStrokePath.java
        Concatenate.java
        EndMarkedContentSequence.java
        EndText.java
        GRestore.java
        GSave.java
        Invoke.java
        MoveAndShow.java
        MoveText.java
        MoveTextSetLeading.java
        NextLine.java
        OperatorProcessor.java
        SetCharSpacing.java
        SetGraphicsStateParameters.java
        SetHorizontalTextScaling.java
        SetLineCapStyle.java
        SetLineDashPattern.java
        SetLineJoinStyle.java
        SetLineMiterLimit.java
        SetLineWidth.java
        SetMatrix.java
        SetMoveAndShow.java
        SetNonStrokingCMYKColor.java
        SetNonStrokingCalRGBColor.java
        SetNonStrokingColor.java
        SetNonStrokingColorSpace.java
        SetNonStrokingDeviceN.java
        SetNonStrokingGrayColor.java
        SetNonStrokingICCBasedColor.java
        SetNonStrokingRGBColor.java
        SetNonStrokingSeparation.java
        SetStrokingCMYKColor.java
        SetStrokingCalRGBColor.java
        SetStrokingColor.java
        SetStrokingColorSpace.java
        SetStrokingDeviceN.java
        SetStrokingGrayColor.java
        SetStrokingICCBasedColor.java
        SetStrokingRGBColor.java
        SetStrokingSeparation.java
        SetTextFont.java
        SetTextLeading.java
        SetTextRenderingMode.java
        SetTextRise.java
        SetWordSpacing.java
        ShowText.java
        ShowTextGlyph.java
        pagedrawer
        AppendRectangleToPath.java
        BeginInlineImage.java
        ClipEvenOddRule.java
        ClipNonZeroRule.java
        CloseFillEvenOddAndStrokePath.java
        CloseFillNonZeroAndStrokePath.java
        ClosePath.java
        CurveTo.java
        CurveToReplicateFinalPoint.java
        CurveToReplicateInitialPoint.java
        EndPath.java
        FillEvenOddAndStrokePath.java
        FillEvenOddRule.java
        FillNonZeroAndStrokePath.java
        FillNonZeroRule.java
        Invoke.java
        LineTo.java
        MoveTo.java
        SHFill.java
        SetLineCapStyle.java
        SetLineDashPattern.java
        SetLineJoinStyle.java
        SetLineMiterLimit.java
        SetLineWidth.java
        StrokePath.java

/*
 *  Copyright 2010 adam.
 * 
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 * 
 *       http://www.apache.org/licenses/LICENSE-2.0
 * 
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *  under the License.
 */

package org.apache.pdfbox.pdfparser;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.cos.COSFloat;
import org.apache.pdfbox.cos.COSInteger;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSNumber;
import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.cos.COSUnread;
import org.apache.pdfbox.io.RandomAccess;
import org.apache.pdfbox.io.RandomAccessFile;
import org.apache.pdfbox.pdmodel.ConformingPDDocument;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.common.XrefEntry;
import org.apache.pdfbox.persistence.util.COSObjectKey;

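/**
 * Parser for PDF files that works on a random-access file: it locates the
 * trailer by reading backwards from the end of the file, then reads the xref
 * table and the objects referenced from it.
 *
 * @author <a href="mailto:adam@apache.org">Adam Nichols</a>
 */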
public class ConformingPDFParser extends BaseParser {
    /** Random-access handle on the PDF file; the constructor opens it with mode "r". */
    protected RandomAccess inputFile;
    /** Entries read from the xref table by parseXrefTable(); getObject() indexes it by object number. */
    List<XrefEntry> xrefEntries;
    /** Byte position in inputFile that the read helpers seek to before each read. */
    private long currentOffset;
    /** PD-level document wrapping the COSDocument; stays null until parse() has been called. */
    private ConformingPDDocument doc = null;
    /** When true, structural mismatches found while parsing are reported by throwing an AssertionError. */
    private boolean throwNonConformingException = true;
    /** When true, readObject(long, long) looks in the document's object pool before reading from the file. */
    private boolean recursivlyRead = true;

    /**
     * Constructor.
     *
     * @param inputFile The file that contains the PDF document.
     *
     * @throws IOException If there is an error opening the file.
     */
    public ConformingPDFParser(File inputFile) throws IOException {
        // open the file with mode "r" and keep the handle for the read helpers
        RandomAccess source = new RandomAccessFile(inputFile, "r");
        this.inputFile = source;
    }

    /**
     * This will parse the file and populate the COSDocument object.  The input
     * file is not closed by this method.
     *
     * @throws IOException If there is an error reading from the file or corrupt data
     * is found.
     */
    public void parse() throws IOException {
        document = new COSDocument();
        doc = new ConformingPDDocument(document);

        // The trailer is located by reading backwards from the last byte of
        // the file; it tells us where the xref table starts.
        currentOffset = inputFile.length()-1;
        long xRefTableLocation = parseTrailerInformation();
        currentOffset = xRefTableLocation;
        parseXrefTable();

        // now that we read the xref table and put placeholder references in
        // the doc, we can dereference those objects.  The pool lookup in
        // readObject(long, long) is switched off while doing so, so that each
        // object is really read from the file.
        boolean oldValue = recursivlyRead;
        recursivlyRead = false;
        try {
            List<COSObjectKey> keys = doc.getObjectKeysFromPool();
            for(COSObjectKey key : keys) {
                // getObject will put it into the document's object pool for us
                getObject(key.getNumber(), key.getGeneration());
            }
        } finally {
            // restore the flag even if an object could not be read, otherwise
            // the parser would silently stay in the non-caching mode
            recursivlyRead = oldValue;
        }
    }

    /**
     * This will get the document that was parsed.  parse() must be called before this is called.
     * When you are done with this document you must call close() on it to release
     * resources.
     *
     * @return The document that was parsed.
     *
     * @throws IOException If there is an error getting the document.
     */
    public COSDocument getDocument() throws IOException {
        // document is only assigned once parse() has started
        if( document != null ) {
            return document;
        }
        throw new IOException( "You must call parse() before calling getDocument()" );
    }

    /**
     * This will get the PD document that was parsed.  When you are done with
     * this document you must call close() on it to release resources.
     *
     * @return The document at the PD layer.
     *
     * @throws IOException If there is an error getting the document.
     */
    public PDDocument getPDDocument() throws IOException {
        // doc is assigned by parse(); until then this hands back null
        final PDDocument parsedDocument = doc;
        return parsedDocument;
    }
    
    /**
     * Reads the xref table that starts at the current offset and stores its
     * entries in {@code xrefEntries}.  The table is expected to start with a
     * line holding the keyword {@code xref}, followed by the first object
     * number and the number of entries.  Each entry is read as two integers
     * plus the remainder of its line.  Only the one subsection that directly
     * follows the keyword is read.
     *
     * @return always {@code true}
     * @throws IOException if there is an error reading from the file
     */
    private boolean parseXrefTable() throws IOException {
        String keywordLine = readLine();
        if(throwNonConformingException && !"xref".equals(keywordLine)) {
            throw new AssertionError("xref table not found.\nExpected: xref\nFound: "+keywordLine);
        }

        int firstObjectNumber = readInt();
        int entryCount = readInt();
        xrefEntries = new ArrayList<XrefEntry>(entryCount);
        for(int index = 0; index < entryCount; index++) {
            // the three parts of an entry are read in the order they appear in the file
            int firstField = readInt();
            int secondField = readInt();
            String restOfLine = readLine();
            xrefEntries.add(new XrefEntry(firstObjectNumber + index, firstField, secondField, restOfLine));
        }

        return true;
    }

    /**
     * Reads the end of the file backwards, starting at the current offset:
     * the {@code %%EOF} marker, the xref location, the {@code startxref}
     * keyword, the trailer dictionary (which is stored in the document) and
     * finally the {@code trailer} keyword.  When
     * {@code throwNonConformingException} is set, a missing marker or keyword
     * results in an AssertionError.
     *
     * @return the location of the xref table as found before the EOF marker
     * @throws IOException if there is an error reading from the file
     * @throws NumberFormatException if the xref location is not a number
     */
    protected long parseTrailerInformation() throws IOException, NumberFormatException {
        consumeWhitespaceBackwards();
        String token = readLineBackwards();
        if(throwNonConformingException && !"%%EOF".equals(token)) {
            throw new AssertionError("Invalid EOF marker.\nExpected: %%EOF\nFound: "+token);
        }

        final long startOfXref = readLongBackwards();
        token = readLineBackwards();
        if(throwNonConformingException && !"startxref".equals(token)) {
            throw new AssertionError("Invalid trailer.\nExpected: startxref\nFound: "+token);
        }

        // the dictionary sits between the "trailer" and "startxref" keywords
        document.setTrailer(readDictionaryBackwards());
        consumeWhitespaceBackwards();
        token = readLineBackwards();
        if(throwNonConformingException && !"trailer".equals(token)) {
            throw new AssertionError("Invalid trailer.\nExpected: trailer\nFound: "+token);
        }

        return startOfXref;
    }
    
    /**
     * Reads the byte at the current offset and then moves the current offset
     * one byte towards the start of the file.
     * NOTE(review): the value returned by read() is narrowed to a byte, so an
     * end-of-file indicator of -1 (if that is what read() returns) can not be
     * told apart from the byte 0xFF - confirm against RandomAccess.
     *
     * @return the byte which was read
     * @throws IOException if there is an error reading from the file
     */
    protected byte readByteBackwards() throws IOException {
        inputFile.seek(currentOffset);
        int value = inputFile.read();
        // only step back once the read has succeeded
        currentOffset -= 1;
        return (byte)value;
    }

    /**
     * Reads the byte at the current offset and then moves the current offset
     * one byte towards the end of the file.
     * NOTE(review): the value returned by read() is narrowed to a byte, so an
     * end-of-file indicator of -1 (if that is what read() returns) can not be
     * told apart from the byte 0xFF - confirm against RandomAccess.
     *
     * @return the byte which was read
     * @throws IOException if there is an error reading from the file
     */
    protected byte readByte() throws IOException {
        inputFile.seek(currentOffset);
        int value = inputFile.read();
        // only step forward once the read has succeeded
        currentOffset += 1;
        return (byte)value;
    }

    /**
     * Reads bytes backwards from the current offset until a whitespace byte
     * is read.  The whitespace byte itself is consumed but not returned.
     *
     * @return the bytes read before the whitespace, in file order; empty if
     * the first byte read is already whitespace
     * @throws IOException if there is an error reading from the file
     */
    protected String readBackwardUntilWhitespace() throws IOException {
        StringBuilder token = new StringBuilder();
        for(byte current = readByteBackwards(); !isWhitespace(current); current = readByteBackwards()) {
            // bytes arrive last-to-first, so each one goes in front
            token.insert(0, (char)current);
        }
        return token.toString();
    }

    /**
     * This will read all bytes (backwards) until a non-whitespace character is
     * found.  To save you an extra read, the non-whitespace character is
     * returned.  If the current character is not whitespace, this method will
     * just return the current char.
     * @return the first non-whitespace character found
     * @throws IOException if there is an error reading from the file
     */
    protected byte consumeWhitespaceBackwards() throws IOException {
        // peek at the current byte without moving the offset
        inputFile.seek(currentOffset);
        byte current = (byte)inputFile.read();

        if(isWhitespace(current)) {
            // walk backwards until something other than whitespace shows up
            do {
                current = readByteBackwards();
            } while(isWhitespace(current));
            // readByteBackwards has already stepped past the byte it returned;
            // move forward again so the offset is on that byte
            currentOffset++;
        }
        return current;
    }

    /**
     * This will read all bytes until a non-whitespace character is
     * found.  To save you an extra read, the non-whitespace character is
     * returned.  If the current character is not whitespace, this method will
     * just return the current char.
     * @return the first non-whitespace character found
     * @throws IOException if there is an error reading from the file
     */
    protected byte consumeWhitespace() throws IOException {
        // peek at the current byte without moving the offset
        inputFile.seek(currentOffset);
        byte current = (byte)inputFile.read();

        if(isWhitespace(current)) {
            // walk forwards until something other than whitespace shows up
            do {
                current = readByte();
            } while(isWhitespace(current));
            // readByte has already stepped past the byte it returned;
            // move back again so the offset is on that byte
            currentOffset--;
        }
        return current;
    }

    /**
     * This will consume any whitespace, read in bytes (backwards) until
     * whitespace is found again and then parse the characters which have been
     * read as a long.  The whitespace character which precedes the number is
     * consumed as well, so the current offset will then point at the byte just
     * before it.
     * @return the parsed number
     * @throws IOException if there is an error reading from the file
     * @throws NumberFormatException if the bytes read can not be converted to a number
     */
    protected long readLongBackwards() throws IOException, NumberFormatException {
        consumeWhitespaceBackwards();

        StringBuilder digits = new StringBuilder();
        for(byte current = readByteBackwards(); !isWhitespace(current); current = readByteBackwards()) {
            // reading backwards, so every new character goes in front
            digits.insert(0, (char)current);
        }

        if(digits.length() > 0) {
            return Long.parseLong(digits.toString());
        }
        throw new AssertionError("Number not found.  Expected number at offset: " + currentOffset);
    }

    /**
     * Skips any whitespace, then reads bytes up to the next whitespace byte
     * and parses them as an int.  The terminating whitespace byte is consumed.
     *
     * @return the parsed number
     * @throws IOException if there is an error reading from the file
     */
    @Override
    protected int readInt() throws IOException {
        consumeWhitespace();

        StringBuilder digits = new StringBuilder();
        for(byte current = readByte(); !isWhitespace(current); current = readByte()) {
            digits.append((char)current);
        }

        if(digits.length() > 0) {
            return Integer.parseInt(digits.toString());
        }
        throw new AssertionError("Number not found.  Expected number at offset: " + currentOffset);
    }

    /**
     * This will read in a number and return the COS version of the number (be
     * it a COSInteger or a COSFloat).
     * @return the COSNumber which was read/parsed
     * @throws IOException if there is an error reading from the file
     */
    protected COSNumber readNumber() throws IOException {
        consumeWhitespace();

        // collect everything up to (and consuming) the next whitespace byte
        StringBuilder text = new StringBuilder();
        for(byte current = readByte(); !isWhitespace(current); current = readByte()) {
            text.append((char)current);
        }

        if(text.length() > 0) {
            return parseNumber(text.toString());
        }
        throw new AssertionError("Number not found.  Expected number at offset: " + currentOffset);
    }

    /**
     * Converts the textual form of a number into its COS representation.
     * Text made up of digits only, optionally preceded by a minus sign, is
     * treated as an integer; anything else is parsed as a float.
     *
     * @param number the text to convert
     * @return the COSNumber for the given text
     * @throws IOException declared for the COS factory call
     * @throws NumberFormatException if the text is neither an integer nor a
     * float accepted by {@link Float#parseFloat(String)}
     */
    protected COSNumber parseNumber(String number) throws IOException {
        // A leading '-' must be accepted here: negative numbers are routed to
        // this method too, and without it "-5" would be turned into a float.
        if(number.matches("^-?[0-9]+$"))
            return COSInteger.get(number);
        return new COSFloat(Float.parseFloat(number));
    }

    /**
     * Turns a token that ends with {@code >} into a COSString by treating it
     * as a string of hex codes.
     *
     * @param string the token to convert, may be {@code null}
     * @return the COSString, or {@code null} if the token is {@code null} or
     * does not end with {@code >}
     * @throws IOException declared for the hex string conversion
     */
    protected COSBase processCosObject(String string) throws IOException {
        if(string == null || !string.endsWith(">")) {
            return null;
        }
        // drop the enclosing angle brackets, what is left are the hex codes
        String hexCodes = string.replaceAll("^<", "");
        hexCodes = hexCodes.replaceAll(">$", "");
        return COSString.createFromHexString(hexCodes);
    }

    /**
     * Reads the object that ends at the current offset, scanning backwards.
     * The last whitespace-delimited token decides how the object is handled:
     * <ul>
     * <li>{@code R}: an indirect reference; generation and object number are
     * read backwards, a placeholder is put into the document's object pool
     * and a COSUnread for the reference is returned</li>
     * <li>{@code >>}: a dictionary, which is not implemented</li>
     * <li>a token ending with {@code ]}: an array; only hex string elements
     * are collected, anything else in the array is skipped</li>
     * <li>a token ending with {@code >}: a string of hex codes</li>
     * <li>anything else must be parsable as a long and is returned as a
     * COSNumber</li>
     * </ul>
     *
     * @return the object which was read
     * @throws IOException if there is an error reading from the file
     * @throws RuntimeException if the object is of a kind which is not yet
     * implemented
     */
    protected COSBase readObjectBackwards() throws IOException {
        COSBase obj = null;
        consumeWhitespaceBackwards();
        String lastSection = readBackwardUntilWhitespace();
        if("R".equals(lastSection)) {
            // indirect reference "<number> <generation> R", met in reverse order
            long gen = readLongBackwards();
            long number = readLongBackwards();
            // We just put a placeholder in the pool for now, we'll read the data later
            doc.putObjectInPool(new COSUnread(), number, gen);
            obj = new COSUnread(number, gen, this);
        } else if(">>".equals(lastSection)) {
            // dictionary
            throw new RuntimeException("Not yet implemented");
        } else if(lastSection != null && lastSection.endsWith("]")) {
            // array: the elements are met last-to-first, so they are collected
            // first and added to the COSArray in file order afterwards
            List<COSBase> lastToFirst = new ArrayList<COSBase>();
            lastSection = lastSection.replaceAll("]$", "");
            while(!lastSection.startsWith("[")) {
                if(lastSection.matches("^\\s*<.*>\\s*$")) // it's a hex string
                    lastToFirst.add(COSString.createFromHexString(lastSection.replaceAll("^\\s*<", "").replaceAll(">\\s*$", "")));
                lastSection = readBackwardUntilWhitespace();
            }
            // the token carrying the opening bracket may hold the first element
            lastSection = lastSection.replaceAll("^\\[", "");
            if(lastSection.matches("^\\s*<.*>\\s*$")) // it's a hex string
                lastToFirst.add(COSString.createFromHexString(lastSection.replaceAll("^\\s*<", "").replaceAll(">\\s*$", "")));

            COSArray array = new COSArray();
            for(int i = lastToFirst.size() - 1; i >= 0; i--)
                array.add(lastToFirst.get(i));
            obj = array;
        } else if(lastSection != null && lastSection.endsWith(">")) {
            // string of hex codes
            obj = processCosObject(lastSection);
        } else {
            // only whole numbers are supported here; everything else is unimplemented
            try {
                Long.parseLong(lastSection);
                obj = COSNumber.get(lastSection);
            } catch(NumberFormatException e) {
                throw new RuntimeException("Not yet implemented");
            }
        }

        return obj;
    }

    /**
     * Reads a name backwards from the current offset, up to the preceding
     * whitespace, and returns it without its leading slash.
     *
     * @return the COSName for the token which was read
     * @throws IOException if there is an error reading from the file
     */
    protected COSName readNameBackwards() throws IOException {
        String withoutSlash = readBackwardUntilWhitespace().replaceAll("^/", "");
        return COSName.getPDFName(withoutSlash);
    }

    /**
     * Moves to the byte offset recorded in the xref table for the given
     * object and reads the object from there.  The object number is used
     * directly as the index into the list of xref entries.
     *
     * @param objectNumber the number of the object to read
     * @param generation the generation the object is expected to have
     * @return the object which was read
     * @throws IOException if there is an error reading from the file
     */
    public COSBase getObject(long objectNumber, long generation) throws IOException {
        // There is deliberately no check that parse() has been called; the
        // xref entries are simply used as they are.
        int tableIndex = (int)objectNumber;
        currentOffset = xrefEntries.get(tableIndex).getByteOffset();
        return readObject(objectNumber, generation);
    }

    /**
     * This will read an object from the inputFile at whatever our currentOffset
     * is.  If the object and generation are not the expected values and this
     * object is set to throw an exception for non-conforming documents, then an
     * exception will be thrown.
     * @param objectNumber the object number you expect to read
     * @param generation the generation you expect this object to be
     * @return the object which was read
     * @throws IOException if there is an error reading from the file
     */
    public COSBase readObject(long objectNumber, long generation) throws IOException {
        // While recursive reading is switched on, an object which is already
        // in the document's pool is returned without hitting the filesystem.
        if(document != null && recursivlyRead) {
            COSBase pooled = doc.getObjectFromPool(objectNumber, generation);
            if(pooled != null) {
                return pooled;
            }
        }

        // the object starts with "<number> <generation> obj"
        int numberInFile = readInt();
        if(throwNonConformingException && objectNumber != numberInFile) {
            throw new AssertionError("Object numer expected was " +
                    objectNumber + " but actual was " + numberInFile);
        }
        consumeWhitespace();

        int generationInFile = readInt();
        if(throwNonConformingException && generation != generationInFile) {
            throw new AssertionError("Generation expected was " +
                    generation + " but actual was " + generationInFile);
        }
        consumeWhitespace();

        String keyword = readWord();
        if(throwNonConformingException && !"obj".equals(keyword)) {
            throw new AssertionError("Expected keyword 'obj' but found " + keyword);
        }

        // A placeholder goes into the pool first to prevent infinite recursion,
        // e.g. read Root -> dereference object -> read object which has /Parent -> read Root
        doc.putObjectInPool(new COSObject(null), objectNumber, generation);
        COSBase content = readObject();
        // replace the placeholder with what was actually read
        doc.putObjectInPool(content, objectNumber, generation);
        return content;
    }

    /**
     * Reads a single object starting at the current offset.  The kind of
     * object is decided from the first word returned by {@link #readWord()}:
     * <ul>
     *   <li>{@code <<} - a dictionary, read as name/value pairs up to {@code >>}</li>
     *   <li>{@code /} - a name</li>
     *   <li>{@code -} or a digit - a number, or an indirect reference of the
     *       form {@code "3 0 R"}</li>
     *   <li>{@code [} - an array, read element by element</li>
     *   <li>{@code ]} - end of an array; {@code null} is returned</li>
     *   <li>{@code (} - a literal string</li>
     * </ul>
     * Any other leading token results in a {@link RuntimeException}.
     *
     * @return the object which is read, or {@code null} when the end of an
     *         array was reached
     * @throws IOException if there was an error reading data from the file
     */
    protected COSBase readObject() throws IOException {
        consumeWhitespace();
        String string = readWord();
        if(string.startsWith("<<")) {
            // this is a dictionary
            COSDictionary dictionary = new COSDictionary();
            boolean atEndOfDictionary = false;
            // remove the marker for the beginning of the dictionary
            string = string.replaceAll("^<<", "");
            
            // nothing usable followed the "<<" in the same word (empty, or a
            // single word character), so the first key is the next word
            // NOTE(review): the reason for the single-character case is not
            // evident from this method - confirm before changing
            if("".equals(string) || string.matches("^\\w$"))
                string = readWord().trim();
            while(!atEndOfDictionary) {
                // 'string' holds the key; the value is read recursively
                COSName name = COSName.getPDFName(string);
                COSBase object = readObject();
                dictionary.setItem(name, object);

                // a '>' after the value marks the closing ">>"
                byte singleByte = consumeWhitespace();
                if(singleByte == '>') {
                    readByte(); // get rid of the second '>'
                    atEndOfDictionary = true;
                }
                if(!atEndOfDictionary)
                    string = readWord().trim();
            }
            return dictionary;
        } else if(string.startsWith("/")) {
            // it's a dictionary label. i.e. /Type or /Pages or something similar
            COSBase name = COSName.getPDFName(string);
            return name;
        } else if(string.startsWith("-")) {
            // it's a negative number
            return parseNumber(string);
        // NOTE(review): charAt(0) throws if readWord() returned an empty
        // string - presumably prevented by the consumeWhitespace() call
        // above; confirm
        } else if(string.charAt(0) >= '0' && string.charAt(0) <= '9' ) {
            // it's a COSInt or COSFloat, or a weak reference (i.e. "3 0 R")
            // we'll have to peek ahead a little to see if it's a reference or not
            // remember where we are so the peeked words can be "un-read"
            long tempOffset = this.currentOffset;
            consumeWhitespace();
            String tempString = readWord();
            if(tempString.matches("^[0-9]+$")) {
                // it is an int, might be a weak reference...
                tempString = readWord();
                if(!"R".equals(tempString)) {
                    // it's just a number, not a weak reference
                    this.currentOffset = tempOffset;
                    return parseNumber(string);
                }
            } else {
                // it's just a number, not a weak reference
                this.currentOffset = tempOffset;
                return parseNumber(string);
            }

            // the peek found "<int> R" after the first number, so this is a
            // weak reference: rewind and parse generation and keyword for real
            this.currentOffset = tempOffset;
            int number = Integer.parseInt(string);
            int gen = readInt();
            String r = readWord();

            if(!"R".equals(r))
                if(throwNonConformingException)
                    throw new AssertionError("Expected keyword 'R' but found " + r);

            if(recursivlyRead) {
                // seek to the object, read it, seek back to current location
                long tempLocation = this.currentOffset;
                this.currentOffset = this.xrefEntries.get(number).getByteOffset();
                COSBase returnValue = readObject(number, gen);
                this.currentOffset = tempLocation;
                return returnValue;
            } else {
                // not resolving references now: return a COSObject wrapping a
                // COSUnread placeholder, tagged with the referenced object
                // and generation numbers
                COSObject obj = new COSObject(new COSUnread());
                obj.setObjectNumber(COSInteger.get(number));
                obj.setGenerationNumber(COSInteger.get(gen));
                return obj;
            }
        } else if(string.startsWith("]")) {
            // end of an array, just return null
            if("]".equals(string))
                return null;
            // the word contains more than just "]": move the offset back by
            // the length of the word before returning
            // NOTE(review): presumably so the text after ']' is read again by
            // the caller - confirm against readWord()/readByte()
            int oldLength = string.length();
            this.currentOffset -= oldLength;
            return null;
        } else if(string.startsWith("[")) {
            // array of values
            // we'll just pay attention to the first part (this is in case there
            // is no whitespace between the "[" and the first element)
            int oldLength = string.length();
            string = "[";
            // string.length() is now 1, so this moves the offset back by oldLength
            this.currentOffset -= (oldLength - string.length() + 1);

            // read elements until readObject() signals the closing ']' with null
            COSArray array = new COSArray();
            COSBase object = readObject();
            while(object != null) {
                array.add(object);
                object = readObject();
            }
            return array;
        } else if(string.startsWith("(")) {
            // this is a string (not hex encoded), strip off the '(' and read until ')'
            // NOTE(review): the loop stops at the first ')' byte; nested
            // parentheses and backslash escapes are not handled here
            StringBuilder sb = new StringBuilder(string.substring(1));
            byte singleByte = readByte();
            while(singleByte != ')') {
                sb.append((char)singleByte);
                singleByte = readByte();
            }
            return new COSString(sb.toString());
        } else {
            // any other leading token is not supported by this method
            throw new RuntimeException("Not yet implemented: " + string
                    + " loation=" + this.currentOffset);
        }
    }

    /**
     * Reads the next token from {@code pdfSource}.  Leading whitespace is
     * consumed first; characters are then collected until one is reported by
     * {@code isEndOfName} or {@code isClosing}, or until the end of the
     * stream.  A terminating character (other than end of stream) is pushed
     * back onto the source so the next read sees it again.
     *
     * @return The string that was read from the stream.
     * @throws IOException If there is an error reading from the stream.
     */
    @Override
    protected String readString() throws IOException {
        consumeWhitespace();
        StringBuilder token = new StringBuilder();
        int next;
        while(true) {
            next = pdfSource.read();
            if(isEndOfName((char)next) || isClosing(next) || next == -1) {
                break;
            }
            token.append( (char)next );
        }
        // -1 means end of stream; there is nothing to push back in that case
        if (next != -1) {
            pdfSource.unread(next);
        }
        return token.toString();
    }

    /**
     * Reads a dictionary while moving backwards through the file.  Reading
     * starts at the dictionary's closing {@code >>}; entries are then read
     * last-to-first as value followed by name until the opening {@code <<}
     * is found, which is consumed as well.
     *
     * The returned dictionary has its keys inserted in the reverse of the
     * order in which they were read, i.e. in the order they appear in the
     * file.
     *
     * @return the dictionary that was read
     * @throws IOException if there was an error reading data from the file
     * @throws AssertionError if {@code throwNonConformingException} is set
     *         and the dictionary does not end with {@code >>}
     */
    protected COSDictionary readDictionaryBackwards() throws IOException {
        COSDictionary dict = new COSDictionary();

        // consume the last two '>' chars which signify the end of the dictionary
        consumeWhitespaceBackwards();
        byte singleByte = readByteBackwards();
        if(throwNonConformingException) {
            if(singleByte != '>')
                throw new AssertionError("Expected '>' at end of dictionary but found '"
                        + (char)singleByte + "' near offset " + currentOffset);
        }
        singleByte = readByteBackwards();
        if(throwNonConformingException) {
            if(singleByte != '>')
                throw new AssertionError("Expected '>>' at end of dictionary but found '"
                        + (char)singleByte + "' near offset " + currentOffset);
        }

        // check to see if we're already at the opening "<<" (empty dictionary):
        // a '<' only counts when the byte before it is a '<' as well
        boolean atEndOfDictionary = false;
        singleByte = consumeWhitespaceBackwards();
        if(singleByte == '<') {
            inputFile.seek(currentOffset-1);
            atEndOfDictionary =  ((byte)inputFile.read()) == '<';
        }

        COSDictionary backwardsDictionary = new COSDictionary();
        // while we're not at the start of the dictionary, read in entries;
        // going backwards the value comes first, then its name
        while(!atEndOfDictionary) {
            COSBase object = readObjectBackwards();
            COSName name = readNameBackwards();
            backwardsDictionary.setItem(name, object);

            singleByte = consumeWhitespaceBackwards();
            if(singleByte == '<') {
                inputFile.seek(currentOffset-1);
                atEndOfDictionary =  ((byte)inputFile.read()) == '<';
            }
        }

        // the dictionaries preserve the order keys were added, as such we shall
        // add them in the proper order, not the reverse order.  The key set is
        // copied to an array once instead of once (twice) per iteration.
        COSName[] backwardsKeys = backwardsDictionary.keySet().toArray(new COSName[0]);
        for(int i = backwardsKeys.length - 1; i >= 0; i--) {
            COSName key = backwardsKeys[i];
            dict.setItem(key, backwardsDictionary.getItem(key));
        }

        // consume the two '<' chars that open the dictionary
        readByteBackwards();
        readByteBackwards();

        return dict;
    }

    /**
     * Reads one line of text while moving backwards through the file.  Bytes
     * are fetched with {@code readByteBackwards()} and prepended to the
     * result until a {@code '\n'} or {@code '\r'} is met; the line terminator
     * itself is not part of the result.  When the terminator is {@code '\n'}
     * and the byte at the resulting offset is {@code '\r'}, that byte is
     * skipped too, so a CR LF pair counts as one terminator.
     *
     * This should only be used if we are certain that the data will only be
     * text, and not binary data.
     *
     * @return the string which was read
     * @throws IOException if there was an error reading data from the file
     */
    protected String readLineBackwards() throws IOException {
        StringBuilder line = new StringBuilder();

        while(true) {
            byte current = readByteBackwards();
            if(current == '\r') {
                break;
            }
            if(current == '\n') {
                // a '\r' in front of the '\n' belongs to the same terminator
                inputFile.seek(currentOffset);
                if((byte)inputFile.read() == '\r') {
                    currentOffset--;
                }
                break;
            }
            // we are walking backwards, so every byte goes to the front
            line.insert(0, (char)current);
        }

        return line.toString();
    }

    /**
     * Reads one line of text going forward from the current offset.  Bytes
     * are fetched with {@code readByte()} and appended to the result until an
     * end-of-line marker is met; the marker is not part of the result.
     *
     * A {@code '\r'} immediately followed by {@code '\n'} (CR LF) is consumed
     * as a single marker, so the next call does not return a spurious empty
     * line.  A lone {@code '\r'} or a lone {@code '\n'} also ends the line.
     *
     * This should only be used if we are certain that the data will only be
     * text, and not binary data.
     *
     * @return the string which was read
     * @throws IOException if there was an error reading data from the file
     */
    @Override
    protected String readLine() throws IOException {
        StringBuilder sb = new StringBuilder();
        boolean endOfLine = false;

        do {
            byte singleByte = readByte();
            if(singleByte == '\r') {
                // CR LF: peek at the next byte and swallow the '\n' so both
                // bytes are treated as one end-of-line marker
                inputFile.seek(currentOffset);
                if((byte)inputFile.read() == '\n')
                    currentOffset++;
                endOfLine = true;
            } else if(singleByte == '\n') {
                // kept for backward compatibility: a '\r' directly after the
                // '\n' is swallowed as part of the same marker
                inputFile.seek(currentOffset);
                if((byte)inputFile.read() == '\r')
                    currentOffset++;
                endOfLine = true;
            } else {
                sb.append((char)singleByte);
            }
        } while(!endOfLine);

        return sb.toString();
    }

    /**
     * Reads the next word starting at the current offset.  Bytes are
     * collected until a whitespace byte is read (which is dropped), or until
     * a byte that starts the next token is met after at least one byte has
     * been collected: {@code '/'}, {@code '['}, {@code ']'}, or a {@code '>'}
     * unless the word so far is exactly {@code ">"} (so {@code ">>"} stays
     * together).  In the latter case {@code currentOffset} is moved back by
     * one so that the byte is read again by the next call.
     *
     * @return the word which was read; empty if the first byte was whitespace
     * @throws IOException if there was an error reading data from the file
     */
    protected String readWord() throws IOException {
        StringBuilder word = new StringBuilder();
        while(true) {
            byte current = readByte();
            if(this.isWhitespace(current)) {
                break;
            }

            // the very first byte always belongs to this word, even if it is
            // one of the delimiter characters checked below
            if(word.length() > 0) {
                boolean startsNextToken = current == '/' || current == '['
                        || current == ']'
                        || (current == '>' && !">".equals(word.toString()));
                if(startsNextToken) {
                    // step back so the delimiter is not lost
                    this.currentOffset--;
                    break;
                }
            }

            word.append((char)current);
        }

        return word.toString();
    }

    /**
     * Tells whether indirect references are resolved while parsing.  When the
     * flag is set, {@code readObject()} seeks to a referenced object and
     * reads it immediately; otherwise it returns a {@code COSObject}
     * wrapping a {@code COSUnread} placeholder.
     *
     * @return the current value of the recursivlyRead flag
     */
    public boolean isRecursivlyRead() {
        return this.recursivlyRead;
    }

    /**
     * Sets whether indirect references are resolved while parsing.  When the
     * flag is set, {@code readObject()} seeks to a referenced object and
     * reads it immediately; otherwise it returns a {@code COSObject}
     * wrapping a {@code COSUnread} placeholder.
     *
     * @param recursivlyRead the new value of the recursivlyRead flag
     */
    public void setRecursivlyRead(boolean recursivlyRead) {
        this.recursivlyRead = recursivlyRead;
    }
}