TaggedPdfReaderTool.java example

Explorer

wgen-iText-master
- src
  - core
    - com
      - itextpdf
        text
        Anchor.java
        Annotation.java
        BadElementException.java
        BaseColor.java
        Chapter.java
        ChapterAutoNumber.java
        Chunk.java
        DocListener.java
        DocWriter.java
        Document.java
        DocumentException.java
        Element.java
        ElementListener.java
        ElementTags.java
        ExceptionConverter.java
        Font.java
        FontFactory.java
        FontFactoryImp.java
        FontProvider.java
        GreekList.java
        Header.java
        Image.java
        ImgCCITT.java
        ImgJBIG2.java
        ImgRaw.java
        ImgTemplate.java
        ImgWMF.java
        Jpeg.java
        Jpeg2000.java
        LargeElement.java
        List.java
        ListItem.java
        MarkedObject.java
        MarkedSection.java
        Meta.java
        PageSize.java
        Paragraph.java
        Phrase.java
        Rectangle.java
        RectangleReadOnly.java
        RomanList.java
        RtfElementInterface.java
        Section.java
        SpecialSymbol.java
        SplitCharacter.java
        TextElementArray.java
        Utilities.java
        ZapfDingbatsList.java
        ZapfDingbatsNumberList.java
        error_messages
        MessageLocalization.java
        exceptions
        BadPasswordException.java
        IllegalPdfSyntaxException.java
        InvalidPdfException.java
        UnsupportedPdfException.java
        factories
        ElementFactory.java
        GreekAlphabetFactory.java
        RomanAlphabetFactory.java
        RomanNumberFactory.java
        html
        HtmlEncoder.java
        HtmlTags.java
        Markup.java
        WebColors.java
        simpleparser
        ALink.java
        ChainedProperties.java
        FactoryProperties.java
        HTMLWorker.java
        ImageProvider.java
        Img.java
        IncCell.java
        IncTable.java
        StyleSheet.java
        pdf
        AcroFields.java
        ArabicLigaturizer.java
        AsianFontMapper.java
        BadPdfFormatException.java
        Barcode.java
        Barcode128.java
        Barcode39.java
        BarcodeCodabar.java
        BarcodeDatamatrix.java
        BarcodeEAN.java
        BarcodeEANSUPP.java
        BarcodeInter25.java
        BarcodePDF417.java
        BarcodePostnet.java
        BarcodeQRCode.java
        BaseField.java
        BaseFont.java
        BidiLine.java
        BidiOrder.java
        ByteBuffer.java
        CFFFont.java
        CFFFontSubset.java
        CJKFont.java
        CMYKColor.java
        CMapAwareDocumentFont.java
        ColorDetails.java
        ColumnText.java
        DefaultFontMapper.java
        DefaultSplitCharacter.java
        DocumentFont.java
        EnumerateTTC.java
        ExtendedColor.java
        ExtraEncoding.java
        FdfReader.java
        FdfWriter.java
        FontDetails.java
        FontMapper.java
        FontSelector.java
        GlyphList.java
        GrayColor.java
        HyphenationAuto.java
        HyphenationEvent.java
        ICC_Profile.java
        IntHashtable.java
        LZWDecoder.java
        MappedRandomAccessFile.java
        MultiColumnText.java
        OcspClient.java
        OcspClientBouncyCastle.java
        OutputStreamCounter.java
        OutputStreamEncryption.java
        PRAcroForm.java
        PRIndirectReference.java
        PRStream.java
        PRTokeniser.java
        PageResources.java
        PatternColor.java
        PdfAcroForm.java
        PdfAction.java
        PdfAnnotation.java
        PdfAppearance.java
        PdfArray.java
        PdfBoolean.java
        PdfBorderArray.java
        PdfBorderDictionary.java
        PdfChunk.java
        PdfColor.java
        PdfContentByte.java
        PdfContentParser.java
        PdfContents.java
        PdfCopy.java
        PdfCopyFields.java
        PdfCopyFieldsImp.java
        PdfCopyForms.java
        PdfCopyFormsImp.java
        PdfDashPattern.java
        PdfDate.java
        PdfDestination.java
        PdfDeveloperExtension.java
        PdfDictionary.java
        PdfDocument.java
        PdfEFStream.java
        PdfEncodings.java
        PdfEncryption.java
        PdfEncryptor.java
        PdfException.java
        PdfFileSpecification.java
        PdfFont.java
        PdfFormField.java
        PdfFormXObject.java
        PdfFunction.java
        PdfGState.java
        PdfGraphics2D.java
        PdfICCBased.java
        PdfImage.java
        PdfImportedPage.java
        PdfIndirectObject.java
        PdfIndirectReference.java
        PdfLayer.java
        PdfLayerMembership.java
        PdfLine.java
        PdfLister.java
        PdfLiteral.java
        PdfMediaClipData.java
        PdfName.java
        PdfNameTree.java
        PdfNull.java
        PdfNumber.java
        PdfNumberTree.java
        PdfOCG.java
        PdfOCProperties.java
        PdfObject.java
        PdfOutline.java
        PdfPCell.java
        PdfPCellEvent.java
        PdfPKCS7.java
        PdfPRow.java
        PdfPSXObject.java
        PdfPTable.java
        PdfPTableEvent.java
        PdfPTableEventSplit.java
        PdfPage.java
        PdfPageElement.java
        PdfPageEvent.java
        PdfPageEventHelper.java
        PdfPageLabels.java
        PdfPages.java
        PdfPattern.java
        PdfPatternPainter.java
        PdfPrinterGraphics2D.java
        PdfPublicKeyRecipient.java
        PdfPublicKeySecurityHandler.java
        PdfReader.java
        PdfReaderInstance.java
        PdfRectangle.java
        PdfRendition.java
        PdfResources.java
        PdfShading.java
        PdfShadingPattern.java
        PdfSigGenericPKCS.java
        PdfSignature.java
        PdfSignatureAppearance.java
        PdfSmartCopy.java
        PdfSpotColor.java
        PdfStamper.java
        PdfStamperImp.java
        PdfStream.java
        PdfString.java
        PdfStructureElement.java
        PdfStructureTreeRoot.java
        PdfTemplate.java
        PdfTextArray.java
        PdfTransition.java
        PdfTransparencyGroup.java
        PdfVisibilityExpression.java
        PdfWriter.java
        PdfXConformanceException.java
        Pfm2afm.java
        PushbuttonField.java
        RadioCheckField.java
        RandomAccessFileOrArray.java
        SequenceList.java
        ShadingColor.java
        SimpleBookmark.java
        SimpleNamedDestination.java
        SpotColor.java
        StampContent.java
        StandardDecryption.java
        TSAClient.java
        TSAClientBouncyCastle.java
        TextField.java
        TrueTypeFont.java
        TrueTypeFontSubSet.java
        TrueTypeFontUnicode.java
        Type1Font.java
        Type3Font.java
        Type3Glyph.java
        VerticalText.java
        XfaForm.java
        XfdfReader.java
        codec
        Base64.java
        BmpImage.java
        CCITTG4Encoder.java
        GifImage.java
        JBIG2Image.java
        JBIG2SegmentReader.java
        PngImage.java
        TIFFConstants.java
        TIFFDirectory.java
        TIFFFaxDecoder.java
        TIFFField.java
        TIFFLZWDecoder.java
        TiffImage.java
        wmf
        InputMeta.java
        MetaBrush.java
        MetaDo.java
        MetaFont.java
        MetaObject.java
        MetaPen.java
        MetaState.java
        Point.java
        collection
        PdfCollection.java
        PdfCollectionField.java
        PdfCollectionItem.java
        PdfCollectionSchema.java
        PdfCollectionSort.java
        PdfTargetDictionary.java
        crypto
        AESCipher.java
        ARCFOUREncryption.java
        IVGenerator.java
        draw
        DottedLineSeparator.java
        DrawInterface.java
        LineSeparator.java
        VerticalPositionMark.java
        events
        FieldPositioningEvents.java
        IndexEvents.java
        PdfPCellEventForwarder.java
        PdfPTableEventForwarder.java
        PdfPageEventForwarder.java
        fonts
        FontsResourceAnchor.java
        cmaps
        CMap.java
        CMapParser.java
        CodespaceRange.java
        hyphenation
        ByteVector.java
        CharVector.java
        Hyphen.java
        Hyphenation.java
        HyphenationException.java
        HyphenationTree.java
        Hyphenator.java
        PatternConsumer.java
        SimplePatternParser.java
        TernaryTree.java
        interfaces
        PdfAnnotations.java
        PdfDocumentActions.java
        PdfEncryptionSettings.java
        PdfPageActions.java
        PdfRunDirection.java
        PdfVersion.java
        PdfViewerPreferences.java
        PdfXConformance.java
        internal
        PdfAnnotationsImp.java
        PdfVersionImp.java
        PdfViewerPreferencesImp.java
        PdfXConformanceImp.java
        PolylineShape.java
        PolylineShapeIterator.java
        parser
        ContentByteUtils.java
        ContentOperator.java
        FilteredRenderListener.java
        FilteredTextRenderListener.java
        GraphicsState.java
        ImageRenderInfo.java
        LineSegment.java
        LocationTextExtractionStrategy.java
        MarkedContentInfo.java
        MarkedContentRenderFilter.java
        Matrix.java
        PdfContentReaderTool.java
        PdfContentStreamProcessor.java
        PdfImageObject.java
        PdfReaderContentParser.java
        PdfTextExtractor.java
        RegionTextRenderFilter.java
        RenderFilter.java
        RenderListener.java
        SimpleTextExtractionStrategy.java
        TaggedPdfReaderTool.java
        TextExtractionStrategy.java
        TextMarginFinder.java
        TextRenderInfo.java
        Vector.java
        XObjectDoHandler.java
        qrcode
        BitArray.java
        BitMatrix.java
        BitVector.java
        BlockPair.java
        ByteArray.java
        ByteMatrix.java
        CharacterSetECI.java
        EncodeHintType.java
        Encoder.java
        ErrorCorrectionLevel.java
        FormatInformation.java
        GF256.java
        GF256Poly.java
        MaskUtil.java
        MatrixUtil.java
        Mode.java
        QRCode.java
        QRCodeWriter.java
        ReedSolomonEncoder.java
        ReedSolomonException.java
        Version.java
        WriterException.java
        richmedia
        CuePoint.java
        RichMediaActivation.java
        RichMediaAnimation.java
        RichMediaAnnotation.java
        RichMediaCommand.java
        RichMediaConfiguration.java
        RichMediaDeactivation.java
        RichMediaExecuteAction.java
        RichMediaInstance.java
        RichMediaParams.java
        RichMediaPosition.java
        RichMediaPresentation.java
        RichMediaWindow.java
        xml
        XmlDomWriter.java
        simpleparser
        EntitiesToSymbol.java
        EntitiesToUnicode.java
        IanaEncodings.java
        SimpleXMLDocHandler.java
        SimpleXMLDocHandlerComment.java
        SimpleXMLParser.java
        xmp
        DublinCoreSchema.java
        LangAlt.java
        PdfA1Schema.java
        PdfSchema.java
        XmpArray.java
        XmpBasicSchema.java
        XmpMMSchema.java
        XmpReader.java
        XmpSchema.java
        XmpWriter.java
  - rups
    - com
      - itextpdf
        rups
        Rups.java
        controller
        PdfReaderController.java
        RupsController.java
        io
        FileChooserAction.java
        FileCloseAction.java
        OutputStreamResource.java
        TextAreaOutputStream.java
        filters
        PdfFilter.java
        model
        BackgroundTask.java
        IndirectObjectFactory.java
        ObjectLoader.java
        PdfFile.java
        Permissions.java
        ProgressDialog.java
        TreeNodeFactory.java
        XfaFile.java
        view
        Console.java
        MessageAction.java
        PageSelectionListener.java
        RupsMenuBar.java
        icons
        IconActionListener.java
        IconButton.java
        IconFetcher.java
        IconTreeCellRenderer.java
        IconTreeNode.java
        itext
        FormTree.java
        OutlineTree.java
        PagesTable.java
        PdfObjectPanel.java
        PdfTree.java
        StreamTextArea.java
        XRefTable.java
        XfaTextArea.java
        XfaTree.java
        treenodes
        FormTreeNode.java
        OutlineTreeNode.java
        PdfObjectTreeNode.java
        PdfPageTreeNode.java
        PdfPagesTreeNode.java
        PdfTrailerTreeNode.java
        XdpTreeNode.java
        XfaTreeNode.java
        models
        DictionaryTableModel.java
        JTableAutoModel.java
        JTableAutoModelInterface.java
        PdfArrayTableModel.java
- test
  - core
    - com
      - itextpdf
        testutils
        ClearOutTempFilesDuringTest.java
        TestResourceUtils.java
        text
        pdf
        AcroFieldsTest.java
        BookmarksTest.java
        DocumentFontTest.java
        MetaDataTest.java
        PdfReaderTest.java
        PdfRectangleTest.java
        TestPdfCopyAndStamp.java
        UnbalancedOperatorsTest.java
        fonts
        cmaps
        CMapTest.java
        parser
        FilteredTextRenderListenerTest.java
        LocationTextExtractionStrategyTest.java
        MatrixTest.java
        PdfContentStreamProcessorTest.java
        PdfTextExtractorEncodingsTest.java
        SimpleTextExtractionStrategyTest.java
        TextMarginFinderTest.java
        VectorTest.java

/*
 * $Id: ContentOperator.java 4242 2010-01-02 23:22:20Z xlv $
 *
 * This file is part of the iText project.
 * Copyright (c) 1998-2009 1T3XT BVBA
 * Authors: Bruno Lowagie, et al.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License version 3
 * as published by the Free Software Foundation with the addition of the
 * following permission added to Section 15 as permitted in Section 7(a):
 * FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY 1T3XT,
 * 1T3XT DISCLAIMS THE WARRANTY OF NON INFRINGEMENT OF THIRD PARTY RIGHTS.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Affero General Public License for more details.
 * You should have received a copy of the GNU Affero General Public License
 * along with this program; if not, see http://www.gnu.org/licenses or write to
 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA, 02110-1301 USA, or download the license from the following URL:
 * http://itextpdf.com/terms-of-use/
 *
 * The interactive user interfaces in modified source and object code versions
 * of this program must display Appropriate Legal Notices, as required under
 * Section 5 of the GNU Affero General Public License.
 *
 * In accordance with Section 7(b) of the GNU Affero General Public License,
 * you must retain the producer line in every PDF that is created or manipulated
 * using iText.
 *
 * You can be released from the requirements of the license by purchasing
 * a commercial license. Buying such a license is mandatory as soon as you
 * develop commercial activities involving the iText software without
 * disclosing the source code of your own applications.
 * These activities include: offering paid services to customers as an ASP,
 * serving PDFs on the fly in a web application, shipping iText with a closed
 * source product.
 *
 * For more information, please contact iText Software Corp. at this
 * address: sales@itextpdf.com
 */
package com.itextpdf.text.pdf.parser;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintWriter;

import com.itextpdf.text.pdf.PRStream;
import com.itextpdf.text.pdf.PdfArray;
import com.itextpdf.text.pdf.PdfDictionary;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfNumber;
import com.itextpdf.text.pdf.PdfObject;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.xml.simpleparser.SimpleXMLParser;

/**
 * Converts a tagged PDF document into an XML file.
 * <p>
 * The tool walks the structure tree found under the catalog's
 * <code>StructTreeRoot</code> entry. Every structure element that has an
 * <code>S</code> (structure type) entry is written as an XML element named
 * after that type; the text of the marked content it refers to is extracted
 * from the page content and written, XML-escaped, inside the element.
 * <p>
 * An instance keeps the current reader and writer in fields while a
 * conversion is running, so a single instance must not be used for two
 * conversions at the same time.
 * 
 * @since 5.0.2
 */
public class TaggedPdfReaderTool {

	/** The reader object from which the content streams are read. */
	PdfReader reader;
	/** The writer object to which the XML will be written */
	PrintWriter out;

	/**
	 * Converts the structure tree of a tagged PDF into XML.
	 * <p>
	 * The XML is written with a {@link PrintWriter} created directly on
	 * <code>os</code>, so the platform default character encoding is used.
	 * The stream is closed when this method returns, also when the
	 * conversion fails halfway.
	 * 
	 * @param reader
	 *            the PdfReader that has access to the PDF file
	 * @param os
	 *            the OutputStream to which the resulting xml will be written
	 * @throws IOException
	 *             if the document has no StructTreeRoot (it is not a tagged
	 *             PDF) or if the page content cannot be read
	 */
	public void convertToXml(PdfReader reader, OutputStream os)
			throws IOException {
		this.reader = reader;
		// get the StructTreeRoot from the root object
		PdfDictionary catalog = reader.getCatalog();
		PdfDictionary struct = catalog.getAsDict(PdfName.STRUCTTREEROOT);
		// fail with a meaningful message instead of a NullPointerException
		// when the document is not tagged
		if (struct == null)
			throw new IOException(
					"No StructTreeRoot found; the PDF document is not tagged.");
		out = new PrintWriter(os);
		try {
			// Inspect the child or children of the StructTreeRoot
			inspectChild(struct.getDirectObject(PdfName.K));
			out.flush();
		} finally {
			// release the writer even if the traversal throws
			out.close();
		}
	}

	/**
	 * Inspects a child of a structured element. This can be an array or a
	 * dictionary; any other kind of object (including <code>null</code>) is
	 * ignored.
	 * 
	 * @param k
	 *            the child to inspect
	 * @throws IOException
	 *             if the page content cannot be read
	 */
	public void inspectChild(PdfObject k) throws IOException {
		if (k == null)
			return;
		if (k instanceof PdfArray)
			inspectChildArray((PdfArray) k);
		else if (k instanceof PdfDictionary)
			inspectChildDictionary((PdfDictionary) k);
	}

	/**
	 * If the child of a structured element is an array, we need to loop over
	 * the elements. Indirect references in the array are resolved before the
	 * element is inspected.
	 * 
	 * @param k
	 *            the child array to inspect
	 * @throws IOException
	 *             if the page content cannot be read
	 */
	public void inspectChildArray(PdfArray k) throws IOException {
		if (k == null)
			return;
		for (int i = 0; i < k.size(); i++) {
			inspectChild(k.getDirectObject(i));
		}
	}

	/**
	 * If the child of a structured element is a dictionary, we inspect the
	 * child; we may also draw a tag.
	 * <p>
	 * When the dictionary has an <code>S</code> entry, an opening and a
	 * closing XML tag named after it (without the leading slash) are
	 * written around the content. When it also has a <code>Pg</code> entry,
	 * the marked content referenced by <code>K</code> is extracted from that
	 * page first. In all cases the kids found under <code>K</code> are
	 * inspected recursively.
	 * 
	 * @param k
	 *            the child dictionary to inspect
	 * @throws IOException
	 *             if the page content cannot be read
	 */
	public void inspectChildDictionary(PdfDictionary k) throws IOException {
		if (k == null)
			return;
		// Resolve K once. It may be stored as an indirect reference; the raw
		// entry would then be neither an array nor a dictionary and the
		// whole subtree would be skipped silently.
		PdfObject kids = k.getDirectObject(PdfName.K);
		PdfName s = k.getAsName(PdfName.S);
		if (s == null) {
			// no structure type: nothing to draw, only descend
			inspectChild(kids);
			return;
		}
		// PdfName.toString() starts with a slash; drop it for the tag name
		String tag = s.toString().substring(1);
		out.print("<");
		out.print(tag);
		out.print(">");
		PdfDictionary page = k.getAsDict(PdfName.PG);
		if (page != null)
			parseTag(tag, kids, page);
		inspectChild(kids);
		out.print("</");
		out.print(tag);
		out.println(">");
	}

	/**
	 * Searches for a tag in a page.
	 * <p>
	 * The identifier decides what happens: a number is treated as a marked
	 * content id whose text is extracted from the page and written to the
	 * output; an array is processed element by element (with a line break
	 * between elements); a dictionary is treated as a reference that holds
	 * an <code>MCID</code> entry and, optionally, its own <code>Pg</code>
	 * entry. Any other identifier, including <code>null</code>, is ignored.
	 * 
	 * @param tag
	 *            the name of the tag
	 * @param object
	 *            an identifier to find the marked content
	 * @param page
	 *            a page dictionary
	 * @throws IOException
	 *             if the page content cannot be read
	 */
	public void parseTag(String tag, PdfObject object, PdfDictionary page)
			throws IOException {
		// if the identifier is a number, we can extract the content right away
		if (object instanceof PdfNumber) {
			// without a page there is no content stream to search
			if (page == null)
				return;
			// the content is only needed in this branch, so it is read here
			// and not for every array or dictionary that passes through
			byte[] content = getContentBytes(page);
			if (content.length == 0)
				return;
			PdfNumber mcid = (PdfNumber) object;
			RenderFilter filter = new MarkedContentRenderFilter(mcid.intValue());
			TextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
			FilteredTextRenderListener listener = new FilteredTextRenderListener(
					strategy, filter);
			PdfContentStreamProcessor processor = new PdfContentStreamProcessor(
					listener);
			processor.processContent(content, page
					.getAsDict(PdfName.RESOURCES));
			out.print(SimpleXMLParser.escapeXML(listener.getResultantText(), true));
		}
		// if the identifier is an array, we call the parseTag method
		// recursively
		else if (object instanceof PdfArray) {
			PdfArray arr = (PdfArray) object;
			int n = arr.size();
			for (int i = 0; i < n; i++) {
				// resolve indirect references, as inspectChildArray does
				parseTag(tag, arr.getDirectObject(i), page);
				if (i < n - 1)
					out.println();
			}
		}
		// if the identifier is a dictionary, we get the resources from the
		// dictionary
		else if (object instanceof PdfDictionary) {
			PdfDictionary mcr = (PdfDictionary) object;
			// the Pg entry is optional here; when it is missing the page of
			// the enclosing element applies
			PdfDictionary mcrPage = mcr.getAsDict(PdfName.PG);
			parseTag(tag, mcr.getDirectObject(PdfName.MCID),
					mcrPage != null ? mcrPage : page);
		}
	}

	/**
	 * Reads the decoded content of a page.
	 * <p>
	 * The <code>Contents</code> entry of a page is either one stream or an
	 * array of streams that have to be treated as one; both forms are
	 * supported. The parts of an array are separated by a newline so that
	 * the last token of one part cannot merge with the first token of the
	 * next.
	 * 
	 * @param page
	 *            a page dictionary
	 * @return the content bytes; an empty array if the page has no content
	 * @throws IOException
	 *             if a stream cannot be read
	 */
	private static byte[] getContentBytes(PdfDictionary page)
			throws IOException {
		PdfObject contents = page.getDirectObject(PdfName.CONTENTS);
		if (contents instanceof PRStream)
			return PdfReader.getStreamBytes((PRStream) contents);
		ByteArrayOutputStream buffer = new ByteArrayOutputStream();
		if (contents instanceof PdfArray) {
			PdfArray parts = (PdfArray) contents;
			for (int i = 0; i < parts.size(); i++) {
				PdfObject part = parts.getDirectObject(i);
				if (part instanceof PRStream) {
					buffer.write(PdfReader.getStreamBytes((PRStream) part));
					buffer.write('\n');
				}
			}
		}
		return buffer.toByteArray();
	}

}