/* * * This file is part of the iText (R) project. Copyright (c) 1998-2017 iText Group NV * Authors: Kevin Day, Bruno Lowagie, Paulo Soares, et al. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License version 3 * as published by the Free Software Foundation with the addition of the * following permission added to Section 15 as permitted in Section 7(a): * FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY * ITEXT GROUP. ITEXT GROUP DISCLAIMS THE WARRANTY OF NON INFRINGEMENT * OF THIRD PARTY RIGHTS * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. * See the GNU Affero General Public License for more details. * You should have received a copy of the GNU Affero General Public License * along with this program; if not, see http://www.gnu.org/licenses or write to * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, * Boston, MA, 02110-1301 USA, or download the license from the following URL: * http://itextpdf.com/terms-of-use/ * * The interactive user interfaces in modified source and object code versions * of this program must display Appropriate Legal Notices, as required under * Section 5 of the GNU Affero General Public License. * * In accordance with Section 7(b) of the GNU Affero General Public License, * a covered work must retain the producer line in every PDF that is created * or manipulated using iText. * * You can be released from the requirements of the license by purchasing * a commercial license. Buying such a license is mandatory as soon as you * develop commercial activities involving the iText software without * disclosing the source code of your own applications. * These activities include: offering paid services to customers as an ASP, * serving PDFs on the fly in a web application, shipping iText with a closed * source product. * * For more information, please contact iText Software Corp. at this * address: sales@itextpdf.com */ package com.itextpdf.text.pdf.parser; import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.PrintWriter; import java.util.ArrayList; import java.util.List; import com.itextpdf.text.pdf.PdfDictionary; import com.itextpdf.text.pdf.PdfName; import com.itextpdf.text.pdf.PdfObject; import com.itextpdf.text.pdf.PdfReader; import com.itextpdf.text.pdf.PdfStream; import com.itextpdf.text.pdf.RandomAccessFileOrArray; /** * Tool that parses the content of a PDF document. * @since 2.1.4 */ public class PdfContentReaderTool { /** * Shows the detail of a dictionary. * This is similar to the PdfLister functionality. * @param dic the dictionary of which you want the detail * @return a String representation of the dictionary */ static public String getDictionaryDetail(PdfDictionary dic){ return getDictionaryDetail(dic, 0); } /** * Shows the detail of a dictionary. * @param dic the dictionary of which you want the detail * @param depth the depth of the current dictionary (for nested dictionaries) * @return a String representation of the dictionary */ static public String getDictionaryDetail(PdfDictionary dic, int depth){ StringBuffer builder = new StringBuffer(); builder.append('('); List<PdfName> subDictionaries = new ArrayList<PdfName>(); for (PdfName key: dic.getKeys()) { PdfObject val = dic.getDirectObject(key); if (val.isDictionary()) subDictionaries.add(key); builder.append(key); builder.append('='); builder.append(val); builder.append(", "); } if (builder.length() >= 2) builder.setLength(builder.length()-2); builder.append(')'); for (PdfName pdfSubDictionaryName: subDictionaries) { builder.append('\n'); for(int i = 0; i < depth+1; i++){ builder.append('\t'); } builder.append("Subdictionary "); builder.append(pdfSubDictionaryName); builder.append(" = "); builder.append(getDictionaryDetail(dic.getAsDict(pdfSubDictionaryName), depth+1)); } return builder.toString(); } /** * Displays a summary of the entries in the XObject dictionary for the stream * @param resourceDic the resource dictionary for the stream * @return a string with the summary of the entries * @throws IOException * @since 5.0.2 */ static public String getXObjectDetail(PdfDictionary resourceDic) throws IOException { StringBuilder sb = new StringBuilder(); PdfDictionary xobjects = resourceDic.getAsDict(PdfName.XOBJECT); if (xobjects == null) return "No XObjects"; for (PdfName entryName : xobjects.getKeys()) { PdfStream xobjectStream = xobjects.getAsStream(entryName); sb.append("------ " + entryName + " - subtype = " + xobjectStream.get(PdfName.SUBTYPE) + " = " + xobjectStream.getAsNumber(PdfName.LENGTH) + " bytes ------\n"); if (!xobjectStream.get(PdfName.SUBTYPE).equals(PdfName.IMAGE)){ byte[] contentBytes = ContentByteUtils.getContentBytesFromContentObject(xobjectStream); InputStream is = new ByteArrayInputStream(contentBytes); int ch; while ((ch = is.read()) != -1){ sb.append((char)ch); } sb.append("------ " + entryName + " - subtype = " + xobjectStream.get(PdfName.SUBTYPE) + "End of Content" + "------\n"); } } return sb.toString(); } /** * Writes information about a specific page from PdfReader to the specified output stream. * @since 2.1.5 * @param reader the PdfReader to read the page content from * @param pageNum the page number to read * @param out the output stream to send the content to * @throws IOException */ static public void listContentStreamForPage(PdfReader reader, int pageNum, PrintWriter out) throws IOException { out.println("==============Page " + pageNum + "===================="); out.println("- - - - - Dictionary - - - - - -"); PdfDictionary pageDictionary = reader.getPageN(pageNum); out.println(getDictionaryDetail(pageDictionary)); out.println("- - - - - XObject Summary - - - - - -"); out.println(getXObjectDetail(pageDictionary.getAsDict(PdfName.RESOURCES))); out.println("- - - - - Content Stream - - - - - -"); RandomAccessFileOrArray f = reader.getSafeFile(); byte[] contentBytes = reader.getPageContent(pageNum, f); f.close(); out.flush(); InputStream is = new ByteArrayInputStream(contentBytes); int ch; while ((ch = is.read()) != -1){ out.print((char)ch); } out.flush(); out.println("- - - - - Text Extraction - - - - - -"); String extractedText = PdfTextExtractor.getTextFromPage(reader, pageNum, new LocationTextExtractionStrategy()); if (extractedText.length() != 0) out.println(extractedText); else out.println("No text found on page " + pageNum); out.println(); } /** * Writes information about each page in a PDF file to the specified output stream. * @since 2.1.5 * @param pdfFile a File instance referring to a PDF file * @param out the output stream to send the content to * @throws IOException */ static public void listContentStream(File pdfFile, PrintWriter out) throws IOException { PdfReader reader = new PdfReader(pdfFile.getCanonicalPath()); int maxPageNum = reader.getNumberOfPages(); for (int pageNum = 1; pageNum <= maxPageNum; pageNum++){ listContentStreamForPage(reader, pageNum, out); } } /** * Writes information about the specified page in a PDF file to the specified output stream. * @since 2.1.5 * @param pdfFile a File instance referring to a PDF file * @param pageNum the page number to read * @param out the output stream to send the content to * @throws IOException */ static public void listContentStream(File pdfFile, int pageNum, PrintWriter out) throws IOException { PdfReader reader = new PdfReader(pdfFile.getCanonicalPath()); listContentStreamForPage(reader, pageNum, out); } /** * Writes information about each page in a PDF file to the specified file, or System.out. * @param args */ public static void main(String[] args) { try{ if (args.length < 1 || args.length > 3){ System.out.println("Usage: PdfContentReaderTool <pdf file> [<output file>|stdout] [<page num>]"); return; } PrintWriter writer = new PrintWriter(System.out); if (args.length >= 2){ if (args[1].compareToIgnoreCase("stdout") != 0){ System.out.println("Writing PDF content to " + args[1]); writer = new PrintWriter(new FileOutputStream(new File(args[1]))); } } int pageNum = -1; if (args.length >= 3){ pageNum = Integer.parseInt(args[2]); } if (pageNum == -1){ listContentStream(new File(args[0]), writer); } else { listContentStream(new File(args[0]), pageNum, writer); } writer.flush(); if (args.length >= 2){ writer.close(); System.out.println("Finished writing content to " + args[1]); } } catch (Exception e){ e.printStackTrace(System.err); } } }