PdfAnalysis.java example

Explorer

preservation-tools-master
- pdf-tools
  - src
    - main
      - java
        filetools
        ChecksumChecker.java
        GenericFileAnalysis.java
        MetadataExtraction.java
        audio
        AudioCD.java
        AudioFilesConversion.java
        executables
        CdRom_IsoImageChecker.java
        gif
        GifReparator.java
        pdf
        PdfAValidator.java
        PdfAnalysis.java
        PdfChecker.java
        PdfCreationSoftwareDetective.java
        PdfTextExtraction.java
        PdfToImageConverter.java
        PdfTwinTest.java
        XmpMetadataExtractor.java
        iTextRepairPdf.java
        tiff
        TiffFileAnalysis.java
        TiffTagZbw.java
        TiffTestClass.java
        output
        XmlOutput.java
        XslStyleSheets.java
        testinglearning
        HelloComponent.java
        Mouselistener.java
        utilities
        BrowserDialogs.java
        CsvOutput.java
        FolderMethods.java
        HexReader.java
        ListsFiles.java
        SearchCertainFormat.java
        TestClass.java
        TextSucheInOrdner.java
        fileStringUtilities.java
    - test
      - java
        pdfHackerTools
        PdfUtilitiesTest.java

package filetools.pdf;

// TODO: by Java convention package names are written entirely in lowercase letters; follow this for future packages.

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Collection;
import java.util.List;

import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;

import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;

/**
 * Static helpers for inspecting PDF files with PDFBox and iText: dumping
 * document/page objects to a report, sanity-checking a file before further
 * processing, classifying the PDF version and extracting text lines.
 */
public class PdfAnalysis {

	/*******************************************************
	 * Variables and objects used within the whole package
	 ********************************************************/
	public static BufferedReader PdfHeaderTest;

	// static Logger logger = LoggerFactory.getLogger(PdfAnalysis.class);

	/** Report file written by {@link #analysePdfObjects(File)}. */
	private static final String DEFAULT_REPORT_PATH = "D://pdfboxanalysis.txt";

	/** Zero-based index of the page {@link #analysePdfObjects(File)} describes in detail. */
	private static final int DEFAULT_PAGE_INDEX = 4;

	/** Start of a PDF 1.x header line; the minor version digit follows directly. */
	private static final String PDF_HEADER_PREFIX = "%PDF-1.";

	/** Highest minor version recognised by {@link #getPdfVersion(String)}. */
	private static final int MAX_KNOWN_MINOR_VERSION = 7;

	/** Result of {@link #getPdfVersion(String)} when the header names no known 1.x version. */
	private static final int FALLBACK_PDF_VERSION = 7;

	/*********************************************************
	 * Methods used within the whole package
	 *
	 ********************************************************/

	/****************************************************************************
	 * Analyses the PDF objects of a file and writes the findings to the default
	 * report file, describing the page at the default page index in detail.
	 * 
	 * @param file
	 *            the PDF to analyse
	 * @throws IOException
	 *             if the PDF cannot be loaded or the report cannot be written
	 */
	public static void analysePdfObjects(File file) throws IOException {
		analysePdfObjects(file, new File(DEFAULT_REPORT_PATH), DEFAULT_PAGE_INDEX);
	}

	/**
	 * Analyses the PDF objects of a file and writes the findings to the given
	 * report file. The values of the document information dictionary, the page
	 * count and the details of one page go to the report; MediaBox entry,
	 * trailer and encryption state are printed to standard output.
	 * 
	 * @param file
	 *            the PDF to analyse
	 * @param reportFile
	 *            the text file the report is written to (overwritten)
	 * @param pageIndex
	 *            zero-based index of the page to describe; when the document
	 *            has no such page a note is written instead of page details
	 * @throws IOException
	 *             if the PDF cannot be loaded or the report cannot be written
	 */
	public static void analysePdfObjects(File file, File reportFile, int pageIndex) throws IOException {
		PrintWriter report = new PrintWriter(new FileWriter(reportFile));
		try {
			report.println(file.toString());

			PDDocument pdf = PDDocument.load(file);
			try {
				PDDocumentInformation info = pdf.getDocumentInformation();
				COSDictionary dict = info.getDictionary();

				COSArray mediaBox = (COSArray) dict.getDictionaryObject("MediaBox");
				System.out.println("MediaBox: " + mediaBox);

				COSDictionary trailer = pdf.getDocument().getTrailer();
				System.out.println("Trailer:" + trailer);

				if (pdf.isEncrypted()) { // this actually works easily
					System.out.println("Encrypted");
				}

				for (COSBase value : dict.getValues()) {
					report.println(String.valueOf(value));
				}

				PDDocumentCatalog cat = pdf.getDocumentCatalog();

				@SuppressWarnings("unchecked")
				List<PDPage> pages = cat.getAllPages();
				report.println("# Pages: " + pages.size());

				// Short documents used to crash here with an
				// IndexOutOfBoundsException; report the gap instead.
				if (pageIndex < 0 || pageIndex >= pages.size()) {
					report.println("Page index " + pageIndex + " does not exist, no page details written");
				} else {
					writePageDetails(report, pages.get(pageIndex));
				}
			} finally {
				// Release the document even when the analysis fails half way.
				pdf.close();
			}
		} finally {
			report.close();
		}
	}

	/** Writes the boxes, resources, rotation, contents and annotation count of one page. */
	private static void writePageDetails(PrintWriter report, PDPage page) throws IOException {
		report.println("Page: " + page);
		report.println("\tCropBox: " + page.getCropBox());
		report.println("\tMediaBox: " + page.getMediaBox());
		report.println("\tResources: " + page.getResources());
		report.println("\tRotation: " + page.getRotation());
		report.println("\tArtBox: " + page.getArtBox());
		report.println("\tBleedBox: " + page.getBleedBox());
		report.println("\tContents: " + page.getContents());
		report.println("\tTrimBox: " + page.getTrimBox());
		List<PDAnnotation> annotations = page.getAnnotations();
		report.println("\t# Annotations: " + annotations.size());
	}

	/** Closes an iText reader, ignoring failures so that cleanup never changes a method's result. */
	private static void closeQuietly(PdfReader reader) {
		if (reader == null) {
			return;
		}
		try {
			reader.close();
		} catch (RuntimeException ignored) {
			// Nothing useful can be done if closing fails; the result is already determined.
		}
	}

	/*********************************************************
	 * Checks if a PDF is ok to work with: %PDF header present, not encrypted
	 * and readable according to {@link #checkBrokenPdf(String)}. The reason
	 * for a rejection is printed to standard output.
	 * 
	 * @param file
	 *            the file to check
	 * @return true if all three checks pass, false otherwise
	 * @throws IOException
	 *             if the file cannot be loaded by PDFBox
	 */
	public static boolean testPdfOk(File file) throws IOException {
		if (!filetools.GenericFileAnalysis.testFileHeaderPdf(file)) {
			System.out.println("No PDF Header");
			return false;
		}

		boolean encrypted;
		PDDocument testfile = PDDocument.load(file);
		try {
			encrypted = testfile.isEncrypted();
		} finally {
			// The document is only needed for the encryption flag.
			testfile.close();
		}

		if (encrypted) {
			System.out.println("Is encrypted");
			return false;
		}
		if (checkBrokenPdf(file.toString())) {
			System.out.println("Broken Pdf");
			return false;
		}
		return true;
	}

	/**
	 * Determines which PDF version it is. Can also detect PDF/A.
	 * 
	 * @param file
	 *            the file to classify (should be PDF)
	 * @return "PDF 1.0 - 1.3", "PDF/A", "PDF 1.4 or higher", "No XMP Metadata"
	 *         (version 1.4 or higher without XMP packet) or "PDF cannot be
	 *         read by PdfReader" TODO: occasionally throws WARN about log4j
	 *         that I cannot understand or get rid of.
	 * @throws IOException
	 *             if the file cannot be read
	 */
	public static String checkIfPdfA(File file) throws IOException {
		PdfReader reader = null;
		try {
			reader = new PdfReader(file.toString());

			// iText reports the version as the character after "1." ('4' for
			// PDF 1.4), so it has to be compared with a character, not a number.
			if (reader.getPdfVersion() <= '3') {
				return "PDF 1.0 - 1.3";
			}

			byte[] xmp = reader.getMetadata();
			if (xmp == null) {
				return "No XMP Metadata";
			}

			String xmpMetadata = new String(xmp, "UTF-8");
			if (xmpMetadata.contains("pdfaid:conformance")) {
				return "PDF/A";
			}
			return "PDF 1.4 or higher";
		} catch (java.lang.NullPointerException e) {
			// Kept from the original: some files make the reader fail this way.
			System.out.println(e);
			// logger.error("Error analyzing " + e);
			return "PDF cannot be read by PdfReader";
		} finally {
			closeQuietly(reader);
		}
	}

	/**
	 * Checks if a Pdf is too broken to be examined: the file counts as broken
	 * when iText cannot open it or cannot read its metadata.
	 * 
	 * @param file
	 *            path of the file (should be PDF)
	 * @return true if the file could not be read, false otherwise
	 * @throws IOException
	 *             declared for compatibility; read errors are reported through
	 *             the return value
	 */
	// TODO: This function does not work, e. g. for encrypted files and should
	// not be used until it is fixed.
	public static boolean checkBrokenPdf(String file) throws IOException {
		PdfReader reader = null;
		try {
			reader = new PdfReader(file);
			reader.getMetadata();
			// TODO: One day this function could test more and be more clever.
			return false;
		} catch (Exception e) {
			System.out.println("Broken: " + file);
			// logger.error("Error analyzing " + e);
			return true;
		} finally {
			closeQuietly(reader);
		}
	}

	/**
	 * Simple Encryption Test without reader, because encryption causes lots of
	 * exceptions.
	 * 
	 * @param file
	 *            the loaded document (should be PDF)
	 * @return true if the document is encrypted
	 * @throws IOException
	 *             declared for compatibility with existing callers
	 */
	public static boolean testsEncryption(PDDocument file) throws IOException {
		boolean encrypted = file.isEncrypted();
		if (encrypted) {
			System.out.println(file + " is encrypted");
		}
		return encrypted;
	}

	/**
	 * Extracts the text of all pages with iText and splits it into lines.
	 * 
	 * @param PdfFile
	 *            path of the PDF
	 * @return the extracted lines, or null if the text could not be extracted
	 *         (callers have to check for null)
	 * @throws IOException
	 *             declared for compatibility; failures are reported through a
	 *             null result
	 */
	public static String[] extractsPdfLines(String PdfFile) throws IOException {
		PdfReader reader = null;
		try {
			reader = new PdfReader(PdfFile);
			PdfReaderContentParser parser = new PdfReaderContentParser(reader);
			StringBuilder text = new StringBuilder();

			int pageCount = reader.getNumberOfPages();
			for (int page = 1; page <= pageCount; page++) {
				TextExtractionStrategy strategy = parser.processContent(page, new SimpleTextExtractionStrategy());
				text.append(strategy.getResultantText()).append("\n");
			}

			return text.toString().split("\n");
		} catch (Exception e) {
			// Best effort: the null result is part of the existing contract,
			// but the reason should not vanish silently.
			System.out.println("Text extraction failed: " + PdfFile + " (" + e + ")");
			return null;
		} finally {
			closeQuietly(reader);
		}
	}

	/**
	 * Reads the minor PDF version from the first line of the file.
	 * 
	 * @param pdffile
	 *            path of the file
	 * @return the minor version (0 to 7) named by a "%PDF-1.x" header; 7 if
	 *         the file is empty or the first line names no such version
	 * @throws IOException
	 *             if the file cannot be read
	 */
	public static int getPdfVersion(String pdffile) throws IOException {
		String fileHeader;
		BufferedReader fileReader = new BufferedReader(new FileReader(pdffile));
		try {
			fileHeader = fileReader.readLine();
		} finally {
			fileReader.close();
		}

		// readLine() yields null for an empty file; treat it like any other
		// unrecognised header instead of failing with a NullPointerException.
		if (fileHeader == null) {
			return FALLBACK_PDF_VERSION;
		}

		for (int minor = 0; minor <= MAX_KNOWN_MINOR_VERSION; minor++) {
			if (fileHeader.contains(PDF_HEADER_PREFIX + minor)) {
				return minor;
			}
		}
		return FALLBACK_PDF_VERSION;
	}

	/*
	 * Disabled: checks the size of the Pdf-file, because some big Pdf Files
	 * might cause exceptions.
	 * 
	 * I think this method is so complicated because of the test that was build.
	 * Maybe change method in GenericFileAnalysis eventually to embedd those
	 * kinds of tests, too.
	 * 
	 * public static boolean checkPdfSize(File file) { boolean toobig =
	 * isFileTooLong(file, DEFAULT_MAX_FILE_LENGTH); if (toobig) { System.out
	 * .println("File is bigger than 16 MB and therefore cannot be measured"); }
	 * return toobig; }
	 * 
	 * public static boolean checkPdfSize(String filePath) { File toCheck = new
	 * File (filePath); return checkPdfSize(toCheck); }
	 * 
	 * public static boolean isFileTooLong(File toCheck, long maxLength) {
	 * return (toCheck.length() > maxLength); }
	 */
}