package filetools.pdf; // TODO: next time, the package name should start with a small character, this is the convention import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; import java.util.Collection; import java.util.List; import org.apache.pdfbox.cos.COSArray; import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSDictionary; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocumentCatalog; import org.apache.pdfbox.pdmodel.PDDocumentInformation; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; import com.itextpdf.text.pdf.PdfReader; import com.itextpdf.text.pdf.parser.PdfReaderContentParser; import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy; import com.itextpdf.text.pdf.parser.TextExtractionStrategy; public class PdfAnalysis { /******************************************************* * Variables and objects used within the whole package ********************************************************/ public static BufferedReader PdfHeaderTest; // static Logger logger = LoggerFactory.getLogger(PdfAnalysis.class); /********************************************************* * Methods used within the whole package * ********************************************************/ /**************************************************************************** * Analysis PDF-Objects * * @param file * @return: nothings, puts out the information in a file * @throws IOException */ public static void analysePdfObjects(File file) throws IOException { PrintWriter pdfboxanalysis = new PrintWriter(new FileWriter("D://pdfboxanalysis.txt")); pdfboxanalysis.println(file.toString()); PDDocument pdf = PDDocument.load(file); PDDocumentInformation info = pdf.getDocumentInformation(); COSDictionary dict = info.getDictionary(); Collection<COSBase> l = dict.getValues(); COSArray mediaBox = (COSArray) dict.getDictionaryObject("MediaBox"); System.out.println("MediaBox: " + mediaBox); COSDictionary trailer = pdf.getDocument().getTrailer(); System.out.println("Trailer:" + trailer); if (pdf.isEncrypted()) { // this actually works easily System.out.println("Encrypted"); } for (Object o : l) { // System.out.println(o.toString()); pdfboxanalysis.println(o.toString()); } PDDocumentCatalog cat = pdf.getDocumentCatalog(); @SuppressWarnings("unchecked") List<PDPage> lp = cat.getAllPages(); pdfboxanalysis.println("# Pages: " + lp.size()); PDPage page = lp.get(4); pdfboxanalysis.println("Page: " + page); pdfboxanalysis.println("\tCropBox: " + page.getCropBox()); pdfboxanalysis.println("\tMediaBox: " + page.getMediaBox()); pdfboxanalysis.println("\tResources: " + page.getResources()); pdfboxanalysis.println("\tRotation: " + page.getRotation()); pdfboxanalysis.println("\tArtBox: " + page.getArtBox()); pdfboxanalysis.println("\tBleedBox: " + page.getBleedBox()); pdfboxanalysis.println("\tContents: " + page.getContents()); pdfboxanalysis.println("\tTrimBox: " + page.getTrimBox()); List<PDAnnotation> la = page.getAnnotations(); pdfboxanalysis.println("\t# Annotations: " + la.size()); pdfboxanalysis.close(); } /********************************************************* * Checks if a PDF is ok to work with %PDF Header, Broken PDF & Encryption * * @param file * @return: boolean true or false * @throws IOException */ public static boolean testPdfOk(File file) throws IOException { if (filetools.GenericFileAnalysis.testFileHeaderPdf(file) == true) { PDDocument testfile = PDDocument.load(file); if (!testfile.isEncrypted()) { if (!checkBrokenPdf(file.toString())) { return true; } else { System.out.println("Broken Pdf"); return false; } } else { System.out.println("Is encrypted"); return false; } } else { System.out.println("No PDF Header"); return false; } } /** * Determines which PDF version it is. Can also detect PDF/A. * * @param File * (should be PDF) * @return: String PDF Version TODO: occasionally throws WARN about log4j * that I cannot understand or get rid of. * @throws IOException */ public static String checkIfPdfA(File file) throws IOException { String pdfType = "No XMP Metadata"; String XmpMetadata; PdfReader reader; try { reader = new PdfReader(file.toString()); if (reader.getPdfVersion() > 3) { if (reader.getMetadata() != null) { XmpMetadata = new String(reader.getMetadata()); // nullpointerException reader.close(); if (XmpMetadata.contains("pdfaid:conformance")) { pdfType = "PDF/A"; } else { pdfType = "PDF 1.4 or higher"; } } } else { pdfType = "PDF 1.0 - 1.3"; } return pdfType; } catch (java.lang.NullPointerException e) { System.out.println(e); pdfType = "PDF cannot be read by PdfReader"; // logger.error("Error analyzing " + e); return pdfType; } } /** * Checks if a Pdf is too broken to be examined. * * @param File * (should be PDF) * @return: boolean * @throws IOException */ // TODO: This function does not work, e. g. for encrypted files and should // not be used until it is fixed. public static boolean checkBrokenPdf(String file) throws IOException { boolean brokenPdf; try { PdfReader reader = new PdfReader(file); reader.getMetadata(); // TODO: One day this function could test more and be more clever. brokenPdf = false; return brokenPdf; } catch (Exception e) { System.out.println("Broken: " + file); brokenPdf = true; // logger.error("Error analyzing " + e); return brokenPdf; } } /** * Simple Encryption Test without reader, because encryption causes lots of * exceptions. * * @param PDDocument * (should be PDF) * @return: boolean * @throws IOException */ public static boolean testsEncryption(PDDocument file) throws IOException { // PDDocumentInformation info = // PDDocument.load(file).getDocumentInformation(); if (file.isEncrypted() == true) { System.out.println(file + " is encrypted"); return true; } else { return false; } } public static String[] extractsPdfLines(String PdfFile) throws IOException { try { StringBuffer buff = new StringBuffer(); String ExtractedText = null; PdfReader reader = new PdfReader(PdfFile); PdfReaderContentParser parser = new PdfReaderContentParser(reader); TextExtractionStrategy strategy; for (int i = 1; i <= reader.getNumberOfPages(); i++) { strategy = parser.processContent(i, new SimpleTextExtractionStrategy()); ExtractedText = strategy.getResultantText().toString(); buff.append(ExtractedText + "\n"); } String[] LinesArray; LinesArray = buff.toString().split("\n"); reader.close(); return LinesArray; } catch (Exception e) { return null; } } public static int getPdfVersion(String pdffile) throws IOException { BufferedReader fileReader = new BufferedReader(new FileReader(pdffile)); String fileHeader = fileReader.readLine(); fileReader.close(); if (fileHeader.contains("%PDF-1.2")) { return 2; } else if (fileHeader.contains("%PDF-1.3")) { return 3; } else if (fileHeader.contains("%PDF-1.4")) { return 4; } else if (fileHeader.contains("%PDF-1.5")) { return 5; } else if (fileHeader.contains("%PDF-1.6")) { return 6; } else if (fileHeader.contains("%PDF-1.7")) { return 7; } else { return 7; } } /** * Checks the size of the Pdf-file, because some big Pdf Files might cause * exceptions. * * * @param file * (should be Pdf) * @return: boolean * @throws */ /* * I think this method is so complicated because of the test that was build. * Maybe change method in GenericFileAnalysis eventually to embedd those * kinds of tests, too. * * public static boolean checkPdfSize(File file) { boolean toobig = * isFileTooLong(file, DEFAULT_MAX_FILE_LENGTH); if (toobig) { System.out * .println("File is bigger than 16 MB and therefore cannot be measured"); } * return toobig; } * * public static boolean checkPdfSize(String filePath) { File toCheck = new * File (filePath); return checkPdfSize(toCheck); } * * public static boolean isFileTooLong(File toCheck, long maxLength) { * return (toCheck.length() > maxLength); } */ }