/* * PDFComparator * * Copyright (c) 2012, E&E information consultants AG. All rights reserved. * Authors: * Peter Jentsch * Nico Hezel * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, * MA 02110-1301 USA */ package de.ee.hezel; import java.io.File; import java.util.Set; import org.apache.log4j.Logger; import de.ee.hezel.logger.DifferenceLogger; import de.ee.hezel.model.PDFInfoHolder; import de.ee.hezel.model.PDFInfoHolder.DifferenceType; import de.ee.hezel.model.PDFPageHolder; import de.ee.hezel.util.AbstractJob; import de.ee.hezel.util.Job; import de.ee.hezel.util.JobListener; import de.ee.hezel.util.JobQueue; /** * @author hezeln * */ public class PDFComparator implements JobListener { static Logger log = Logger.getLogger(PDFComparator.class.getName()); static final int PARALLEL_JOBS = Integer.getInteger(PDFComparator.class.getName() + ".PARALLEL_JOBS", 4); private File logPath; private final int compareType; private boolean foundDifference; public PDFComparator(File logPath, int compareType) { this.compareType = compareType; this.logPath = logPath; } /** * parallelized the comparison tasks * * @param path1 * @param path2 * @param outputDir * @param prefix * @return */ public boolean run(File path1, File path2, File outputDir, String prefix) { long start = System.currentTimeMillis(); // is any pdf pair different foundDifference = false; // get all pdf pairs Set<PDFInfoHolder> pdfInfoHolders = PDFCorpusAnalyser.getSimplePDFInfoHolders(path1, path2, prefix); // start 4 different tasks at the same time JobQueue queue = new JobQueue(PARALLEL_JOBS); for (PDFInfoHolder pdfInfoHolder : pdfInfoHolders) { CompareJob compareJob = new CompareJob(outputDir, pdfInfoHolder); compareJob.addJobListener(this); queue.addJob(compareJob); } // waiting for all jobs to be finished while (queue.hasJobs()) { try { synchronized (queue) { queue.wait(); } } catch (InterruptedException e) { log.error("Interrupted while waiting for jobs to finish", e); } } long end = System.currentTimeMillis(); log.info("Execution time: "+ (end-start)+"ms"); return foundDifference; } @Override public void finished(Job job) { foundDifference |= ((CompareJob) job).hasDifference(); } private class CompareJob extends AbstractJob { private DifferenceLogger dlog; private File outputDir; private boolean foundDifference; private PDFInfoHolder pdfInfoHolder; private PDFVisualComparator pdfVisualComparator; private PDFVisualiseDifference pdfVisualiseDifference; private PDFStructureComparator pdfStructureComparator; private PDFCorpusAnalyser pdfCorpusAnaliser; CompareJob(File outputDir, PDFInfoHolder pdfInfoHolder) { this.outputDir = outputDir; this.pdfInfoHolder = pdfInfoHolder; // each jobs need its own result logger // and logger for the difference-images dlog = new DifferenceLogger(logPath, pdfInfoHolder.getFilename()); pdfCorpusAnaliser = new PDFCorpusAnalyser(dlog); pdfVisualComparator = new PDFVisualComparator(outputDir, dlog, pdfInfoHolder); pdfVisualiseDifference = new PDFVisualiseDifference(outputDir, dlog, pdfInfoHolder); pdfStructureComparator = new PDFStructureComparator((compareType == 1), dlog, pdfInfoHolder); } /** * Start the analyze process. * Find and mark the differences. * Output them in a difference-image * @throws Exception */ @Override protected void executeJobAction() { long start1 = System.currentTimeMillis(); try { // load the pdf file content pdfInfoHolder.loadPDFFiles(); log.info(pdfInfoHolder.getFilename()+": Process "+pdfInfoHolder.getFilename()+".pdf"); // Analyze the content of the pdf document log.info(pdfInfoHolder.getFilename()+": analyse PDF structure ..."); pdfCorpusAnaliser.analyse(pdfInfoHolder); // compare SIMPLE or STRUCTURAL log.info(pdfInfoHolder.getFilename()+": compare PDF structure ..."); pdfStructureComparator.compare(); // compare VISUAL and print the result or simply display the already found differences log.info(pdfInfoHolder.getFilename()+": visualise differences ..."); if(compareType == 3) pdfVisualComparator.compare(); else pdfVisualiseDifference.visualise(); // print the results printResult(pdfInfoHolder); // found differences!? foundDifference = pdfInfoHolder.isDifferent(); } catch (Exception e) { log.error(pdfInfoHolder.getFilename()+":"+e.getMessage(), e); } // calc time needed long end1 = System.currentTimeMillis(); log.info(pdfInfoHolder.getFilename()+": processing took "+ (end1-start1)+"ms"); log.info(""); // release all resources releaseResources(); } public boolean hasDifference() { return foundDifference; } public void releaseResources() { if(dlog != null) dlog.releaseResources(); // are not needed anymore dlog = null; pdfInfoHolder.releasePDFFiles(); pdfInfoHolder = null; pdfCorpusAnaliser = null; pdfVisualComparator = null; pdfVisualiseDifference = null; pdfStructureComparator = null; } } private void printResult(PDFInfoHolder pdfInfoHolder) { if(pdfInfoHolder.isDifferent() && pdfInfoHolder.getDifferent() != DifferenceType.MISSINGDOCUMENT) { String differencesOnPage = ""; for (PDFPageHolder pdfPageHolder : pdfInfoHolder.getPDFStructure1().getPageHolders()) { // get counter part pdf to check for differences PDFPageHolder pdfPageHolder2 = pdfInfoHolder.getPDFStructure2().getPageHolder(pdfPageHolder.getPageNumber()); if(pdfPageHolder.isDifferent() || (pdfPageHolder2 != null && pdfPageHolder2.isDifferent())) differencesOnPage += (pdfPageHolder.getPageNumber()+1)+","; } log.info(pdfInfoHolder.getFilename()+": found differences on pages: "+differencesOnPage); } else log.info(pdfInfoHolder.getFilename()+": no differences found"); } }