/*
* PDFStructureComparator
*
* Copyright (c) 2012, E&E information consultants AG. All rights reserved.
* Authors:
* Peter Jentsch
* Nico Hezel
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301 USA
*/
package de.ee.hezel;
import java.awt.geom.Rectangle2D;
import java.text.DecimalFormat;
import de.ee.hezel.logger.ICompareLogger;
import de.ee.hezel.model.PDFHolder;
import de.ee.hezel.model.PDFInfoHolder;
import de.ee.hezel.model.PDFPageHolder;
import de.ee.hezel.model.PDFInfoHolder.DifferenceType;
import de.ee.hezel.model.pdfelemente.PDFEntryHolder;
import de.ee.hezel.model.pdfelemente.PDFTextHolder;
/**
* Compare the pdf structures and find differences
*
* Small changes (roughly more than 10% difference per pdf element)
* are detected and recorded.
*
* If a "SIMPLE" comparison is requested, the horizontal position
* of the elements is ignored.
*
* @author hezeln
*
*/
public class PDFStructureComparator extends AbstractPDFCompare {

    /** Text elements whose height is at most this value are treated as broken. */
    private static final double MALFORMED_MAX_HEIGHT = 1;

    /** Required coverage for everything that is not a text element. */
    private static final float MIN_DEFAULT_COVERAGE = 0.99f;

    /** Required coverage for text elements in SIMPLE mode. */
    private static final float MIN_SIMPLE_TEXT_COVERAGE = 0.65f;

    /** Required coverage for text elements in STRUCTURAL mode. */
    private static final float MIN_STRUCTURAL_TEXT_COVERAGE = 0.85f;

    /** Amount by which the required coverage is lowered for broken text elements. */
    private static final float MALFORMED_REDUCTION = 0.1f;

    private final boolean isSimpleComparison;
    private final PDFInfoHolder pdfInfoHolder;

    /**
     * Create a comparator for the two pdf structures held by the given info holder.
     *
     * @param isSimpleComparison true for SIMPLE mode (only vertical position and size are compared)
     * @param pdfih holder of the two pdf structures; also receives the result
     */
    public PDFStructureComparator(boolean isSimpleComparison, PDFInfoHolder pdfih) {
        this.isSimpleComparison = isSimpleComparison;
        this.pdfInfoHolder = pdfih;
    }

    /**
     * Create a comparator and register the logger which receives the difference messages.
     *
     * @param isSimpleComparison true for SIMPLE mode (only vertical position and size are compared)
     * @param diffLog logger passed to setDifferenceLogger
     * @param pdfih holder of the two pdf structures; also receives the result
     */
    public PDFStructureComparator(boolean isSimpleComparison, ICompareLogger diffLog, PDFInfoHolder pdfih) {
        this(isSimpleComparison, pdfih);
        setDifferenceLogger(diffLog);
    }

    /**
     * Compare the two pdf structures of the info holder and find differences.
     *
     * Nothing is done if the info holder is already marked as different
     * (e.g. different page count). If one of the two structures is missing,
     * the info holder is marked as different with DifferenceType.VISUAL.
     * Otherwise the structures are compared in both directions, so elements
     * which exist only in one of the two documents are found in either case.
     */
    public void compare() {
        // already found differences (e.g. different page count)
        if (pdfInfoHolder.isDifferent()) {
            return;
        }

        PDFHolder pdfHolder1 = pdfInfoHolder.getPDFStructure1();
        PDFHolder pdfHolder2 = pdfInfoHolder.getPDFStructure2();

        // missing the counter part
        if (pdfHolder1 == null || pdfHolder2 == null) {
            pdfInfoHolder.setDifferent(DifferenceType.VISUAL);
            return;
        }

        // compare the structures, in both directions
        comparePDFHolder(pdfHolder1, pdfHolder2);
        comparePDFHolder(pdfHolder2, pdfHolder1);

        // check if a difference was found on one of the pages.
        // if so mark the entire pdf as different
        pdfInfoHolder.checkDifference();
    }

    /**
     * Look up every element of the first document in the second document.
     * Elements without a sufficiently similar counterpart are marked as
     * different and reported to the difference logger.
     *
     * @param pdfHolder1 document whose elements are searched for
     * @param pdfHolder2 document in which the counterparts are searched
     */
    private void comparePDFHolder(PDFHolder pdfHolder1, PDFHolder pdfHolder2) {
        // one formatter for all messages of this run
        DecimalFormat df = new DecimalFormat("####.###");

        for (PDFPageHolder pdfPageHolder1 : pdfHolder1.getPageHolders()) {
            // get the same page from the other pdf document
            PDFPageHolder pdfPageHolder2 = pdfHolder2.getPageHolder(pdfPageHolder1.getPageNumber());
            if (pdfPageHolder2 == null) {
                // page number is printed like in the element messages (number + 1)
                diff.log(pdfInfoHolder.getFilename() + ": page " + (pdfPageHolder1.getPageNumber() + 1)
                        + " missing in other pdf");
                // keep going, the remaining pages still have to be compared
                // NOTE(review): the missing page is only logged here, nothing is
                // marked as different - confirm whether the caller catches this case
                continue;
            }

            // run thru all structure elements for this page
            for (PDFEntryHolder pdfEntryHolder1 : pdfPageHolder1.getElements()) {
                // try to find the same element at the same page of the other pdf
                if (findEntryHolder(pdfEntryHolder1, pdfPageHolder2) != null) {
                    continue;
                }

                // no valid element found, assume difference
                pdfEntryHolder1.setDifferent(true);
                diff.log(describeMissingElement(pdfEntryHolder1, pdfPageHolder1, df));
            }
        }
    }

    /**
     * Build the log message for an element without counterpart.
     *
     * @param entry the element which could not be found
     * @param page the page the element belongs to
     * @param df formatter for position and size
     * @return the message for the difference logger
     */
    private String describeMissingElement(PDFEntryHolder entry, PDFPageHolder page, DecimalFormat df) {
        String what;
        if (entry instanceof PDFTextHolder) {
            what = "text \"" + ((PDFTextHolder) entry).getText() + "\"";
        } else {
            what = "image";
        }

        return pdfInfoHolder.getFilename() + ": Could not find smiliar " + what
                + " on page " + (page.getPageNumber() + 1)
                + " at position " + df.format(entry.getX()) + " | " + df.format(entry.getY())
                + " with size " + df.format(entry.getWidth()) + " width and "
                + df.format(entry.getHeight()) + " height";
    }

    /**
     * Try to find an element in the given page which is as similar as
     * possible to the given element
     *
     * similar means:
     * - same element class (e.g. image, text)
     * - same text, ignoring case (if text element)
     * - same horizontal border situation (see horziontalPositionFromElementInPage)
     * - sufficient coverage of size and position (see hasSufficientSimilarity)
     *
     * @param pdfEntryHolderSearch the element to search for
     * @param pdfPageHolder the page of the other document
     * @return the best matching element or null if none is similar enough
     */
    private PDFEntryHolder findEntryHolder(PDFEntryHolder pdfEntryHolderSearch, PDFPageHolder pdfPageHolder) {
        boolean bestIsMalformed = false;
        float bestCoverage = 0f;
        PDFEntryHolder bestEntryHolder = null;

        for (PDFEntryHolder candidate : pdfPageHolder.getElements()) {
            // only elements of exactly the same class can match
            if (pdfEntryHolderSearch.getClass() != candidate.getClass()) {
                continue;
            }

            // check if one of the elements is broken
            boolean isMalformed = isMalformedTextHolder(pdfEntryHolderSearch, candidate);

            // how much does the elements cover each other
            float areaCoverage = calcAreaCoverage(pdfEntryHolderSearch, candidate, isMalformed);

            if (pdfEntryHolderSearch instanceof PDFTextHolder) {
                String searchText = ((PDFTextHolder) pdfEntryHolderSearch).getText();
                String text = ((PDFTextHolder) candidate).getText();

                // different text can never be the same element
                if (!searchText.equalsIgnoreCase(text)) {
                    areaCoverage = 0.0f;
                }
            }

            // check if the element is cut by the page borders
            // NOTE(review): the searched element is measured against the page of
            // the OTHER document, its own page is not available here
            if (horziontalPositionFromElementInPage(candidate, pdfPageHolder)
                    != horziontalPositionFromElementInPage(pdfEntryHolderSearch, pdfPageHolder)) {
                areaCoverage = 0.0f;
            }

            // remember the best element
            if (bestCoverage < areaCoverage) {
                bestCoverage = areaCoverage;
                bestEntryHolder = candidate;
                bestIsMalformed = isMalformed;
            }
        }

        // check if the result is within the requirements
        if (!hasSufficientSimilarity(bestCoverage, bestEntryHolder, bestIsMalformed)) {
            return null;
        }
        return bestEntryHolder;
    }

    /**
     * Calculate the horizontal position of the element.
     * If the element starts left of the page (x below 0) -1 gets returned.
     * If the right edge of the element lies beyond the page width 1 gets
     * returned. In all other cases 0 gets returned.
     *
     * @param pdfEntryHolder the element
     * @param pdfPageHolder the page which provides the width
     * @return -1, 0 or 1
     */
    private int horziontalPositionFromElementInPage(PDFEntryHolder pdfEntryHolder, PDFPageHolder pdfPageHolder) {
        if (pdfEntryHolder.getX() < 0) {
            return -1;
        }
        if (pdfEntryHolder.getX() + pdfEntryHolder.getWidth() > pdfPageHolder.getPageWidth()) {
            return 1;
        }
        return 0;
    }

    /**
     * Text elements are sometimes broken. Their height is 0 (or at most 1).
     * Two text elements with the same text (ignoring case) are treated as a
     * malformed pair if exactly one of them has such a height.
     *
     * @param entryHolder1 first element
     * @param entryHolder2 second element
     * @return true if both are text elements with the same text and exactly one is broken
     */
    private boolean isMalformedTextHolder(PDFEntryHolder entryHolder1, PDFEntryHolder entryHolder2) {
        // both have to be text elements, otherwise the cast below would fail
        if (!(entryHolder1 instanceof PDFTextHolder) || !(entryHolder2 instanceof PDFTextHolder)) {
            return false;
        }

        // if the text is the same, it might be the same element
        String text1 = ((PDFTextHolder) entryHolder1).getText();
        String text2 = ((PDFTextHolder) entryHolder2).getText();
        if (!text1.equalsIgnoreCase(text2)) {
            return false;
        }

        boolean firstBroken = entryHolder1.getHeight() <= MALFORMED_MAX_HEIGHT;
        boolean secondBroken = entryHolder2.getHeight() <= MALFORMED_MAX_HEIGHT;
        return firstBroken != secondBroken;
    }

    /**
     * SIMPLE Mode:
     * Image should have a coverage of 99%. Text only 65%
     * or 55% if the text element was broken.
     *
     * STRUCTURAL Mode:
     * Images should have a coverage of 99%. Text with 5 or more
     * letters needs 85%. For every letter less than 5 the requirement
     * gets reduced by 2%.
     *
     * If the element was broken, another 10% gets reduced,
     * independent of the text length.
     *
     * @param coverage the coverage of the best candidate
     * @param entryHolder the best candidate, null if there was none
     * @param isMalformed true if the candidate and the searched element are a malformed pair
     * @return true if the coverage is above the required minimum
     */
    private boolean hasSufficientSimilarity(float coverage, PDFEntryHolder entryHolder, boolean isMalformed) {
        float minCoverage = MIN_DEFAULT_COVERAGE;

        // text does have different min coverage depending on their length
        if (entryHolder instanceof PDFTextHolder) {
            // broken text elements get 10% extra in both modes
            float malformedReduction = isMalformed ? MALFORMED_REDUCTION : 0.0f;

            if (isSimpleComparison) {
                // SIMPLE mode
                minCoverage = MIN_SIMPLE_TEXT_COVERAGE - malformedReduction;
            } else {
                // STRUCTURAL mode
                String text = ((PDFTextHolder) entryHolder).getText();

                // for each char less than 5 the coverage gets reduced by 2%
                // e.g. 1 letter text = 8% reduction
                //      4 letter text = 2% reduction
                float reduceCoverage = (float) (10 - text.length() * 2) / 100;

                // long text must not raise the requirement. clamp BEFORE the
                // malformed reduction is added, otherwise long text would lose it
                if (reduceCoverage < 0) {
                    reduceCoverage = 0;
                }
                reduceCoverage += malformedReduction;

                minCoverage = MIN_STRUCTURAL_TEXT_COVERAGE - reduceCoverage;
            }
        }

        return coverage > minCoverage;
    }

    /**
     * SIMPLE mode
     * Calculate how strong the area of the entryholder cover
     * each other. Consider only the vertical position and size.
     *
     * STRUCTURAL mode
     * Calculate how strong the area of the entryholder cover
     * each other.
     *
     * In both modes the smaller one of the two coverage values gets
     * returned, because a small element can lie in a bigger one and
     * would have a coverage of 100%.
     *
     * @param entryHolder1 first element
     * @param entryHolder2 second element
     * @param isMalformed true if the height of the broken element should be repaired first
     * @return coverage between 0 (no overlap) and 1 (identical area)
     */
    private float calcAreaCoverage(PDFEntryHolder entryHolder1, PDFEntryHolder entryHolder2, boolean isMalformed) {
        double height1 = entryHolder1.getHeight();
        double height2 = entryHolder2.getHeight();
        double y1 = entryHolder1.getY();
        double y2 = entryHolder2.getY();

        // repair if possible the height of broken text elements
        if (isMalformed) {
            // this is quite optimistic strategy. but if both elements
            // lie away from each other, those additional pixels will not help them
            if (height1 <= MALFORMED_MAX_HEIGHT && height2 > MALFORMED_MAX_HEIGHT) {
                // take the height of the intact element and move y up by the added height
                y1 -= height2 - height1;
                height1 = height2;
            } else if (height1 > MALFORMED_MAX_HEIGHT && height2 <= MALFORMED_MAX_HEIGHT) {
                y2 -= height1 - height2;
                height2 = height1;
            }
        }

        if (isSimpleComparison) {
            return verticalCoverage(y1, height1, y2, height2);
        }

        Rectangle2D rect1 = new Rectangle2D.Double(entryHolder1.getX(), y1, entryHolder1.getWidth(), height1);
        Rectangle2D rect2 = new Rectangle2D.Double(entryHolder2.getX(), y2, entryHolder2.getWidth(), height2);
        return rectangleCoverage(rect1, rect2);
    }

    /**
     * Coverage of two vertical ranges [y, y + height].
     *
     * The overlap is the distance between the larger start and the smaller
     * end. This is also correct if one range lies completely inside the other.
     *
     * @return the smaller one of overlap/height1 and overlap/height2, 0 without overlap
     */
    private float verticalCoverage(double y1, double height1, double y2, double height2) {
        double coveredHeight = Math.min(y1 + height1, y2 + height2) - Math.max(y1, y2);

        // no overlap. a positive overlap is never larger than one of the
        // heights, so both divisions below are safe
        if (coveredHeight <= 0) {
            return 0.0f;
        }

        // how strong is the coverage for each entry holder
        return (float) Math.min(coveredHeight / height1, coveredHeight / height2);
    }

    /**
     * Coverage of two rectangles.
     *
     * @return the smaller one of overlap/area1 and overlap/area2, 0 without overlap
     */
    private float rectangleCoverage(Rectangle2D rect1, Rectangle2D rect2) {
        // false as well if one of the rectangles is empty, so no area below is 0
        if (!rect1.intersects(rect2)) {
            return 0.0f;
        }

        // calc the overlapping area, in double precision like the inputs
        Rectangle2D overlap = new Rectangle2D.Double();
        Rectangle2D.intersect(rect1, rect2, overlap);

        // compare covered area with original area
        double overlapArea = overlap.getWidth() * overlap.getHeight();
        double coverageRect1 = overlapArea / (rect1.getWidth() * rect1.getHeight());
        double coverageRect2 = overlapArea / (rect2.getWidth() * rect2.getHeight());

        return (float) Math.min(coverageRect1, coverageRect2);
    }
}