/* Copyright (C) 2003-2011 JabRef contributors.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
package net.sf.jabref;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
/**
* This class contains utility methods for duplicate checking of entries.
*/
public class DuplicateCheck {

    /** The overall score at or above which a pair of entries is signalled as a duplicate. */
    public static double duplicateThreshold = 0.75;

    /**
     * Non-required fields are investigated only if the required fields give a score within
     * this distance of {@link #duplicateThreshold}.
     */
    public static double doubtRange = 0.05;

    /** Weighting of the required-field score relative to the optional-field score. */
    final static double reqWeight = 3;

    /** A field counts as equal only when its word-level similarity exceeds this value. */
    private static final double FIELD_MATCH_THRESHOLD = 0.8;

    /** A pair of words counts as a miss when its correlation is below this value. */
    private static final double WORD_MATCH_THRESHOLD = 0.75;

    /**
     * Extra weighting of those fields that are most likely to provide correct duplicate
     * detection. Fields not listed here get a weight of 1.0.
     */
    static HashMap<String,Double> fieldWeights = new HashMap<String, Double>();

    static {
        fieldWeights.put("author", 2.5);
        fieldWeights.put("editor", 2.5);
        fieldWeights.put("title", 3.);
        fieldWeights.put("journal", 2.);
    }

    /**
     * Checks if the two entries represent the same publication.
     *
     * The required fields of the entry type are compared first. Only when their score lies
     * within {@link #doubtRange} of {@link #duplicateThreshold} are the optional fields
     * consulted, combined with the required score as a weighted mean.
     *
     * @param one BibtexEntry
     * @param two BibtexEntry
     * @return true if the entries are considered duplicates of each other
     */
    public static boolean isDuplicate(BibtexEntry one, BibtexEntry two) {
        // Being of the same type is a necessary condition. The type objects are compared
        // by reference, exactly as before.
        if (one.getType() != two.getType()) {
            return false;
        }

        String[] requiredFields = one.getType().getRequiredFields();
        double[] req = (requiredFields == null)
                ? new double[] {0., 0.}
                : compareFieldSet(requiredFields, one, two);
        boolean requiredVerdict = req[0] >= duplicateThreshold;

        // Far from the threshold value: decide on the required fields only.
        if (Math.abs(req[0] - duplicateThreshold) > doubtRange) {
            return requiredVerdict;
        }

        // Close to the threshold value: take the optional fields into account, if any.
        String[] optionalFields = one.getType().getOptionalFields();
        if (optionalFields == null) {
            return requiredVerdict;
        }
        double[] opt = compareFieldSet(optionalFields, one, two);
        double totalWeight = req[1]*reqWeight + opt[1];
        if (totalWeight <= 0) {
            // Neither field set carried any data, so the weighted mean would be 0/0.
            // Fall back to the required-field score instead of comparing against NaN.
            return requiredVerdict;
        }
        double totValue = (reqWeight*req[0]*req[1] + opt[0]*opt[1]) / totalWeight;
        return totValue >= duplicateThreshold;
    }

    /**
     * Compares the given fields of two entries and computes a weighted match ratio.
     *
     * @param fields the names of the fields to compare
     * @param one the first entry
     * @param two the second entry
     * @return a two-element array: [0] is the weight of the matching fields divided by the
     *         weight of all fields that are set in at least one entry, [1] is that total
     *         weight. If no field is set in either entry the result is {0.5, 0.0}.
     */
    private static double[] compareFieldSet(String[] fields, BibtexEntry one, BibtexEntry two) {
        double matchedWeight = 0;
        double totWeights = 0.;
        for (String field : fields) {
            int result = compareSingleField(field, one, two);
            if (result == Util.EMPTY_IN_BOTH) {
                // A field that neither entry has says nothing about similarity.
                continue;
            }
            Double customWeight = fieldWeights.get(field);
            double weight = (customWeight != null) ? customWeight.doubleValue() : 1.0;
            totWeights += weight;
            if (result == Util.EQUAL) {
                matchedWeight += weight;
            }
        }
        if (totWeights > 0) {
            return new double[] {matchedWeight / totWeights, totWeights};
        }
        // No fields present. This points to a possible duplicate?
        return new double[] {0.5, 0.0};
    }

    /**
     * Compares one field of two entries.
     *
     * @param field the name of the field to compare
     * @param one the first entry
     * @param two the second entry
     * @return Util.EMPTY_IN_BOTH, Util.EMPTY_IN_ONE or Util.EMPTY_IN_TWO when the field is
     *         missing from both, the first or the second entry respectively; otherwise
     *         Util.EQUAL or Util.NOT_EQUAL
     */
    private static int compareSingleField(String field, BibtexEntry one, BibtexEntry two) {
        String s1 = one.getField(field), s2 = two.getField(field);
        if (s1 == null) {
            return (s2 == null) ? Util.EMPTY_IN_BOTH : Util.EMPTY_IN_ONE;
        }
        if (s2 == null) {
            return Util.EMPTY_IN_TWO;
        }

        if (field.equals("author") || field.equals("editor")) {
            // Specific for name fields: normalise both name lists through AuthorList
            // (presumably down to last names - see that helper), drop the " and "
            // separators and harmonise case.
            String auth1 = AuthorList.fixAuthor_lastNameOnlyCommas(s1, false)
                    .replaceAll(" and ", " ").toLowerCase();
            String auth2 = AuthorList.fixAuthor_lastNameOnlyCommas(s2, false)
                    .replaceAll(" and ", " ").toLowerCase();
            return similarityVerdict(correlateByWords(auth1, auth2, false));
        }

        if (field.equals("pages")) {
            // Pages can be given with a variety of delimiters, "-", "--", " - ", " -- ".
            // Harmonise these to a simple "-"; after that a test for equality is enough.
            String pages1 = s1.replaceAll("[- ]+","-");
            String pages2 = s2.replaceAll("[- ]+","-");
            return pages1.equals(pages2) ? Util.EQUAL : Util.NOT_EQUAL;
        }

        if (field.equals("journal")) {
            // We do not attempt to harmonise the abbreviation state of journal names, but
            // we remove periods in case they are abbreviated with and without dots.
            // Truncating word comparison lets an abbreviation match the start of the
            // corresponding full word.
            String journal1 = s1.replaceAll("\\.", "").toLowerCase();
            String journal2 = s2.replaceAll("\\.", "").toLowerCase();
            return similarityVerdict(correlateByWords(journal1, journal2, true));
        }

        return similarityVerdict(correlateByWords(s1.toLowerCase(), s2.toLowerCase(), false));
    }

    /**
     * Translates a word-level similarity score into Util.EQUAL or Util.NOT_EQUAL.
     *
     * @param similarity the score returned by {@link #correlateByWords}
     * @return Util.EQUAL if the score exceeds the field match threshold, else Util.NOT_EQUAL
     */
    private static int similarityVerdict(double similarity) {
        return (similarity > FIELD_MATCH_THRESHOLD) ? Util.EQUAL : Util.NOT_EQUAL;
    }

    /**
     * Compares two entries field by field using exact equality of the field values.
     *
     * @param one the first entry
     * @param two the second entry
     * @return the fraction of fields (taken over the union of both entries' fields) whose
     *         values are equal, or 1.01 if every field matches
     */
    public static double compareEntriesStrictly(BibtexEntry one, BibtexEntry two) {
        HashSet<String> allFields = new HashSet<String>();
        allFields.addAll(one.getAllFields());
        allFields.addAll(two.getAllFields());

        int score = 0;
        for (String field : allFields) {
            Object en = one.getField(field), to = two.getField(field);
            boolean bothMissing = (en == null) && (to == null);
            boolean bothEqual = (en != null) && (to != null) && en.equals(to);
            if (bothMissing || bothEqual) {
                score++;
            }
        }
        if (score == allFields.size()) {
            // Just to make sure callers can use score > 1 without trouble.
            return 1.01;
        }
        return ((double) score) / allFields.size();
    }

    /**
     * Goes through all entries in the given database, and if at least one of
     * them is a duplicate of the given entry, as per
     * {@link #isDuplicate(BibtexEntry, BibtexEntry)}, the duplicate is returned.
     * The search is terminated when the first duplicate is found.
     *
     * @param database The database to search.
     * @param entry The entry of which we are looking for duplicates.
     * @return The first duplicate entry found. null if no duplicates are found.
     */
    public static BibtexEntry containsDuplicate(BibtexDatabase database, BibtexEntry entry) {
        for (BibtexEntry other : database.getEntries()) {
            if (isDuplicate(entry, other)) {
                return other; // Duplicate found.
            }
        }
        return null; // No duplicate found.
    }

    /**
     * Compare two strings on the basis of word-by-word correlation analysis.
     *
     * Words are paired by position; surplus words of the longer string are ignored.
     *
     * @param s1 The first string
     * @param s2 The second string
     * @param truncate if true, always truncate the longer of two words to be compared to
     *   harmonize their length. If false, use interpolation to harmonize the strings.
     * @return a value in the interval [0, 1] indicating the degree of match.
     */
    public static double correlateByWords(String s1, String s2, boolean truncate) {
        String[] w1 = s1.split("\\s"),
                w2 = s2.split("\\s");
        int n = Math.min(w1.length, w2.length);
        if (n == 0) {
            // split() drops trailing empty strings, so a whitespace-only input yields no
            // words at all. Without this guard the miss rate would be 0/0 = NaN. Two
            // wordless strings match; a wordless string never matches one with words.
            return (w1.length == w2.length) ? 1.0 : 0.0;
        }
        int misses = 0;
        for (int i = 0; i < n; i++) {
            if (correlateStrings(w1[i], w2[i], truncate) < WORD_MATCH_THRESHOLD) {
                misses++;
            }
        }
        double missRate = ((double) misses) / ((double) n);
        return 1 - missRate;
    }

    /**
     * Computes the correlation coefficient between the character codes of two strings.
     *
     * @param s1 The first string
     * @param s2 The second string
     * @param truncate if true, the longer string is cut to the length of the shorter one;
     *   if false, the shorter string is stretched by linear interpolation instead.
     * @return 1.0 for strings that are identical after length harmonisation, otherwise the
     *   correlation coefficient (which may be negative), or 0 when it is undefined.
     */
    public static double correlateStrings(String s1, String s2, boolean truncate) {
        int minLength = Math.min(s1.length(), s2.length());
        if (truncate && minLength == 1) {
            return s1.charAt(0) == s2.charAt(0) ? 1.0 : 0.0;
        }
        else if (s1.length() == 1 && s2.length() == 1) {
            return s1.equals(s2) ? 1.0 : 0.0;
        }
        else if (minLength == 0) {
            return s1.length() == 0 && s2.length() == 0 ? 1.0 : 0;
        }

        if (truncate) {
            // Harmonize length by truncation:
            if (s1.length() > minLength) {
                s1 = s1.substring(0, minLength);
            }
            if (s2.length() > minLength) {
                s2 = s2.substring(0, minLength);
            }
        }

        // Identical strings always match. This matters for words made of one repeated
        // character (e.g. "11"): their variance is zero, so the correlation coefficient
        // is undefined and would otherwise be reported as 0.
        if (s1.equals(s2)) {
            return 1.0;
        }

        double[] n1 = numberizeString(s1),
                n2 = numberizeString(s2);
        // If truncation is disabled, harmonize length by interpolation:
        if (!truncate) {
            if (n1.length < n2.length) {
                n1 = stretchArray(n1, n2.length);
            } else if (n2.length < n1.length) {
                n2 = stretchArray(n2, n1.length);
            }
        }
        return corrCoef(n1, n2);
    }

    /**
     * Calculates the correlation coefficient of two number series. The loops run over the
     * length of n1, so n2 must be at least as long; callers pass arrays of equal length.
     *
     * @param n1 the first series
     * @param n2 the second series
     * @return the correlation coefficient, or 0 if either series has no variation
     */
    private static double corrCoef(double[] n1, double[] n2) {
        // Calculate mean values:
        double mean1 = 0, mean2 = 0;
        for (int i = 0; i < n1.length; i++) {
            mean1 += n1[i];
            mean2 += n2[i];
        }
        mean1 /= (double) n1.length;
        mean2 /= (double) n2.length;

        // Accumulate the squared deviations and the cross term:
        double sigma1 = 0, sigma2 = 0;
        double corr = 0;
        for (int i = 0; i < n1.length; i++) {
            double dev1 = n1[i] - mean1;
            double dev2 = n2[i] - mean2;
            sigma1 += dev1*dev1;
            sigma2 += dev2*dev2;
            corr += dev1*dev2;
        }
        sigma1 = Math.sqrt(sigma1);
        sigma2 = Math.sqrt(sigma2);
        if (sigma1 > 0 && sigma2 > 0) {
            return corr/(sigma1*sigma2);
        }
        // A constant series has no defined correlation.
        return 0;
    }

    /**
     * Converts a string to an array holding the numeric value of each character.
     *
     * @param s the string to convert
     * @return an array of the same length as s
     */
    private static double[] numberizeString(String s) {
        double[] res = new double[s.length()];
        for (int i = 0; i < res.length; i++) {
            res[i] = (double) s.charAt(i);
        }
        return res;
    }

    /**
     * Stretches an array to a greater length by linear interpolation between neighbouring
     * elements.
     *
     * @param array the array to stretch
     * @param length the wanted length
     * @return a new array of the wanted length, or the given array itself if it is empty
     *         or already at least that long
     */
    private static double[] stretchArray(double[] array, int length) {
        if (length <= array.length || array.length == 0) {
            return array;
        }
        double multip = ((double) array.length)/((double) length);
        double[] newArray = new double[length];
        for (int i = 0; i < newArray.length; i++) {
            double index = ((double) i)*multip;
            int baseInd = (int) Math.floor(index);
            double dist = index - Math.floor(index);
            // Clamp the upper neighbour so the last element interpolates with itself.
            newArray[i] = dist*array[Math.min(array.length-1, baseInd+1)]
                    + (1.0 - dist)*array[baseInd];
        }
        return newArray;
    }

    /**
     * Small manual demonstration that prints the word correlation of three sample titles.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String d1 = "Characterization of Calanus finmarchicus habitat in the North Sea",
                d2 = "Characterization of Calunus finmarchicus habitat in the North Sea",
                d3 = "Characterization of Calanus glacialissss habitat in the South Sea";
        System.out.println(correlateByWords(d1, d2, false));
        System.out.println(correlateByWords(d1, d3, false));
        System.out.println(correlateByWords(d2, d3, false));
    }
}