package org.jabref.logic.bibtex; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Optional; import java.util.Set; import org.jabref.logic.util.strings.StringSimilarity; import org.jabref.model.EntryTypes; import org.jabref.model.database.BibDatabase; import org.jabref.model.database.BibDatabaseMode; import org.jabref.model.entry.AuthorList; import org.jabref.model.entry.BibEntry; import org.jabref.model.entry.EntryType; import org.jabref.model.entry.FieldName; import org.jabref.model.entry.FieldProperty; import org.jabref.model.entry.InternalBibtexFields; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; /** * This class contains utility method for duplicate checking of entries. */ public class DuplicateCheck { public static double duplicateThreshold = 0.75; // The overall threshold to signal a duplicate pair private static final Log LOGGER = LogFactory.getLog(DuplicateCheck.class); /* * Integer values for indicating result of duplicate check (for entries): */ private static final int NOT_EQUAL = 0; private static final int EQUAL = 1; private static final int EMPTY_IN_ONE = 2; private static final int EMPTY_IN_TWO = 3; private static final int EMPTY_IN_BOTH = 4; // Non-required fields are investigated only if the required fields give a value within // the doubt range of the threshold: private static final double DOUBT_RANGE = 0.05; private static final double REQUIRED_WEIGHT = 3; // Weighting of all required fields // Extra weighting of those fields that are most likely to provide correct duplicate detection: private static final Map<String, Double> FIELD_WEIGHTS = new HashMap<>(); static { DuplicateCheck.FIELD_WEIGHTS.put(FieldName.AUTHOR, 2.5); DuplicateCheck.FIELD_WEIGHTS.put(FieldName.EDITOR, 2.5); DuplicateCheck.FIELD_WEIGHTS.put(FieldName.TITLE, 3.); DuplicateCheck.FIELD_WEIGHTS.put(FieldName.JOURNAL, 2.); } private DuplicateCheck() { } /** * Checks if the two entries represent the same publication. * * @param one BibEntry * @param two BibEntry * @return boolean */ public static boolean isDuplicate(BibEntry one, BibEntry two, BibDatabaseMode bibDatabaseMode) { // same identifier if (hasSameIdentifier(one, two)) { return true; } // same entry type if (!one.getType().equals(two.getType())) { return false; } EntryType type = EntryTypes.getTypeOrDefault(one.getType(), bibDatabaseMode); // The check if they have the same required fields: List<String> var = type.getRequiredFieldsFlat(); double[] req; if (var == null) { req = new double[]{0., 0.}; } else { req = DuplicateCheck.compareFieldSet(var, one, two); } if (Math.abs(req[0] - DuplicateCheck.duplicateThreshold) > DuplicateCheck.DOUBT_RANGE) { // Far from the threshold value, so we base our decision on the req. fields only return req[0] >= DuplicateCheck.duplicateThreshold; } // Close to the threshold value, so we take a look at the optional fields, if any: List<String> optionalFields = type.getOptionalFields(); if (optionalFields != null) { double[] opt = DuplicateCheck.compareFieldSet(optionalFields, one, two); double totValue = ((DuplicateCheck.REQUIRED_WEIGHT * req[0] * req[1]) + (opt[0] * opt[1])) / ((req[1] * DuplicateCheck.REQUIRED_WEIGHT) + opt[1]); return totValue >= DuplicateCheck.duplicateThreshold; } return req[0] >= DuplicateCheck.duplicateThreshold; } private static boolean hasSameIdentifier(BibEntry one, BibEntry two) { for (String name : FieldName.getIdentifierFieldNames()) { if (one.getField(name).isPresent() && one.getField(name).equals(two.getField(name))) { return true; } } return false; } private static double[] compareFieldSet(List<String> fields, BibEntry one, BibEntry two) { double res = 0; double totWeights = 0.; for (String field : fields) { double weight; if (DuplicateCheck.FIELD_WEIGHTS.containsKey(field)) { weight = DuplicateCheck.FIELD_WEIGHTS.get(field); } else { weight = 1.0; } totWeights += weight; int result = DuplicateCheck.compareSingleField(field, one, two); if (result == EQUAL) { res += weight; } else if (result == EMPTY_IN_BOTH) { totWeights -= weight; } } if (totWeights > 0) { return new double[]{res / totWeights, totWeights}; } return new double[] {0.5, 0.0}; } private static int compareSingleField(String field, BibEntry one, BibEntry two) { Optional<String> optionalStringOne = one.getField(field); Optional<String> optionalStringTwo = two.getField(field); if (!optionalStringOne.isPresent()) { if (!optionalStringTwo.isPresent()) { return EMPTY_IN_BOTH; } return EMPTY_IN_ONE; } else if (!optionalStringTwo.isPresent()) { return EMPTY_IN_TWO; } // Both strings present String stringOne = optionalStringOne.get(); String stringTwo = optionalStringTwo.get(); if (InternalBibtexFields.getFieldProperties(field).contains(FieldProperty.PERSON_NAMES)) { // Specific for name fields. // Harmonise case: String authorOne = AuthorList.fixAuthorLastNameOnlyCommas(stringOne, false).replace(" and ", " ").toLowerCase(Locale.ROOT); String authorTwo = AuthorList.fixAuthorLastNameOnlyCommas(stringTwo, false).replace(" and ", " ").toLowerCase(Locale.ROOT); double similarity = DuplicateCheck.correlateByWords(authorOne, authorTwo); if (similarity > 0.8) { return EQUAL; } return NOT_EQUAL; } else if (FieldName.PAGES.equals(field)) { // Pages can be given with a variety of delimiters, "-", "--", " - ", " -- ". // We do a replace to harmonize these to a simple "-": // After this, a simple test for equality should be enough: stringOne = stringOne.replaceAll("[- ]+", "-"); stringTwo = stringTwo.replaceAll("[- ]+", "-"); if (stringOne.equals(stringTwo)) { return EQUAL; } return NOT_EQUAL; } else if (FieldName.JOURNAL.equals(field)) { // We do not attempt to harmonize abbreviation state of the journal names, // but we remove periods from the names in case they are abbreviated with // and without dots: stringOne = stringOne.replace(".", "").toLowerCase(Locale.ROOT); stringTwo = stringTwo.replace(".", "").toLowerCase(Locale.ROOT); double similarity = DuplicateCheck.correlateByWords(stringOne, stringTwo); if (similarity > 0.8) { return EQUAL; } return NOT_EQUAL; } else { stringOne = stringOne.toLowerCase(Locale.ROOT); stringTwo = stringTwo.toLowerCase(Locale.ROOT); double similarity = DuplicateCheck.correlateByWords(stringOne, stringTwo); if (similarity > 0.8) { return EQUAL; } return NOT_EQUAL; } } public static double compareEntriesStrictly(BibEntry one, BibEntry two) { Set<String> allFields = new HashSet<>(); allFields.addAll(one.getFieldNames()); allFields.addAll(two.getFieldNames()); int score = 0; for (String field : allFields) { Optional<String> stringOne = one.getField(field); Optional<String> stringTwo = two.getField(field); if (stringOne.equals(stringTwo)) { score++; } } if (score == allFields.size()) { return 1.01; // Just to make sure we can // use score>1 without // trouble. } return (double) score / allFields.size(); } /** * Goes through all entries in the given database, and if at least one of * them is a duplicate of the given entry, as per * Util.isDuplicate(BibEntry, BibEntry), the duplicate is returned. * The search is terminated when the first duplicate is found. * * @param database The database to search. * @param entry The entry of which we are looking for duplicates. * @return The first duplicate entry found. null if no duplicates are found. */ public static Optional<BibEntry> containsDuplicate(BibDatabase database, BibEntry entry, BibDatabaseMode bibDatabaseMode) { for (BibEntry other : database.getEntries()) { if (DuplicateCheck.isDuplicate(entry, other, bibDatabaseMode)) { return Optional.of(other); // Duplicate found. } } return Optional.empty(); // No duplicate found. } /** * Compare two strings on the basis of word-by-word correlation analysis. * * @param s1 The first string * @param s2 The second string * @return a value in the interval [0, 1] indicating the degree of match. */ public static double correlateByWords(String s1, String s2) { String[] w1 = s1.split("\\s"); String[] w2 = s2.split("\\s"); int n = Math.min(w1.length, w2.length); int misses = 0; for (int i = 0; i < n; i++) { double corr = similarity(w1[i], w2[i]); if (corr < 0.75) { misses++; } } double missRate = (double) misses / (double) n; return 1 - missRate; } /** * Calculates the similarity (a number within 0 and 1) between two strings. * http://stackoverflow.com/questions/955110/similarity-string-comparison-in-java */ private static double similarity(String s1, String s2) { String longer = s1; String shorter = s2; if (s1.length() < s2.length()) { longer = s2; shorter = s1; } int longerLength = longer.length(); // both strings are zero length if (longerLength == 0) { return 1.0; } double sim = (longerLength - new StringSimilarity().editDistanceIgnoreCase(longer, shorter)) / (double) longerLength; LOGGER.debug("Longer string: " + longer + " Shorter string: " + shorter + " Similarity: " + sim); return sim; } }