DuplicateCheck.java example

Explorer
jabref-master
- src
package org.jabref.logic.bibtex;

import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.Set;

import org.jabref.logic.util.strings.StringSimilarity;
import org.jabref.model.EntryTypes;
import org.jabref.model.database.BibDatabase;
import org.jabref.model.database.BibDatabaseMode;
import org.jabref.model.entry.AuthorList;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.EntryType;
import org.jabref.model.entry.FieldName;
import org.jabref.model.entry.FieldProperty;
import org.jabref.model.entry.InternalBibtexFields;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * This class contains utility method for duplicate checking of entries.
 */
public class DuplicateCheck {
    public static double duplicateThreshold = 0.75; // The overall threshold to signal a duplicate pair

    private static final Log LOGGER = LogFactory.getLog(DuplicateCheck.class);
    /*
     * Integer values for indicating result of duplicate check (for entries):
     */
    private static final int NOT_EQUAL = 0;
    private static final int EQUAL = 1;
    private static final int EMPTY_IN_ONE = 2;
    private static final int EMPTY_IN_TWO = 3;

    private static final int EMPTY_IN_BOTH = 4;
    // Non-required fields are investigated only if the required fields give a value within
    // the doubt range of the threshold:
    private static final double DOUBT_RANGE = 0.05;

    private static final double REQUIRED_WEIGHT = 3; // Weighting of all required fields

    // Extra weighting of those fields that are most likely to provide correct duplicate detection:
    private static final Map<String, Double> FIELD_WEIGHTS = new HashMap<>();

    static {
        DuplicateCheck.FIELD_WEIGHTS.put(FieldName.AUTHOR, 2.5);
        DuplicateCheck.FIELD_WEIGHTS.put(FieldName.EDITOR, 2.5);
        DuplicateCheck.FIELD_WEIGHTS.put(FieldName.TITLE, 3.);
        DuplicateCheck.FIELD_WEIGHTS.put(FieldName.JOURNAL, 2.);
    }

    private DuplicateCheck() { }

    /**
     * Checks if the two entries represent the same publication.
     *
     * @param one BibEntry
     * @param two BibEntry
     * @return boolean
     */
    public static boolean isDuplicate(BibEntry one, BibEntry two, BibDatabaseMode bibDatabaseMode) {
        // same identifier
        if (hasSameIdentifier(one, two)) {
            return true;
        }

        // same entry type
        if (!one.getType().equals(two.getType())) {
            return false;
        }

        EntryType type = EntryTypes.getTypeOrDefault(one.getType(), bibDatabaseMode);
        // The check if they have the same required fields:
        List<String> var = type.getRequiredFieldsFlat();
        double[] req;
        if (var == null) {
            req = new double[]{0., 0.};
        } else {
            req = DuplicateCheck.compareFieldSet(var, one, two);
        }

        if (Math.abs(req[0] - DuplicateCheck.duplicateThreshold) > DuplicateCheck.DOUBT_RANGE) {
            // Far from the threshold value, so we base our decision on the req. fields only
            return req[0] >= DuplicateCheck.duplicateThreshold;
        }
        // Close to the threshold value, so we take a look at the optional fields, if any:
        List<String> optionalFields = type.getOptionalFields();
        if (optionalFields != null) {
            double[] opt = DuplicateCheck.compareFieldSet(optionalFields, one, two);
            double totValue = ((DuplicateCheck.REQUIRED_WEIGHT * req[0] * req[1]) + (opt[0] * opt[1])) / ((req[1] * DuplicateCheck.REQUIRED_WEIGHT) + opt[1]);
            return totValue >= DuplicateCheck.duplicateThreshold;
        }
        return req[0] >= DuplicateCheck.duplicateThreshold;
    }

    private static boolean hasSameIdentifier(BibEntry one, BibEntry two) {
        for (String name : FieldName.getIdentifierFieldNames()) {
            if (one.getField(name).isPresent() && one.getField(name).equals(two.getField(name))) {
                return true;
            }
        }
        return false;
    }

    private static double[] compareFieldSet(List<String> fields, BibEntry one, BibEntry two) {
        double res = 0;
        double totWeights = 0.;
        for (String field : fields) {
            double weight;
            if (DuplicateCheck.FIELD_WEIGHTS.containsKey(field)) {
                weight = DuplicateCheck.FIELD_WEIGHTS.get(field);
            } else {
                weight = 1.0;
            }
            totWeights += weight;
            int result = DuplicateCheck.compareSingleField(field, one, two);
            if (result == EQUAL) {
                res += weight;
            } else if (result == EMPTY_IN_BOTH) {
                totWeights -= weight;
            }
        }
        if (totWeights > 0) {
            return new double[]{res / totWeights, totWeights};
        }
        return new double[] {0.5, 0.0};
    }

    private static int compareSingleField(String field, BibEntry one, BibEntry two) {
        Optional<String> optionalStringOne = one.getField(field);
        Optional<String> optionalStringTwo = two.getField(field);
        if (!optionalStringOne.isPresent()) {
            if (!optionalStringTwo.isPresent()) {
                return EMPTY_IN_BOTH;
            }
            return EMPTY_IN_ONE;
        } else if (!optionalStringTwo.isPresent()) {
            return EMPTY_IN_TWO;
        }

        // Both strings present
        String stringOne = optionalStringOne.get();
        String stringTwo = optionalStringTwo.get();

        if (InternalBibtexFields.getFieldProperties(field).contains(FieldProperty.PERSON_NAMES)) {
            // Specific for name fields.
            // Harmonise case:
            String authorOne = AuthorList.fixAuthorLastNameOnlyCommas(stringOne, false).replace(" and ", " ").toLowerCase(Locale.ROOT);
            String authorTwo = AuthorList.fixAuthorLastNameOnlyCommas(stringTwo, false).replace(" and ", " ").toLowerCase(Locale.ROOT);
            double similarity = DuplicateCheck.correlateByWords(authorOne, authorTwo);
            if (similarity > 0.8) {
                return EQUAL;
            }
            return NOT_EQUAL;
        } else if (FieldName.PAGES.equals(field)) {
            // Pages can be given with a variety of delimiters, "-", "--", " - ", " -- ".
            // We do a replace to harmonize these to a simple "-":
            // After this, a simple test for equality should be enough:
            stringOne = stringOne.replaceAll("[- ]+", "-");
            stringTwo = stringTwo.replaceAll("[- ]+", "-");
            if (stringOne.equals(stringTwo)) {
                return EQUAL;
            }
            return NOT_EQUAL;
        } else if (FieldName.JOURNAL.equals(field)) {
            // We do not attempt to harmonize abbreviation state of the journal names,
            // but we remove periods from the names in case they are abbreviated with
            // and without dots:
            stringOne = stringOne.replace(".", "").toLowerCase(Locale.ROOT);
            stringTwo = stringTwo.replace(".", "").toLowerCase(Locale.ROOT);
            double similarity = DuplicateCheck.correlateByWords(stringOne, stringTwo);
            if (similarity > 0.8) {
                return EQUAL;
            }
            return NOT_EQUAL;
        } else {
            stringOne = stringOne.toLowerCase(Locale.ROOT);
            stringTwo = stringTwo.toLowerCase(Locale.ROOT);
            double similarity = DuplicateCheck.correlateByWords(stringOne, stringTwo);
            if (similarity > 0.8) {
                return EQUAL;
            }
            return NOT_EQUAL;
        }
    }

    public static double compareEntriesStrictly(BibEntry one, BibEntry two) {
        Set<String> allFields = new HashSet<>();
        allFields.addAll(one.getFieldNames());
        allFields.addAll(two.getFieldNames());

        int score = 0;
        for (String field : allFields) {
            Optional<String> stringOne = one.getField(field);
            Optional<String> stringTwo = two.getField(field);
            if (stringOne.equals(stringTwo)) {
                score++;
            }
        }
        if (score == allFields.size()) {
            return 1.01; // Just to make sure we can
            // use score>1 without
            // trouble.
        }
        return (double) score / allFields.size();
    }

    /**
     * Goes through all entries in the given database, and if at least one of
     * them is a duplicate of the given entry, as per
     * Util.isDuplicate(BibEntry, BibEntry), the duplicate is returned.
     * The search is terminated when the first duplicate is found.
     *
     * @param database The database to search.
     * @param entry    The entry of which we are looking for duplicates.
     * @return The first duplicate entry found. null if no duplicates are found.
     */
    public static Optional<BibEntry> containsDuplicate(BibDatabase database, BibEntry entry, BibDatabaseMode bibDatabaseMode) {
        for (BibEntry other : database.getEntries()) {
            if (DuplicateCheck.isDuplicate(entry, other, bibDatabaseMode)) {
                return Optional.of(other); // Duplicate found.
            }
        }
        return Optional.empty(); // No duplicate found.
    }

    /**
     * Compare two strings on the basis of word-by-word correlation analysis.
     *
     * @param s1       The first string
     * @param s2       The second string
     * @return a value in the interval [0, 1] indicating the degree of match.
     */
    public static double correlateByWords(String s1, String s2) {
        String[] w1 = s1.split("\\s");
        String[] w2 = s2.split("\\s");
        int n = Math.min(w1.length, w2.length);
        int misses = 0;
        for (int i = 0; i < n; i++) {
            double corr = similarity(w1[i], w2[i]);
            if (corr < 0.75) {
                misses++;
            }
        }
        double missRate = (double) misses / (double) n;
        return 1 - missRate;
    }


    /**
     * Calculates the similarity (a number within 0 and 1) between two strings.
     * http://stackoverflow.com/questions/955110/similarity-string-comparison-in-java
     */
    private static double similarity(String s1, String s2) {
        String longer = s1;
        String shorter = s2;

        if (s1.length() < s2.length()) {
            longer = s2;
            shorter = s1;
        }

        int longerLength = longer.length();
        // both strings are zero length
        if (longerLength == 0) {
            return 1.0;
        }
        double sim = (longerLength - new StringSimilarity().editDistanceIgnoreCase(longer, shorter)) / (double) longerLength;
        LOGGER.debug("Longer string: " + longer + " Shorter string: " + shorter + " Similarity: " + sim);
        return sim;
    }
}