package org.activityinfo.core.shared.importing.match.names; import com.google.common.annotations.VisibleForTesting; import static java.lang.Math.max; /** * Scores the similarity between two names names written in a Latin * script. * */ public class LatinPlaceNameScorer { public static final int MINIMUM_STRING_LENGTH_FOR_FUZZY_MATCHING = 1; private static final int NONE = -1; // We store the current names names that we're matching against // in a pair of flyweight classes to void creating bazillions // of little objects in the course of matching private final LatinPlaceName x = new LatinPlaceName(); private final LatinPlaceName y = new LatinPlaceName(); private final LatinWordDistance distanceFunction = new LatinWordDistance(); // Use a flyweight array for tracking permutations private final int permutation[] = new int[LatinPlaceName.MAX_PARTS]; public LatinPlaceNameScorer() { } public double score(String importedValue, String referencedValue) { // quick check... if(importedValue.equals(referencedValue)) { return 1.0; } // if we don't have an exact match, first normalize the // the strings into lists of lowercase parts free of // diacriticals or other messiness x.set(importedValue); y.set(referencedValue); if(x.isEmpty() || y.isEmpty()) { return 0.0; } // Now we have two sets of ordered components, for example: // [T1, T2, T3] and [S1, S2] // we want to know how likely it is that S refers to the // same entity as T. // Names are funny things and subject to a great deal of // violence by humans. There are a number of things that // can happen to a written name: // (1) The same sound may be written differently, perhaps because // of a different transliteration scheme, for example // // [ou]adi => [w]adi // zou[q] bha[nn]ine => zou[k] bha[n]ine // z[ai]toun => z[ei]toun[e] // // (2) Sounds can drift regionally and over time, and these // differences result in new spellings // // // (3) Names that include parts of speech can be reordered or // discarded arbitrarily // // [santa, rosa, city] => [city, of, santa, rosa] // [commune, de, goumera] => [goumera] // // (4) Words can be split or joined // // [bara, sara] => [barassara] // [nema, badenyakafo] => [nema, badenya, kafo] // // So we not only have to deal with fuzzy matching, but we have to deal with it // on multiple levels: // (1) Proportion of parts matching // (2) Combinations of parts // (3) Similarity between non-matching parts // So we start off by running through all the combinations. If we have // [A, B, C] and [X, Y] // We have to expand this into a set of mergings // [A, B, C] [X, Y] // [AB, C] x [XY] // [A, BC] // We can think of each break between parts as a bit, and since // we're only concerned with names names composed of a small // number of parts, we'll use a bit set to iterate through all // the combinations // first try with no merging double score = findBestPermutationScore(); return score; } @VisibleForTesting void init(String xs, String ys) { this.x.set(xs); this.y.set(ys); } /** * Given two names names like X = "COMMUNE DE KAYES" and Y = "KAYES COMMUNE", * score the similarity between each part X[i] and Y[j] and find the best * assignment from i -> j. * * <ul> * <li>score([KAYES, COMMUNE], [COMMUNE, DE])</li> * <li>score([KAYES, COMMUNE], [COMMUNE, KAYES])</li> * <li>score([KAYES, COMMUNE], [DE, KAYES]</li> * <li>score([KAYES, COMMUNE], [KAYES, COMMUNE]</li> * <li>etc</li> * </ul> * @return */ @VisibleForTesting double findBestPermutationScore() { // swap x and y if necessary so that // left.numParts <= right.numParts LatinPlaceName left, right; if(x.partCount() <= y.partCount()) { left = this.x; right = this.y; } else { left = this.y; right = this.x; } // find the similarity between each pair of parts in left and right int leftParts = left.partCount(); int rightParts = right.partCount(); double scores[][] = new double[leftParts][rightParts]; for(int i=0;i<leftParts;++i) { for(int j=0;j<rightParts;++j) { scores[i][j] = similarity(left, i, right, j); } } // now find the best partial permutation among the right name parts, taking // the score double bestScore = 0; // loop through each partial permutation of the right parts PartialPermutations.first(permutation, rightParts); do { double numerator = 0; double denominator = 0; // keep track of the parts from the right that // are not included in this permutation, they // need to be included in the denominator double extraRightParts = right.charCount(); for(int leftPart=0;leftPart<leftParts;++leftPart) { int rightPart = permutation[leftPart]; double score = scores[leftPart][rightPart]; int leftLength = left.charCount(leftPart); int rightLength = right.charCount(rightPart); // we use the minimum length of the word as the weight // to inflating short words that match longer words because // of lots of vowels int minLength = Math.min(leftLength, rightLength); numerator += score * (double)minLength; if(score > 0.0) { denominator += minLength; } else { denominator += leftLength + rightLength; } extraRightParts -= rightLength; } denominator += extraRightParts; bestScore = max(bestScore, numerator / denominator); } while(PartialPermutations.next(permutation, rightParts, leftParts)); return bestScore; } private double similarity(LatinPlaceName left, int leftPartIndex, LatinPlaceName right, int rightPartIndex) { int leftChars = left.charCount(leftPartIndex); int rightChars = right.charCount(rightPartIndex); // first try an exact comparison if(leftChars == rightChars) { boolean matchesExactly = true; for(int i=0;i!=leftChars;++i) { if(left.charAt(leftPartIndex, i) != right.charAt(rightPartIndex, i)) { matchesExactly = false; break; } } if(matchesExactly) { return 1.0; } } boolean numericLeft = left.isPartNumeric(leftPartIndex); boolean numericRight = right.isPartNumeric(rightPartIndex); if(numericLeft && numericRight) { if(left.parsePartAsInteger(leftPartIndex) == right.parsePartAsInteger(rightPartIndex)) { return 1.0; } else { return 0.0; } } else if(numericLeft) { return tryCompareNumericWithAlpha(left, leftPartIndex, right, rightPartIndex); } else if(numericRight) { return tryCompareNumericWithAlpha(right, rightPartIndex, left, leftPartIndex); } // now try an approximate match based on the phonetic shape return distanceFunction.similarity( left.chars, left.partStart(leftPartIndex), left.partStart(leftPartIndex + 1), right.chars, right.partStart(rightPartIndex), right.partStart(rightPartIndex + 1)); } private double tryCompareNumericWithAlpha(LatinPlaceName nameWithNumericPart, int numericPartIndex, LatinPlaceName nameWithAlphaPart, int alphaPartIndex) { int x = nameWithNumericPart.parsePartAsInteger(numericPartIndex); int y = nameWithAlphaPart.tryParsePartAsRomanNumeral(alphaPartIndex); if(x == y) { return 1.0; } else { return 0.0; } } }