package org.activityinfo.geoadmin;
import com.google.common.base.Strings;
import org.activityinfo.geoadmin.util.JaroWinklerDistance;
import org.apache.commons.lang3.StringUtils;
/**
* Utilities for matching place names.
*/
public class PlaceNames {
//
// /**
// * Calculates the similarity between two names based on Levenshtein edit
// * distance on a scale of 0 to 1.
// *
// * @return a score between 0=no similarity, 1=exact match
// */
// public static double similiarity(String a, String b) {
// String cleanA = cleanName(a);
// String cleanB = cleanName(b);
// double distance = StringUtils.getLevenshteinDistance(cleanA, cleanB);
// double maxDistance = Math.max(cleanA.length(), cleanB.length());
// return (maxDistance - distance) / maxDistance;
// }
private static final JaroWinklerDistance JARO_WINKLER = new JaroWinklerDistance();
/**
* Calculates the edit distance between two names.
*
* @param a
* @param b
* @return
*/
public static int distance(String a, String b) {
return StringUtils.getLevenshteinDistance(PlaceNames.cleanName(a), PlaceNames.cleanName(b));
}
/**
* Cleans up a name by removing all punctuation and non-letters.
*/
public static String cleanName(String name) {
StringBuilder cleaned = new StringBuilder();
for (int i = 0; i != name.length(); ++i) {
int cp = name.codePointAt(i);
if (Character.isLetter(cp)) {
cleaned.appendCodePoint(Character.toLowerCase(cp));
}
}
return cleaned.toString();
}
/**
* gets the similarity of the two strings using Jaro distance.
*
* @param string1 the first input string
* @param string2 the second input string
* @return a value between 0-1 of the similarity
*/
public static double similarity(final String string1, final String string2) {
if(Strings.isNullOrEmpty(string1) || Strings.isNullOrEmpty(string2)) {
return 0;
}
if(string1.equals(string2)) {
return 1.0;
}
String c1 = cleanName(string1);
String c2 = cleanName(string2);
float distance = JARO_WINKLER.getDistance(c1, c2);
if(distance < 1.0) {
// if it's an approximate match, penalize scores that involve
// short strings
if(c1.length() <= 3 || c2.length() <= 3) {
distance *= 0.7;
}
}
return distance;
}
}