/** * */ package org.commcare.android.util; import java.text.Normalizer; import java.util.HashMap; import java.util.regex.Pattern; import android.support.v4.util.LruCache; /** * @author ctsims * */ public class StringUtils { //TODO: Bro you can't just cache every fucking string ever. static LruCache<String, String> normalizationCache; static Pattern diacritics; //TODO: Really not sure about this size. Also, the LRU probably isn't really the best model here //since we'd _like_ for these caches to get cleaned up at _some_ point. static final private int cacheSize = 100 * 1024; /** * @param input A non-null string * @return a canonical version of the passed in string that is lower cased and has removed diacritical marks * like accents. */ public synchronized static String normalize(String input) { if(normalizationCache == null) { normalizationCache = new LruCache<String, String>(cacheSize); diacritics = Pattern.compile("\\p{InCombiningDiacriticalMarks}+"); } String normalized = normalizationCache.get(input); if(normalized != null) { return normalizationCache.get(input);} normalized = diacritics.matcher(Normalizer.normalize(input, Normalizer.Form.NFD)).replaceAll("").toLowerCase(); normalizationCache.put(input, normalized); return normalized; } /** * Computes the Levenshtein Distance between two strings. * * This code is sourced and unmodified from wikibooks under * the Creative Commons attribution share-alike 3.0 license and * by be re-used under the terms of that license. * * http://creativecommons.org/licenses/by-sa/3.0/ * * TODO: re-implement for efficiency/licensing possibly. * * @param s0 * @param s1 * * @return */ public static int LevenshteinDistance (String s0, String s1) { int len0 = s0.length()+1; int len1 = s1.length()+1; // the array of distances int[] cost = new int[len0]; int[] newcost = new int[len0]; // initial cost of skipping prefix in String s0 for(int i=0;i<len0;i++) cost[i]=i; // dynamicaly computing the array of distances // transformation cost for each letter in s1 for(int j=1;j<len1;j++) { // initial cost of skipping prefix in String s1 newcost[0]=j-1; // transformation cost for each letter in s0 for(int i=1;i<len0;i++) { // matching current letters in both strings int match = (s0.charAt(i-1)==s1.charAt(j-1))?0:1; // computing cost for each transformation int cost_replace = cost[i-1]+match; int cost_insert = cost[i]+1; int cost_delete = newcost[i-1]+1; // keep minimum cost newcost[i] = Math.min(Math.min(cost_insert, cost_delete),cost_replace ); } // swap cost/newcost arrays int[] swap=cost; cost=newcost; newcost=swap; } // the distance is the cost for transforming all letters in both strings return cost[len0-1]; } /** * Identifies whether two strings are close enough that they are likely to be * intended to be the same string. Fuzzy matching is only performed on strings that are * longer than a certain size. * * * @param a * @param b * @return true if the two strings meet CommCare's fuzzy match definition, false otherwise. */ public static boolean fuzzyMatch(String a, String b) { //tweakable parameter: Minimum length before edit distance //starts being used (this is probably not necessary, and //basically only makes sure that "at" doesn't match "or" or similar if(b.length() > 3) { int sizeDiff = Math.abs(a.length() - b.length()); int distance = StringUtils.LevenshteinDistance(a, b); //tweakable parameter: edit distance past string length disparity if(distance <= 2) { return true; } } return false; } }