// StringUtils.java example (from commcare-odk-master)
/**
 * 
 */
package org.commcare.android.util;

import java.text.Normalizer;
import java.util.HashMap;
import java.util.Locale;
import java.util.regex.Pattern;

import android.support.v4.util.LruCache;

/**
 * Static helpers for canonicalizing strings (case and accent folding) and
 * for approximate ("fuzzy") comparison based on edit distance.
 *
 * @author ctsims
 */
public class StringUtils {

    //TODO: This caches every string that is ever normalized, bounded only by cacheSize.
    // Created lazily on the first call to normalize().
    static LruCache<String, String> normalizationCache;

    // Matches runs of characters in the Unicode "Combining Diacritical Marks" block,
    // which is what accents turn into once a string has been decomposed (NFD).
    static final Pattern diacritics = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");

    //TODO: Really not sure about this size. Also, the LRU probably isn't really the best model here
    //since we'd _like_ for these caches to get cleaned up at _some_ point.
    // Passed as the LruCache max size; with the default sizeOf() that is a
    // maximum number of entries, not a number of bytes.
    static final private int cacheSize = 100 * 1024;

    // Tweakable parameter: the second string passed to fuzzyMatch must be longer
    // than this before edit distance is used at all. This is probably not strictly
    // necessary; it basically makes sure that "at" doesn't match "or" or similar.
    private static final int FUZZY_MIN_LENGTH = 3;

    // Tweakable parameter: largest edit distance that still counts as a fuzzy match.
    private static final int FUZZY_MAX_DISTANCE = 2;

    /**
     * Produces a canonical form of a string for comparison purposes: the input is
     * decomposed (NFD), combining diacritical marks are stripped, and the result
     * is lower cased using locale-independent rules so that the output does not
     * depend on the device's default locale.
     *
     * Results are memoized in a shared LRU cache; the method is synchronized to
     * guard the lazy creation of that cache and access to it.
     *
     * @param input A non-null string
     * @return a canonical version of the passed in string that is lower cased and has
     * had diacritical marks like accents removed.
     */
    public synchronized static String normalize(String input) {
        if (normalizationCache == null) {
            normalizationCache = new LruCache<String, String>(cacheSize);
        }

        String normalized = normalizationCache.get(input);
        if (normalized != null) {
            // Cache hit: reuse the value we already fetched rather than looking it up again.
            return normalized;
        }

        String decomposed = Normalizer.normalize(input, Normalizer.Form.NFD);
        normalized = diacritics.matcher(decomposed).replaceAll("").toLowerCase(Locale.ROOT);

        normalizationCache.put(input, normalized);

        return normalized;
    }

    /**
     * Computes the Levenshtein Distance between two strings, i.e. the minimum
     * number of single character insertions, deletions and substitutions needed
     * to turn one string into the other. Comparison is per UTF-16 char.
     *
     * This code is adapted from wikibooks under
     * the Creative Commons attribution share-alike 3.0 license and
     * may be re-used under the terms of that license. It differs from the
     * version originally copied here in how the first column of each row is
     * initialized (see the comment in the body).
     *
     * http://creativecommons.org/licenses/by-sa/3.0/
     *
     * TODO: re-implement for efficiency/licensing possibly.
     *
     * @param s0 the first string, non-null
     * @param s1 the second string, non-null
     *
     * @return the edit distance between s0 and s1; 0 if and only if they are equal
     */
    public static int LevenshteinDistance (String s0, String s1) {
        int len0 = s0.length() + 1;
        int len1 = s1.length() + 1;

        // Two rolling rows of the distance table. cost[i] is the distance between
        // the first i chars of s0 and the prefix of s1 handled so far.
        int[] cost = new int[len0];
        int[] newcost = new int[len0];

        // Row for the empty prefix of s1: turning i chars of s0 into "" takes i deletions.
        for (int i = 0; i < len0; i++) {
            cost[i] = i;
        }

        // One row per character of s1.
        for (int j = 1; j < len1; j++) {

            // Turning the empty prefix of s0 into the first j chars of s1 takes
            // j insertions. (This was previously j-1, which under-counted every
            // distance that relied on inserting at the front.)
            newcost[0] = j;

            // One cell per character of s0.
            for (int i = 1; i < len0; i++) {

                // 0 if the current characters agree, 1 if a substitution is needed
                int match = (s0.charAt(i - 1) == s1.charAt(j - 1)) ? 0 : 1;

                // cost of arriving at this cell by each of the three edit operations
                int costReplace = cost[i - 1] + match;
                int costInsert = cost[i] + 1;
                int costDelete = newcost[i - 1] + 1;

                // keep the cheapest
                newcost[i] = Math.min(Math.min(costInsert, costDelete), costReplace);
            }

            // The row just filled in becomes the "previous" row for the next character.
            int[] swap = cost;
            cost = newcost;
            newcost = swap;
        }

        // After the final swap, cost holds the last computed row; its last cell
        // is the distance between the two full strings.
        return cost[len0 - 1];
    }

    /**
     * Identifies whether two strings are close enough that they are likely to be
     * intended to be the same string. Fuzzy matching is only performed when the
     * second string is longer than a minimum size; shorter strings never match.
     *
     * Note that only the length of b is checked against the minimum size.
     *
     * @param a the first string, non-null
     * @param b the second string, non-null
     * @return true if the two strings meet CommCare's fuzzy match definition, false otherwise.
     */
    public static boolean fuzzyMatch(String a, String b) {
        if (b.length() <= FUZZY_MIN_LENGTH) {
            return false;
        }

        // The edit distance can never be smaller than the difference in lengths,
        // so a large disparity rules out a match without running the full comparison.
        int sizeDiff = Math.abs(a.length() - b.length());
        if (sizeDiff > FUZZY_MAX_DISTANCE) {
            return false;
        }

        return StringUtils.LevenshteinDistance(a, b) <= FUZZY_MAX_DISTANCE;
    }
}