package at.lux.retrieval.metrics; import at.lux.retrieval.metrics.impl.NGramSimilarity; /** * This class provides a metric based on n-grams. The input strings are converted * to vectors based on their n-sized substrings with pre- and appended whitespaces. * The similarity is calculated by computing the cosine coefficient between them<p/> * Example:<br> * "house" is converted to (" HO", "HOU", "OUS", "USE", "SE ") <br> * "mouse" is converted to (" MO", "MOU", "OUS", "USE", "SE ") <br> * So the vecors are:<br> * (1,1,1,1,1,0,0) <br> * (0,0,1,1,1,1,1) <br> * with the dimensions (" HO", "HOU", "OUS", "USE", "SE ", " MO", "MOU")<p> * Date: 01.09.2005 <br> * Time: 10:19:02 <br> * @author Mathias Lux, mlux@know-center.at */ public class NGramStringMetric implements StringMetric{ public float getSimilarity(String str1, String str2) { // if no ngram can be created return a similarity of 0. String s1 = " " + str1.trim() + " "; String s2 = " " + str2.trim() + " "; if (Math.min(s1.length(), s2.length()) < 3) return 0f; NGramSimilarity ngrams = new NGramSimilarity(); return ngrams.getSimilarity(s1, s2); } public float getDistance(String str1, String str2) { return 1f-getSimilarity(str1, str2); } }