package at.lux.retrieval.metrics.impl;
/**
* Title: NGramSimilarity
* Description: provides a function for calculating the "distance" between 2 words.
* @author Mathias Lux, mlux@know-center.at
*/
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
public class NGramSimilarity {
HashMap<String, Integer> nGramList1;
HashMap<String, Integer> nGramList2;
int i, matchingtrigrams, n;
Enumeration enum1;
String tmp;
/**
* new Matcher with Trigrams
*/
public NGramSimilarity() {
nGramList1 = new HashMap<String, Integer>(12);
nGramList2 = new HashMap<String, Integer>(12);
i = 0;
n = 3;
}
/**
* new Matcher with n-grams
*/
public NGramSimilarity(int n) {
nGramList1 = new HashMap<String, Integer>(12);
nGramList2 = new HashMap<String, Integer>(12);
i = 0;
this.n = n;
}
/**
* returns "similarity" betwenn str1 and str2
*/
public float getSimilarity(String str1, String str2) {
nGramList1 = calculateNgrams(str1);
nGramList2 = calculateNgrams(str2);
HashSet<String> dimensions = new HashSet<String>(nGramList1.size() + nGramList2.size());
dimensions.addAll(nGramList1.keySet());
dimensions.addAll(nGramList2.keySet());
float sum = 0f;
int sum1 = 0;
int sum2 = 0;
for (String dim : dimensions) {
float factor1 = 0f;
float factor2 = 0f;
if (nGramList1.containsKey(dim)) {
Integer entry = nGramList1.get(dim);
factor1 = entry;
sum1 += entry * entry;
}
if (nGramList2.containsKey(dim)) {
Integer entry = nGramList2.get(dim);
factor2 = entry;
sum2 += entry * entry;
}
sum += factor1 * factor2;
}
float upper = sum;
float lower = (float) Math.sqrt((float) sum1 * (float) sum2);
return upper / lower;
}
private HashMap<String, Integer> calculateNgrams(String str) {
HashMap<String, Integer> map = new HashMap<String, Integer>(10);
for (i = 0; i < str.length() - (n - 1); i++) {
String gram = str.substring(i, i + n).toUpperCase();
if (map.containsKey(gram)) {
map.put(gram, map.get(gram) + 1);
} else
map.put(gram, 1);
}
return map;
}
}