package de.berlin.hu.uima.ae.normalizer;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * String similarity utilities based on character n-grams and the Dice coefficient.
 *
 * @author Torsten Huber
 * @version 03.06.2012
 */
public class StringComparator {

    /** N-gram size used when the caller does not specify one (bigrams). */
    private static final int DEFAULT_NGRAM_SIZE = 2;

    /** Smallest supported n-gram size; smaller requests are raised to this value. */
    private static final int MIN_NGRAM_SIZE = 2;

    /** Character used to pad both ends of a string before n-grams are cut (the NUL character). */
    private static final char PAD_CHAR = Character.MIN_VALUE;

    /** Matches one or more whitespace characters; compiled once instead of on every call. */
    private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");

    /**
     * Calculates the Dice coefficient of two strings using bigrams.
     *
     * @param name1 the first string
     * @param name2 the second string
     * @return the Dice coefficient, between 0 (no shared bigram) and 1 (identical bigram sets)
     * @throws NullPointerException if either argument is {@code null}
     */
    public static float diceCoefficient(String name1, String name2) {
        return diceCoefficient(name1, name2, DEFAULT_NGRAM_SIZE);
    }

    /**
     * Splits a string into its set of distinct character n-grams.
     *
     * <p>The input is trimmed, lower-cased and every run of whitespace is collapsed to a single
     * space. It is then padded on both sides with {@code n - 1} NUL characters so that the first
     * and last characters take part in as many n-grams as inner characters do.
     *
     * @param s the string to split
     * @param n the "n" in n-gram; values below 2 are treated as 2
     * @return a new mutable set holding the distinct n-grams of the normalized, padded string
     * @throws NullPointerException if {@code s} is {@code null}
     */
    public static Set<String> getNGrams(String s, int n) {
        int size = Math.max(n, MIN_NGRAM_SIZE);

        // Locale.ROOT keeps the result independent of the JVM default locale
        // (e.g. Turkish would otherwise lower-case 'I' to a dotless i).
        String lowered = s.trim().toLowerCase(Locale.ROOT);
        String normalized = WHITESPACE_RUN.matcher(lowered).replaceAll(" ");

        StringBuilder padded = new StringBuilder();
        for (int i = 0; i < size - 1; i++) {
            padded.append(PAD_CHAR);
        }
        padded.append(normalized);
        for (int i = 0; i < size - 1; i++) {
            padded.append(PAD_CHAR);
        }

        // read n-grams
        int windowCount = padded.length() - size + 1;
        // Capacity exceeds the maximum element count, so with load factor 1 the set never rehashes.
        Set<String> ngrams = new HashSet<String>(windowCount + 1, 1.0f);
        for (int start = 0; start < windowCount; start++) {
            ngrams.add(padded.substring(start, start + size));
        }
        return ngrams;
    }

    /**
     * Calculates the Dice coefficient of two n-gram sets:
     * twice the size of their intersection divided by the sum of their sizes.
     *
     * <p>Neither argument is modified. If both sets are empty the division is 0/0 and the
     * result is {@link Float#NaN}.
     *
     * @param s1 the first n-gram set
     * @param s2 the second n-gram set
     * @return the Dice coefficient
     * @throws NullPointerException if either argument is {@code null}
     */
    public static float diceCoefficient(Set<String> s1, Set<String> s2) {
        // Work on a copy so the caller's set stays untouched.
        Set<String> intersection = new HashSet<String>(s1);
        intersection.retainAll(s2);

        // calculate dice coefficient
        return 2.0f * intersection.size() / (s1.size() + s2.size());
    }

    /**
     * Calculates the Dice coefficient of two strings using n-grams.
     *
     * @param s1 the first string
     * @param s2 the second string
     * @param n the "n" in n-gram; values below 2 are treated as 2
     * @return the Dice coefficient, between 0 (no shared n-gram) and 1 (identical n-gram sets)
     * @throws NullPointerException if either string is {@code null}
     */
    public static float diceCoefficient(String s1, String s2, int n) {
        // getNGrams clamps n itself, so no extra clamping is needed here.
        Set<String> ngrams1 = getNGrams(s1, n);
        Set<String> ngrams2 = getNGrams(s2, n);
        return diceCoefficient(ngrams1, ngrams2);
    }

    /**
     * Interactive console loop: repeatedly reads two names from standard input and prints their
     * bigram Dice coefficient. The loop ends when standard input reaches end-of-file.
     *
     * @param args ignored
     * @throws IOException if reading from standard input fails
     */
    public static void main(String[] args) throws IOException {
        BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));
        while (true) {
            System.out.print("name 1: ");
            String name1 = reader.readLine();
            if (name1 == null) {
                break; // end of input
            }
            System.out.print("name 2: ");
            String name2 = reader.readLine();
            if (name2 == null) {
                break; // end of input
            }
            System.out.println("dice coefficient: " + diceCoefficient(name1, name2));
            System.out.println();
        }
    }
}