package plugins.HarmonizationComponent;

import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;

/**
 * This class calculates a similarity score between two strings. Each input
 * string is tokenized into n-grams (the default n-gram size is 2 and can be
 * changed in the constructor), and the two groups of tokens are then compared
 * to work out the similarity score. Note that, despite the class name, the
 * score is based on n-gram token overlap rather than on edit distance.
 *
 * In addition, a default list of stop words is defined. In the method
 * stringMatching(), the parameter "removeStopWords" indicates whether the stop
 * words should be used to remove meaningless words from the string. The stop
 * word list can be customized via setStopWords(List<String> stopWords) or
 * setStopWords(String[] stopWords).
 *
 * How to use:
 *
 * LevenshteinDistanceModel model = new LevenshteinDistanceModel(2);
 * double similarityScore = model.stringMatching("Smoking", "Smoker", false);
 * System.out.println(similarityScore);
 *
 * Alternatively:
 *
 * List<String> tokens_1 = model.createNGrams("Smoking", false);
 * List<String> tokens_2 = model.createNGrams("Have you smoked last year?", true); // remove stop words!
 * double similarityScore = model.calculateScore(tokens_1, tokens_2);
 *
 * @author Chao
 */
public class LevenshteinDistanceModel
{
    private int nGrams = 0;

    private String[] STOP_WORDS =
    { "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "aren't", "as",
            "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "can't",
            "cannot", "could", "couldn't", "did", "didn't", "do", "does", "doesn't", "doing", "don't", "down",
            "during", "each", "few", "for", "from", "further", "had", "hadn't", "has", "hasn't", "have", "haven't",
            "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself",
            "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "isn't", "it",
            "it's", "its", "itself", "let's", "me", "more", "most", "mustn't", "my", "myself", "no", "nor", "not",
            "of", "off", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over",
            "own", "same", "shan't", "she", "she'd", "she'll", "she's", "should", "shouldn't", "so", "some",
            "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there",
            "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through",
            "to", "too", "under", "until", "up", "very", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've",
            "were", "weren't", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who",
            "who's", "whom", "why", "why's", "with", "won't", "would", "wouldn't", "you", "you'd", "you'll",
            "you're", "you've", "your", "yours", "yourself", "yourselves", "many" };

    private List<String> STOPWORDSLIST = new ArrayList<String>();

    public LevenshteinDistanceModel()
    {
        this(2);
    }

    public LevenshteinDistanceModel(int nGrams)
    {
        this.nGrams = nGrams;
        this.STOPWORDSLIST = convertArrayToList(STOP_WORDS);
    }

    /**
     * Convert an array to a list, dropping duplicate entries.
     */
    public List<String> convertArrayToList(String[] inputArray)
    {
        List<String> convertedList = new ArrayList<String>();
        for (String element : inputArray)
        {
            if (!convertedList.contains(element))
            {
                convertedList.add(element);
            }
        }
        return convertedList;
    }

    public double stringMatching(String query, String query_2, boolean removeStopWords)
    {
        return this.calculateScore(createNGrams(query.toLowerCase().trim(), removeStopWords),
                createNGrams(query_2.toLowerCase().trim(), removeStopWords));
    }
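    /*
     * Minimal runnable sketch of the two usage styles described in the class
     * comment. This main() method is illustrative only and not part of the
     * original plugin API; the example strings come from the class comment.
     */
    public static void main(String[] args)
    {
        LevenshteinDistanceModel model = new LevenshteinDistanceModel(2);

        // Compare two strings directly; stop words are kept.
        double similarityScore = model.stringMatching("Smoking", "Smoker", false);
        System.out.println(similarityScore);

        // Or tokenize each string first, removing stop words from the
        // sentence, and then score the two token lists.
        List<String> tokens_1 = model.createNGrams("Smoking", false);
        List<String> tokens_2 = model.createNGrams("Have you smoked last year?", true);
        System.out.println(model.calculateScore(tokens_1, tokens_2));
    }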
    public List<String> removeStopWords(List<String> listOfWords)
    {
        List<String> removedStopWordsList = new ArrayList<String>();
        for (String eachWord : listOfWords)
        {
            if (STOPWORDSLIST == null || !STOPWORDSLIST.contains(eachWord))
            {
                removedStopWordsList.add(eachWord);
            }
        }
        return removedStopWordsList;
    }

    /**
     * Create n-gram tokens for each string in the list, mapping every input
     * string to its list of tokens.
     *
     * @param inputStrings
     * @param removeStopWords
     * @return
     */
    public HashMap<String, List<String>> createNGrams(List<String> inputStrings, boolean removeStopWords)
    {
        HashMap<String, List<String>> normalizedInputString = new HashMap<String, List<String>>();
        for (String eachString : inputStrings)
        {
            normalizedInputString.put(eachString, createNGrams(eachString, removeStopWords));
        }
        return normalizedInputString;
    }

    /**
     * Create n-gram tokens of the string.
     *
     * @param inputQuery
     * @param removeStopWords
     * @return
     */
    public List<String> createNGrams(String inputQuery, boolean removeStopWords)
    {
        List<String> tokens = new ArrayList<String>();
        // Lower-case before the stop-word check so that removal also works
        // when this method is called directly, not only via stringMatching().
        String[] wordsInString = inputQuery.toLowerCase().trim().split(" ");
        List<String> removedStopWordsList;
        if (removeStopWords)
        {
            removedStopWordsList = removeStopWords(convertArrayToList(wordsInString));
        }
        else
        {
            removedStopWordsList = convertArrayToList(wordsInString);
        }

        for (String singleWord : removedStopWordsList)
        {
            // Pad the word so that its first and last characters also appear
            // in one leading ("^x") and one trailing ("x$") token.
            singleWord = "^" + singleWord + "$";
            // Slide a window of size nGrams over the padded word. The
            // original code hard-coded a window of 2 for the trailing token,
            // which broke for any other n-gram size.
            for (int i = 0; i <= singleWord.length() - nGrams; i++)
            {
                tokens.add(singleWord.substring(i, i + nGrams));
            }
        }
        return tokens;
    }
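    /*
     * Worked example for the default nGrams = 2: "Smoking" and "Smoker" are
     * padded to "^smoking$" and "^smoker$", giving the bigram lists
     * [^s, sm, mo, ok, ki, in, ng, g$] and [^s, sm, mo, ok, ke, er, r$].
     * Four bigrams match and the larger list has eight tokens, so
     * calculateScore() below returns 4 / 8 * 100 = 50.0.
     */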
    /**
     * Calculate the similarity score, i.e. the percentage of tokens shared by
     * the two lists relative to the size of the larger list.
     *
     * @param inputStringTokens
     * @param ontologyTermTokens
     * @return
     */
    public double calculateScore(List<String> inputStringTokens, List<String> ontologyTermTokens)
    {
        int matchedTokens = 0;
        for (String eachToken : inputStringTokens)
        {
            if (ontologyTermTokens.contains(eachToken))
            {
                matchedTokens++;
            }
        }
        double totalToken = Math.max(inputStringTokens.size(), ontologyTermTokens.size());
        // Guard against division by zero when both token lists are empty.
        if (totalToken == 0) return 0;
        double similarity = matchedTokens / totalToken * 100;
        // Use a fixed locale so the formatted number always uses a decimal
        // point and can safely be parsed back.
        DecimalFormat df = new DecimalFormat("#0.000", DecimalFormatSymbols.getInstance(Locale.ENGLISH));
        return Double.parseDouble(df.format(similarity));
    }

    public int getnGrams()
    {
        return this.nGrams;
    }

    public void setnGrams(int nGrams)
    {
        this.nGrams = nGrams;
    }

    public void setStopWords(String[] stopWords)
    {
        this.STOP_WORDS = stopWords;
        this.STOPWORDSLIST = convertArrayToList(STOP_WORDS);
    }

    public void setStopWords(List<String> stopWords)
    {
        this.STOPWORDSLIST = stopWords;
    }
}
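/*
 * Sketch of a typical batch use in harmonization, assuming a query has to be
 * matched against several candidate ontology terms (as the parameter names of
 * calculateScore() suggest). This helper class and its names are illustrative
 * only and not part of the original plugin.
 */
class LevenshteinDistanceModelExample
{
    static String bestMatch(String query, List<String> ontologyTerms)
    {
        LevenshteinDistanceModel model = new LevenshteinDistanceModel(2);
        String bestTerm = null;
        double bestScore = -1;
        for (String term : ontologyTerms)
        {
            // Remove stop words so that long term labels are not penalized
            // for filler words such as "you" or "have".
            double score = model.stringMatching(query, term, true);
            if (score > bestScore)
            {
                bestScore = score;
                bestTerm = term;
            }
        }
        return bestTerm;
    }
}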