/* Jazzy - a Java library for Spell Checking Copyright (C) 2001 Mindaugas Idzelis Full text of license can be found in LICENSE.txt This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ package com.swabunga.spell.engine; import java.io.BufferedReader; import java.io.InputStreamReader; /** * This class is based on Levenshtein Distance algorithms, and it calculates how similar two words are. If the words are identitical, then * the distance is 0. The more that the words have in common, the lower the distance value. The distance value is based on how many * operations it takes to get from one word to the other. Possible operations are swapping characters, adding a character, deleting a * character, and substituting a character. The resulting distance is the sum of these operations weighted by their cost, which can be set * in the Configuration object. When there are multiple ways to convert one word into the other, the lowest cost distance is returned. <br/> * Another way to think about this: what are the cheapest operations that would have to be done on the "original" word to end up with the * "similar" word? Each operation has a cost, and these are added up to get the distance. <br/> * * @see com.swabunga.spell.engine.Configuration#COST_REMOVE_CHAR * @see com.swabunga.spell.engine.Configuration#COST_INSERT_CHAR * @see com.swabunga.spell.engine.Configuration#COST_SUBST_CHARS * @see com.swabunga.spell.engine.Configuration#COST_SWAP_CHARS * */ public class EditDistance { /** * JMH Again, there is no need to have a global class matrix variable in this class. I have removed it and made the getDistance static * final * * DMV: I refactored this method to make it more efficient, more readable, and simpler. I also fixed a bug with how the distance was * being calculated. You could get wrong distances if you compared ("abc" to "ab") depending on what you had setup your COST_REMOVE_CHAR * and EDIT_INSERTION_COST values to - that is now fixed. * * WRS: I added a distance for case comparison, so a misspelling of "i" would be closer to "I" than to "a". */ public static Configuration config = Configuration.getConfiguration(); public static final int getDistance(String word, String similar) { // get the weights for each possible operation final int costOfDeletingSourceCharacter = config.getInteger(Configuration.COST_REMOVE_CHAR); final int costOfInsertingSourceCharacter = config.getInteger(Configuration.COST_INSERT_CHAR); final int costOfSubstitutingLetters = config.getInteger(Configuration.COST_SUBST_CHARS); final int costOfSwappingLetters = config.getInteger(Configuration.COST_SWAP_CHARS); final int costOfChangingCase = config.getInteger(Configuration.COST_CHANGE_CASE); int a_size = word.length() + 1; int b_size = similar.length() + 1; int[][] matrix = new int[a_size][b_size]; matrix[0][0] = 0; for (int i = 1; i != a_size; ++i) { matrix[i][0] = matrix[i - 1][0] + costOfInsertingSourceCharacter; // initialize the first column } for (int j = 1; j != b_size; ++j) { matrix[0][j] = matrix[0][j - 1] + costOfDeletingSourceCharacter; // initalize the first row } word = " " + word; similar = " " + similar; for (int i = 1; i != a_size; ++i) { char sourceChar = word.charAt(i); for (int j = 1; j != b_size; ++j) { char otherChar = similar.charAt(j); if (sourceChar == otherChar) { matrix[i][j] = matrix[i - 1][j - 1]; // no change required, so just carry the current cost up continue; } int costOfSubst = costOfSubstitutingLetters + matrix[i - 1][j - 1]; // if needed, add up the cost of doing a swap int costOfSwap = Integer.MAX_VALUE; boolean isSwap = i != 1 && j != 1 && sourceChar == similar.charAt(j - 1) && word.charAt(i - 1) == otherChar; if (isSwap) { costOfSwap = costOfSwappingLetters + matrix[i - 2][j - 2]; } int costOfDelete = costOfDeletingSourceCharacter + matrix[i][j - 1]; int costOfInsertion = costOfInsertingSourceCharacter + matrix[i - 1][j]; int costOfCaseChange = Integer.MAX_VALUE; String strSrcChar = "" + sourceChar; String strOtherChar = "" + otherChar; if (strSrcChar.compareToIgnoreCase(strOtherChar) == 0) { costOfCaseChange = costOfChangingCase + matrix[i - 1][j - 1]; } matrix[i][j] = minimum(costOfSubst, costOfSwap, costOfDelete, costOfInsertion, costOfCaseChange); } } int cost = matrix[a_size - 1][b_size - 1]; if (false) { System.out.println(dumpMatrix(word, similar, matrix)); } return cost; } /** * For debugging, this creates a string that represents the matrix. To read the matrix, look at any square. That is the cost to get from * the partial letters along the top to the partial letters along the side. * * @param src * - the source string that the matrix columns are based on * @param dest * - the dest string that the matrix rows are based on * @param matrix * - a two dimensional array of costs (distances) * @return String */ static private String dumpMatrix(String src, String dest, int matrix[][]) { StringBuffer s = new StringBuffer(""); int cols = matrix.length; int rows = matrix[0].length; for (int i = 0; i < cols + 1; i++) { for (int j = 0; j < rows + 1; j++) { if (i == 0 && j == 0) { s.append("\n "); continue; } if (i == 0) { s.append("| "); s.append(dest.charAt(j - 1)); continue; } if (j == 0) { s.append(src.charAt(i - 1)); continue; } String num = Integer.toString(matrix[i - 1][j - 1]); int padding = 4 - num.length(); s.append("|"); for (int k = 0; k < padding; k++) { s.append(' '); } s.append(num); } s.append('\n'); } return s.toString(); } static private int minimum(int a, int b, int c, int d, int e) { int mi = a; if (b < mi) { mi = b; } if (c < mi) { mi = c; } if (d < mi) { mi = d; } if (e < mi) { mi = e; } return mi; } public static void main(String[] args) throws Exception { BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in)); while (true) { String input1 = stdin.readLine(); if (input1 == null || input1.length() == 0) { break; } String input2 = stdin.readLine(); if (input2 == null || input2.length() == 0) { break; } System.out.println(EditDistance.getDistance(input1, input2)); } System.out.println("done"); } }