/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept. This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit). http://www.cs.umass.edu/~mccallum/mallet This software is provided under the terms of the Common Public License, version 1.0, as published by http://www.opensource.org. For further information, see the file `LICENSE' included with this distribution. */ package cc.mallet.util; import java.util.Arrays; /** * Static utility methods for Strings */ final public class Strings { public static int commonPrefixIndex (String[] strings) { int prefixLen = strings[0].length(); for (int i = 1; i < strings.length; i++) { if (strings[i].length() < prefixLen) prefixLen = strings[i].length(); int j = 0; if (prefixLen == 0) return 0; while (j < prefixLen) { if (strings[i-1].charAt(j) != strings[i].charAt(j)) { prefixLen = j; break; } j++; } } return prefixLen; } public static String commonPrefix (String[] strings) { return strings[0].substring (0, commonPrefixIndex(strings)); } public static int count (String string, char ch) { int idx = -1; int count = 0; while ((idx = string.indexOf (ch, idx+1)) >= 0) { count++; }; return count; } public static double levenshteinDistance (String s, String t) { int n = s.length(); int m = t.length(); int d[][]; // matrix int i; // iterates through s int j; // iterates through t char s_i; // ith character of s char t_j; // jth character of t int cost; // cost if (n == 0) return 1.0; if (m == 0) return 1.0; d = new int[n+1][m+1]; for (i = 0; i <= n; i++) d[i][0] = i; for (j = 0; j <= m; j++) d[0][j] = j; for (i = 1; i <= n; i++) { s_i = s.charAt (i - 1); for (j = 1; j <= m; j++) { t_j = t.charAt (j - 1); cost = (s_i == t_j) ? 0 : 1; d[i][j] = minimum (d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1] + cost); } } int longer = (n > m) ? n : m; return (double)d[n][m] / longer; // Normalize to 0-1. } private static int minimum (int a, int b, int c) { int mi = a; if (b < mi) { mi = b; } if (c < mi) { mi = c; } return mi; } }