package edu.northwestern.at.utils.corpuslinguistics.phonetics; /** Soundex: Implements the Soundex Algorithm. * * <p> * Soundex hashes words to a smaller space using a simple model * which approximates the sound of the word as produced by a native * American English speaker. The hash is a four (usually) character * string in which the first character is an uppercase letter and the * remaining characters are digits. Soundex was originally * intended only for encoding proper last names, but occasionally * finds other uses as well. The Soundex algorithm was devised and * patented by Margaret K. Odell and Robert C. Russell in 1918. * </p> */ public class Soundex { /* The Soundex mapping from letters to digits. */ public static final char[] US_ENGLISH_SOUNDEX_MAPPING = "01230120022455012623010202".toCharArray(); // ABCDEFGHIJKLMNOPQRSTUVWXYZ /* Maximum length of a Soundex code. 4 is the usual value. */ public static final int MAXSOUNDEXLENGTH = 4; /** Get Soundex code for a string. * * @param s The string for which the soundex code is desired. * * @return The Soundex code for "s". Returns an empty string * when the Soundex code cannot be found. In particular, * a Soundex code cannot be found if the first * character in "s" is not a letter (a-z, A-Z). */ public static String soundex( String s ) { // If the input string is null or empty, // we cannot find a Soundex code. if ( ( s == null ) || ( s.length() == 0 ) ) return ""; // String buffer holds generated // Soundex code. StringBuffer result = new StringBuffer(); // Convert input string to upper case. String sUpperCase = s.toUpperCase(); // Tracks the previous character in // the string being processed. char previousC = '0'; // The current character being processsed. char c; // If the first character of the // string to encode is not a letter, // we cannot find a Soundex code. c = sUpperCase.charAt( 0 ); if ( ( c < 'A' ) || ( c > 'Z' ) ) return ""; // First letter is appended to result // unchanged except for case. result.append( c ); // Convert remaining characters using // the Soundex mapped values until // a Soundex code of MAXSOUNDLENGTH // is reached, or the input string // is exhausted. for ( int i = 1 ; ( i < sUpperCase.length() ) && ( result.length() < MAXSOUNDEXLENGTH ) ; i++ ) { // Pick up the next character in the // input string. c = sUpperCase.charAt( i ); // Ignore this character if is not // a letter or is the same as // the previous character. if ( ( c >='A' ) && ( c <= 'Z' ) && ( c != previousC ) ) { // Set the previous character to this // character. previousC = c; // Get the Soundex value for this // character. char mappedC = US_ENGLISH_SOUNDEX_MAPPING[ c - 'A' ]; // Append the Soundex map value // to the Soundex code if the map // character is not a '0'. if ( mappedC != '0' ) { result.append( mappedC ); } } } // Pad Soundex code with trailing // '0' characters to bring the length // up to MAXSOUNDEXLENGTH. if ( result.length() > 0 ) { for ( int i = result.length() ; i < MAXSOUNDEXLENGTH ; i++ ) { result.append( '0' ); } } // Return resulting Soundex string. return result.toString(); } }