package edu.northwestern.at.utils.corpuslinguistics.phonetics;
/* Please see the license information in the header below. */
/**
* This code is based on an implementation by Ed Parrish, which was
* obtained from:
*
* http://www.cse.ucsc.edu/~eparrish/toolbox/search.html
*
* Licensed under an Apache license in the
* org.apache.commons.codec.language package.
*/
public class DoubleMetaphone
{
private int current;
private int encodeLimit = 4;
private StringBuffer primary = new StringBuffer();
private StringBuffer alternate = new StringBuffer();
private String input;
private final static char[] vowels = {'A', 'E', 'I', 'O', 'U', 'Y'};
private final static char[] AEOU = {'A', 'E', 'O', 'U'};
private final static char[] AO = "AO".toCharArray();
private final static char[] BDH = {'B', 'D', 'H'};
private final static char[] BFHLMNRVW_ = "BFHLMNRVW ".toCharArray();
private final static char[] BH = {'B', 'H'};
private final static char[] BKLMNSTZ = "LTKSNMBZ".toCharArray();
private final static char[] BP = "BP".toCharArray();
private final static char[] CGQ = {'C', 'G', 'Q'};
private final static char[] CGLRT = {'C', 'G', 'L', 'R', 'T'};
private final static char[] CKQ = {'C', 'K', 'Q'};
private final static char[] CX = "CX".toCharArray();
private final static char[] DT = "DT".toCharArray();
private final static char[] EI = {'E', 'I'};
private final static char[] EIY = {'E', 'I', 'Y'};
private final static char[] EHI = {'I', 'E', 'H'};
private final static char[] KLS = "KLS".toCharArray();
private final static char[] LMNW = "LMNW".toCharArray();
private final static char[] ST = {'S', 'T'};
private final static char[] SZ = "SZ".toCharArray();
private final static String[] AggiOggi = {"AGGI", "OGGI"};
private final static String[] AiOi = {"AI", "OI"};
private final static String[] AlleIllaIllo = {"ILLO", "ILLA", "ALLE"};
private final static String[] AmOm = {"OM", "AM"};
private final static String[] AsOs = {"AS", "OS"};
private final static String[] ArchitOrchesOrchid = {"ARCHIT", "ORCHES", "ORCHID"};
private final static String[] AuOu = {"AU", "OU"};
private final static String[] BacherMacher = {"BACHER", "MACHER"};
private final static String[] CeCiCy = {"CI", "CE", "CY"};
private final static String[] CeCi = {"CE", "CI"};
private final static String[] CiaCieCio = {"CIO", "CIE", "CIA"};
private final static String[] CkCgCq = {"CK", "CG", "CQ"};
private final static String[] DangerMangerRanger = {"DANGER", "RANGER", "MANGER"};
private final static String[] DdDt = {"DD", "DT"};
private final static String[] EauIau = {"IAU", "EAU"};
private final static String[] EbEiElEpErEsEyIbIlInIe = {"ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER"};
private final static String[] EdEmEnErOoUy = {"OO", "ER", "EN", "UY", "ED", "EM"};
private final static String[] EnEr = {"ER", "EN"};
private final static String[] EwskiEwskyOwskiOwsky = {"EWSKI", "EWSKY", "OWSKI", "OWSKY"};
private final static String[] GnKnPnPsWr = {"GN", "KN", "PN", "WR", "PS"};
private final static String[] HaracHaris = {"HARAC", "HARIS"};
private final static String[] HeimHoekHolmHolz = {"HEIM", "HOEK", "HOLM", "HOLZ"};
private final static String[] HemHiaHorHym = {"HOR", "HYM", "HIA", "HEM"};
private final static String[] IslYsl = {"ISL", "YSL"};
private final static String[] MaMe = {"ME", "MA"};
private final static String[] OgyRgy = {"RGY", "OGY"};
private final static String[] SiaSio = {"SIO", "SIA"};
private final static String[] TiaTch = {"TIA", "TCH"};
private final static String[] UcceeUcces = {"UCCEE", "UCCES"};
private final static String[] Van_Von_ = {"VAN ", "VON "};
private final static String[] WiczWitz = {"WICZ", "WITZ"};
private final static String[] ZaZiZo = {"ZO", "ZI", "ZA"};
/** Creates new DoubleMetaphone */
public DoubleMetaphone() {
}
public String getPrimary() {
return primary.toString();
}
public StringBuffer getPrimaryBuffer() {
return primary;
}
public String getAlternate() {
return alternate.toString();
}
public StringBuffer getAlternateBuffer() {
return alternate;
}
public int getEncodeLimit() {
return encodeLimit;
}
public boolean setEncodeLimit(int newLimit) {
if (newLimit < 1) return false;
encodeLimit = newLimit;
return true;
}
void setInput(String in) {
if (in != null) {
input = in.toUpperCase() + " ";
} else {
input = "";
}
}
void add(char ch) {
add(ch, ch);
}
void add(char primaryChar, char alternateChar) {
primary.append(primaryChar);
alternate.append(alternateChar);
}
boolean charAt(int index, char[] list) {
if (index < 0 || index >= input.length()) return false;
char value = input.charAt(index);
for (int i = 0; i < list.length; i++) {
if (value == list[i]) return true;
}
return false;
}
boolean stringAt(int start, int length, String str) {
String[] list = new String[1];
list[0] = str;
return stringAt(start, length, list);
}
boolean stringAt(int start, int length, String[] list) {
if (length <= 0) return false;
for (int i = 0; i < list.length; i++) {
if (input.regionMatches(start, list[i], 0, length)) return true;
}
return false;
}
boolean isVowel(int index) {
return charAt(index, vowels);
}
boolean isSlavoGermanic() {
if((input.indexOf('W') > -1) || (input.indexOf('K') > -1)
|| (input.indexOf("CZ") > -1) || (input.indexOf("WITZ") > -1)) {
return true;
}
return false;
}
void addCode(char ch, char code) {
add(code);
current++;
if(input.charAt(current) == ch) current++;
}
public static String sencode( String in ) {
DoubleMetaphone dm = new DoubleMetaphone();
return dm.encode(in);
}
public String encode(String in) {
if (in == null) return "";
primary.delete(0, primary.length());
alternate.delete(0, alternate.length());
int length = in.length();
if (length < 1) return "";
int last = length - 1; //zero based index
setInput(in);
current = 0;
//skip these when at start of word
if (stringAt(0, 2, GnKnPnPsWr)) current++;
//Initial 'X' is pronounced 'Z' e.g. 'Xavier'
if(input.startsWith("X")) {
add('S'); //'Z' maps to 'S'
current++;
}
while (primary.length() < encodeLimit || alternate.length() < encodeLimit) {
if(current >= length) break;
switch(input.charAt(current)) {
case '�':
current++;
add('N');
break;
case 'A':
case 'E':
case 'I':
case 'O':
case 'U':
case 'Y':
if (current == 0) add('A'); // all init vowels map to 'A'
current++;
break;
case 'B':
// "-mb", e.g "dumb", already skipped over...
addCode('B', 'P');
break;
case '�':
add('S');
current++;
// Note: no doublecheck
break;
case 'C':
// various germanic
if((current > 1) && !isVowel(current - 2)
&& input.regionMatches(current - 1, "ACH", 0, 3)
&& (input.charAt(current + 2) != 'I'
&& input.charAt(current + 2) != 'E'
|| stringAt(current - 2, 6, BacherMacher) )) {
add('K');
current +=2;
break;
}
// special case 'caesar'
if (current == 0
&& input.regionMatches(current, "CAESAR", 0, 6)) {
add('S');
current +=2;
break;
}
//italian 'chianti'
if (input.regionMatches(current, "CHIA", 0, 4)) {
add('K');
current +=2;
break;
}
if (input.regionMatches(current, "CH", 0, 2)) {
//find 'michael'
if(current > 0
&& input.regionMatches(current, "CHAE", 0, 4)) {
add('K', 'X');
current +=2;
break;
}
// greek roots e.g. 'chemistry', 'chorus'
if (current == 0
&& (stringAt(current + 1, 5, HaracHaris)
|| stringAt((current + 1), 3, HemHiaHorHym))
&& !input.regionMatches(0, "CHORE", 0, 5)) {
add('K');
current +=2;
break;
}
// germanic, greek, or otherwise 'ch' for 'kh' sound
if ((stringAt(0, 4, Van_Von_)
|| input.regionMatches(0, "SCH ", 0, 3))
// 'architect' but not 'arch', 'orchestra', 'orchid'
|| stringAt(0, 6, ArchitOrchesOrchid)
|| charAt(current + 2, ST)
|| ((charAt(current - 1, AEOU)
|| current == 0)
// e.g. 'wachtler', 'wechsler', but not 'tichner'
&& charAt(current + 2, BFHLMNRVW_))) {
add('K');
} else {
if (current > 0) {
if (input.regionMatches(0, "MC", 0, 2)) {
// e.g. "McHugh"
add('K');
} else {
add('X', 'K');
}
} else {
add('X');
}
}
current +=2;
break;
}
// e.g. 'czerny'
if (input.regionMatches(current, "CZ", 0, 2)
&& !input.regionMatches(current - 2, "WICZ", 0, 4)) {
add('S', 'X');
current += 2;
break;
}
// e.g. 'focaccia'
if (input.regionMatches(current + 1, "CIA", 0, 3)) {
add('X');
current += 3;
break;
}
// double 'C', but not if e.g. 'McClellan'
if (input.regionMatches(current, "CC", 0, 2)
&& !((current == 1) && (input.charAt(0) == 'M'))) {
// 'bellocchio' but not 'bacchus'
if (charAt(current + 2, EHI)
&& !input.regionMatches(current + 2, "HU", 0, 2)) {
// 'accident', 'accede' 'succeed'
if(((current == 1) && (input.charAt(current - 1) == 'A'))
|| stringAt(current - 1, 5, UcceeUcces)) {
add('K');
add('S');
} else { // 'bacci', 'bertucci', other italian
add('X');
}
current += 3;
break;
} else { // Pierce's rule
add('K');
current += 2;
break;
}
}
if (stringAt(0, 2, CkCgCq)) {
add('K');
current += 2;
break;
}
if (stringAt(0, 2, CeCiCy)) {
// italian vs. english
if (stringAt(0, 3, CiaCieCio)) {
add('S', 'X');
} else {
add('S');
}
current += 2;
break;
}
// else
add('K');
// name sent in 'mac caffrey', 'mac gregor'
if (charAt(current + 1, CGQ)) {
current += 3;
} else {
if (charAt(current + 1, CKQ)
&& !stringAt(current + 1, 2, CeCi)) {
current += 2;
} else {
current++;
}
}
break;
case 'D':
if(input.regionMatches(current, "DG", 0, 2)) {
if (charAt(current + 2, EIY)) {
//e.g. 'edge'
add('J');
current += 3;
break;
} else {
//e.g. 'edgar'
add('T');
add('K');
current += 2;
break;
}
}
if (stringAt(current, 2, DdDt)) {
add('T');
current += 2;
break;
}
//else
add('T');
current++;
break;
case 'F': // NTR: this is typical default behavior
addCode('F', 'F');
break;
case 'G':
if (input.charAt(current + 1) == 'H') {
if (current > 0 && !isVowel(current - 1)) {
add('K');
current += 2;
break;
}
if (current < 3) {
// 'ghislane', 'ghiradelli'
if (current == 0) {
if (input.charAt(current + 2) == 'I') {
add('J');
} else {
add('K');
}
current += 2;
break;
}
}
//Parker's rule (with some further refinements) - e.g., 'hugh'
if((current > 1 && charAt(current - 2, BDH))
//e.g., 'bough'
|| (current > 2 && charAt(current - 3, BDH ))
//e.g., 'broughton'
|| (current > 3 && charAt(current - 4, BH)) ) {
current += 2;
break;
} else {
//e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough'
if (current > 2 && input.charAt(current - 1) == 'U'
&& charAt(current - 3, CGLRT) ) {
add('F');
} else {
if (current > 0 && input.charAt(current - 1) != 'I') {
add('K');
}
}
current += 2;
break;
}
}
boolean slavoGermanic = isSlavoGermanic();
if (input.charAt(current + 1) == 'N') {
if (current == 1 && isVowel(0) && !slavoGermanic) {
primary.append('K');
add('N');
} else {
//not e.g. 'cagney'
if (!input.regionMatches(current + 2, "EY", 0, 2)
&& (input.charAt(current + 1) != 'Y')
&& !slavoGermanic) {
alternate.append('K');
add('N');
} else {
add('K');
add('N');
}
current += 2;
break;
}
}
//'tagliaro'
if (input.regionMatches(current + 1, "LI", 0, 2)
&& !slavoGermanic) {
primary.append('K');
add('L');
current += 2;
break;
}
//-ges-,-gep-,-gel-, -gie- at beginning
if((current == 0)
&& (input.charAt(current + 1) == 'Y'
|| stringAt(current + 1, 2, EbEiElEpErEsEyIbIlInIe)) ) {
add('K', 'J');
current += 2;
break;
}
// -ger-, -gy-
if ((input.regionMatches(current + 1, "ER", 0, 2)
|| input.charAt(current + 1) == 'Y')
&& !stringAt(0, 6, DangerMangerRanger)
&& !charAt(current - 1, EI)
&& !stringAt(current - 1, 3, OgyRgy) ) {
add('K', 'J');
current += 2;
break;
}
// italian e.g, 'biaggi'
if (charAt(current + 1, EIY)
|| stringAt(current - 1, 4, AggiOggi)) {
//obvious germanic
if ((stringAt(0, 4, Van_Von_)
|| input.regionMatches(0, "SCH", 0, 3))
|| input.regionMatches(current + 1, "ET", 0, 2)) {
add('K');
} else {
//always soft if french ending
if (input.regionMatches(current + 1, "IER ", 0, 4)) {
add('J');
} else {
add('J', 'K');
}
current += 2;
break;
}
}
if (input.charAt(current + 1) == 'G') {
current += 2;
} else {
current++;
}
add('K');
break;
case 'H':
// only keep if first & before vowel or btw. 2 vowels
if ((current == 0 || isVowel(current - 1))
&& isVowel(current + 1)) {
add('H');
current += 2;
} else { // also takes care of 'HH'
current++;
}
break;
case 'J':
//obvious spanish, 'jose', 'san jacinto'
if (stringAt(current, 4, "JOSE") || stringAt(0, 4, "SAN ")) {
if ((current == 0 && (input.charAt(current + 4) == ' '))
|| stringAt(0, 4, "SAN ")) {
add('H');
} else {
add('J', 'H');
}
current +=1;
break;
}
if (current == 0 && !stringAt(current, 4, "JOSE")) {
add('J', 'A'); // Yankelovich/Jankelowicz
} else {
// spanish pron. of e.g. 'bajador'
if (isVowel(current - 1) && !isSlavoGermanic()
&& ((input.charAt(current + 1) == 'A')
|| (input.charAt(current + 1) == 'O'))) {
add('J', 'H');
} else {
if (current == last) {
add('J', ' ');
} else {
if (!charAt(current + 1, BKLMNSTZ)
&& !charAt(current - 1, KLS)) {
add('J');
}
}
}
}
current++;
if(input.charAt(current) == 'J') current++; // doublecheck
break;
case 'K': // NTR: this is typical default behavior
addCode('K', 'K');
break;
case 'L':
if (input.charAt(current + 1) == 'L') {
//spanish e.g. 'cabrillo', 'gallegos'
if (((current == (length - 3))
&& stringAt(current - 1, 4, AlleIllaIllo))
|| ((stringAt((last - 1), 2, AsOs)
|| charAt(last, AO))
&& stringAt(current - 1, 4, "ALLE")) ) {
primary.append('L');
current += 2;
break;
}
current += 2;
} else {
current++;
}
add('L');
break;
case 'M':
if ((stringAt(current - 1, 3, "UMB")
&& (((current + 1) == last)
|| stringAt(current + 2, 2, "ER")))
//'dumb','thumb'
|| (input.charAt(current + 1) == 'M') ) {
current += 2;
} else {
current++;
}
add('M');
break;
case 'N': // NTR: this is typical default behavior
addCode('N', 'N');
break;
case 'P':
if (input.charAt(current + 1) == 'H') {
add('F');
current += 2;
break;
}
//also account for 'campbell', 'raspberry'
if (charAt(current + 1, BP))
current += 2;
else
current++;
add('P');
break;
case 'Q': // NTR: this is typical default behavior
addCode('Q', 'K');
break;
case 'R':
//french e.g. 'rogier', but exclude 'hochmeier'
if ((current == last)
&& !isSlavoGermanic()
&& stringAt(current - 2, 2, "IE")
&& !stringAt(current - 4, 2, MaMe)) {
alternate.append('R');
} else {
add('R');
}
current++;
if(input.charAt(current) == 'R') current++; // doublecheck
break;
case 'S':
//special cases 'island', 'isle', 'carlisle', 'carlysle'
if (stringAt(current - 1, 3, IslYsl)) {
current++;
break;
}
//special case 'sugar-'
if ((current == 0) && stringAt(current, 5, "SUGAR")) {
add('X', 'S');
current++;
break;
}
if (stringAt(current, 2, "SH")) {
//germanic
if (stringAt(current + 1, 4, HeimHoekHolmHolz)) {
add('S');
} else {
add('X');
}
current += 2;
break;
}
//italian & armenian
if (stringAt(current, 3, SiaSio)
|| stringAt(current, 4, "SIAN")) {
if (!isSlavoGermanic()) {
add('S', 'X');
} else {
add('S');
}
current += 3;
break;
}
//german & anglicisations, e.g. 'smith' match 'schmidt', 'snider' match 'schneider'
//also, -sz- in slavic language altho in hungarian it is pronounced 's'
if ((current == 0 && charAt(current + 1, LMNW))
|| input.charAt(current + 1) == 'Z') {
add('S', 'X');
if (input.charAt(current + 1) == 'Z') {
current += 2;
} else {
current++;
}
break;
}
if (stringAt(current, 2, "SC")) {
//Schlesinger's rule
if (input.charAt(current + 2) == 'H') {
//dutch origin, e.g. 'school', 'schooner'
if (stringAt(current + 3, 2, EdEmEnErOoUy)) {
//'schermerhorn', 'schenker'
if (stringAt((current + 3), 2, EnEr)) {
add('X', 'S');
alternate.append('K');
} else {
add('S');
add('K');
}
current += 3;
break;
} else {
if (current == 0 && !isVowel(3)
&& input.charAt(3) != 'W') {
add('X', 'S');
} else {
add('X');
}
current += 3;
break;
}
}
if (charAt(current + 2, EIY)) {
add('S');
current += 3;
break;
}
//else
add('S');
add('K');
current += 3;
break;
}
//french e.g. 'resnais', 'artois'
if (current == last && stringAt(current - 2, 2, AiOi)) {
alternate.append('S');
} else {
add('S');
}
if (charAt(current + 1, SZ)) {
current += 2;
} else {
current++;
}
break;
case 'T':
if (stringAt(current, 4, "TION")) {
add('X');
current += 3;
break;
}
if (stringAt(current, 3, TiaTch)) {
add('X');
current += 3;
break;
}
if (stringAt(current, 2, "TH") || stringAt(current, 3, "TTH")) {
//special case 'thomas', 'thames' or germanic
if (stringAt(current + 2, 2, AmOm)
|| stringAt(0, 4, Van_Von_)
|| stringAt(0, 3, "SCH")) {
add('T');
} else {
add('0', 'T');
}
current += 2;
break;
}
if (charAt(current + 1, DT))
current += 2;
else
current++;
add('T');
break;
case 'V': // NTR: this is typical default behavior
addCode('V', 'F');
break;
case 'W':
//can also be in middle of word
if (stringAt(current, 2, "WR")) {
add('R');
current += 2;
break;
}
if (current == 0 && (isVowel(current + 1)
|| stringAt(current, 2, "WH"))) {
//Wasserman should match Vasserman
if (isVowel(current + 1)) {
add('A', 'F');
} else {
//need 'Uomo' to match 'Womo'
add('A');
}
}
//'Arnow' should match 'Arnoff'
if ((current == last && isVowel(current - 1))
|| stringAt(current - 1, 5, EwskiEwskyOwskiOwsky)
|| stringAt(0, 3, "SCH")) {
alternate.append('F');
current +=1;
break;
}
//polish e.g. 'filipowicz'
if (stringAt(current, 4, WiczWitz)) {
add('T', 'F');
add('S', 'X');
current +=4;
break;
}
//else skip it
current +=1;
break;
case 'X':
//french e.g. breaux
if (!(current == last && (stringAt((current - 3), 3, EauIau)
|| stringAt((current - 2), 2, AuOu))) ) {
add('K');
add('S');
}
if (charAt(current + 1, CX)) {
current += 2;
} else {
current++;
}
break;
case 'Z':
//chinese pinyin e.g. 'zhao'
if (input.charAt(current + 1) == 'H') {
add('J');
current += 2;
break;
} else {
if (stringAt(current + 1, 2, ZaZiZo)
|| (isSlavoGermanic() && (current > 0
&& input.charAt(current - 1) != 'T'))) {
alternate.append('T');
add('S');
} else {
add('S');
}
}
if (input.charAt(current + 1) == 'Z') {
current += 2;
} else {
current++;
}
break;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
add(input.charAt(current));
current++;
break;
default:
current++;
} // switch
} // while
// Only give back the specified length
if (primary.length() > encodeLimit) {
primary.delete(encodeLimit, primary.length());
}
if (alternate.length() > encodeLimit) {
alternate.delete(encodeLimit, alternate.length());
}
return primary.toString();
}
}