package org.gbif.checklistbank.lucene; import java.io.IOException; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; /** * Based on character transpositions found in Tony Reese's TaxonMatch. */ public class ScientificNameSoundAlikeFilter extends TokenFilter { private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); /** * Construct a token stream filtering the given input. */ public ScientificNameSoundAlikeFilter(TokenStream input) { super(input); } @Override public final boolean incrementToken() throws IOException { if (!input.incrementToken()) return false; final char[] buffer = termAtt.buffer(); final int bufferLength = termAtt.length(); // Do some selective replacement on the leading letter/s only: if (bufferLength > 2) { String start3 = new String(buffer, 0, 2); if (start3.startsWith("ae")) { start3 = "E" + start3.substring(2); } else if (start3.startsWith("cn")) { start3 = "N" + start3.substring(2); } else if (start3.startsWith("ct")) { start3 = "T" + start3.substring(2); } else if (start3.startsWith("cz")) { start3 = "C" + start3.substring(2); } else if (start3.startsWith("dj")) { start3 = "J" + start3.substring(2); } else if (start3.startsWith("ea")) { start3 = "E" + start3.substring(2); } else if (start3.startsWith("eu")) { start3 = "U" + start3.substring(2); } else if (start3.startsWith("gn")) { start3 = "N" + start3.substring(2); } else if (start3.startsWith("kn")) { start3 = "N" + start3.substring(2); } else if (start3.startsWith("mc")) { start3 = "MAC" + start3.substring(2); } else if (start3.startsWith("mn")) { start3 = "N" + start3.substring(2); } else if (start3.startsWith("oe")) { start3 = "E" + start3.substring(2); } else if (start3.startsWith("qu")) { start3 = "Q" + start3.substring(2); } else if (start3.startsWith("ps")) { start3 = "S" + start3.substring(2); } else if (start3.startsWith("pt")) { start3 = "T" + start3.substring(2); } else if (start3.startsWith("ts")) { start3 = "S" + start3.substring(2); } else if (start3.startsWith("wr")) { start3 = "R" + start3.substring(2); } else if (start3.startsWith("x")) { start3 = "Z" + start3.substring(2); } } // Now keep the leading character, then do selected "soundalike" replacements. // The following letters are equated: AE, OE, E, U, Y and I; IA and A are equated; // K and C; Z and S; and H is dropped. // Also, A and O are equated, MAC and MC are equated, and SC and S. if (bufferLength > 1) { int upto = Character.isWhitespace(buffer[0]) ? 0 : 1; char c1 = ' '; for(int i=1; i<bufferLength; i++) { char c = buffer[i]; boolean skip = false; // replace all whitespace with spaces if (Character.isWhitespace(c)) { c = ' '; } else { switch (buffer[i]) { case 'a': if (c1 == 'i') { upto--; c='a'; } break; case 'c': if (c1 == 's') { skip = true; } break; case 'e': if (c1 == 'a' || c1 == 'o') { upto--; } c='i'; break; case 'i': if (c1 == 'o') { upto--; c='a'; } break; case 'o': c='a'; break; case 'u': c='i'; break; case 'y': c='i'; break; case 'k': c='c'; break; case 'z': c='c'; break; case 'h': skip=true; break; } } // drop any repeated characters (AA becomes A, BB or BBB becomes B, etc.) if (!skip && upto > 0 && buffer[upto-1] == c) { skip = true; } // remember original char c1 = buffer[i]; // alter buffer if (!skip) { buffer[upto++] = c; } } termAtt.setLength(upto); } //termAtt.setEmpty().append(treatWord(termAtt.toString(), false)); return true; } public static String treatWord(String str2, boolean isSpecies) { char startLetter; String temp = str2.toUpperCase(); // Do some selective replacement on the leading letter/s only: if (temp.startsWith("AE")) { temp = "E" + temp.substring(2); } else if (temp.startsWith("CN")) { temp = "N" + temp.substring(2); } else if (temp.startsWith("CT")) { temp = "T" + temp.substring(2); } else if (temp.startsWith("CZ")) { temp = "C" + temp.substring(2); } else if (temp.startsWith("DJ")) { temp = "J" + temp.substring(2); } else if (temp.startsWith("EA")) { temp = "E" + temp.substring(2); } else if (temp.startsWith("EU")) { temp = "U" + temp.substring(2); } else if (temp.startsWith("GN")) { temp = "N" + temp.substring(2); } else if (temp.startsWith("KN")) { temp = "N" + temp.substring(2); } else if (temp.startsWith("MC")) { temp = "MAC" + temp.substring(2); } else if (temp.startsWith("MN")) { temp = "N" + temp.substring(2); } else if (temp.startsWith("OE")) { temp = "E" + temp.substring(2); } else if (temp.startsWith("QU")) { temp = "Q" + temp.substring(2); } else if (temp.startsWith("PS")) { temp = "S" + temp.substring(2); } else if (temp.startsWith("PT")) { temp = "T" + temp.substring(2); } else if (temp.startsWith("TS")) { temp = "S" + temp.substring(2); } else if (temp.startsWith("WR")) { temp = "R" + temp.substring(2); } else if (temp.startsWith("X")) { temp = "Z" + temp.substring(2); } // Now keep the leading character, then do selected "soundalike" replacements. The // following letters are equated: AE, OE, E, U, Y and I; IA and A are equated; // K and C; Z and S; and H is dropped. Also, A and O are equated, MAC and MC are equated, and SC and S. startLetter = temp.charAt(0); // quarantine the leading letter temp = temp.substring(1); // snip off the leading letter // now do the replacements temp = temp.replaceAll("AE", "I"); temp = temp.replaceAll("IA", "A"); temp = temp.replaceAll("OE", "I"); temp = temp.replaceAll("OI", "A"); temp = temp.replaceAll("SC", "S"); temp = temp.replaceAll("E", "I"); temp = temp.replaceAll("O", "A"); temp = temp.replaceAll("U", "I"); temp = temp.replaceAll("Y", "I"); temp = temp.replaceAll("K", "C"); temp = temp.replaceAll("Z", "C"); temp = temp.replaceAll("H", ""); // add back the leading letter temp = startLetter + temp; // now drop any repeated characters (AA becomes A, BB or BBB becomes B, etc.) temp = temp.replaceAll("(\\w)\\1+", "$1"); if (isSpecies) { if (temp.endsWith("IS")) { temp = temp.substring(0, temp.length() - 2) + "A"; } else if (temp.endsWith("IM")) { temp = temp.substring(0, temp.length() - 2) + "A"; } else if (temp.endsWith("AS")) { temp = temp.substring(0, temp.length() - 2) + "A"; } //temp = temp.replaceAll("(\\w)\\1+", "$1"); } return temp; } }