/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.commons.codec.language; import org.apache.commons.codec.EncoderException; import org.apache.commons.codec.StringEncoderAbstractTest; import org.junit.Assert; import org.junit.Test; /** * Tests {@link DaitchMokotoffSoundex}. * <p> * Keep this file in UTF-8 encoding for proper Javadoc processing. * </p> * * @since 1.10 */ public class DaitchMokotoffSoundexTest extends StringEncoderAbstractTest<DaitchMokotoffSoundex> { @Override protected DaitchMokotoffSoundex createStringEncoder() { return new DaitchMokotoffSoundex(); } private String soundex(final String source) { return getStringEncoder().soundex(source); } private String encode(final String source) { return getStringEncoder().encode(source); } @Test public void testAccentedCharacterFolding() { Assert.assertEquals("294795", soundex("Straßburg")); Assert.assertEquals("294795", soundex("Strasburg")); Assert.assertEquals("095600", soundex("Éregon")); Assert.assertEquals("095600", soundex("Eregon")); } @Test public void testAdjacentCodes() { // AKSSOL // A-KS-S-O-L // 0-54-4---8 -> wrong // 0-54-----8 -> correct Assert.assertEquals("054800", soundex("AKSSOL")); // GERSCHFELD // G-E-RS-CH-F-E-L-D // 5--4/94-5/4-7-8-3 -> wrong // 5--4/94-5/--7-8-3 -> correct Assert.assertEquals("547830|545783|594783|594578", soundex("GERSCHFELD")); } public void testEncodeBasic() { // same as above, but without branching Assert.assertEquals("097400", encode("AUERBACH")); Assert.assertEquals("097400", encode("OHRBACH")); Assert.assertEquals("874400", encode("LIPSHITZ")); Assert.assertEquals("874400", encode("LIPPSZYC")); Assert.assertEquals("876450", encode("LEWINSKY")); Assert.assertEquals("876450", encode("LEVINSKI")); Assert.assertEquals("486740", encode("SZLAMAWICZ")); Assert.assertEquals("486740", encode("SHLAMOVITZ")); } @Test public void testEncodeIgnoreApostrophes() throws EncoderException { this.checkEncodingVariations("079600", new String[] { "OBrien", "'OBrien", "O'Brien", "OB'rien", "OBr'ien", "OBri'en", "OBrie'n", "OBrien'" }); } /** * Test data from http://www.myatt.demon.co.uk/sxalg.htm * * @throws EncoderException */ @Test public void testEncodeIgnoreHyphens() throws EncoderException { this.checkEncodingVariations("565463", new String[] { "KINGSMITH", "-KINGSMITH", "K-INGSMITH", "KI-NGSMITH", "KIN-GSMITH", "KING-SMITH", "KINGS-MITH", "KINGSM-ITH", "KINGSMI-TH", "KINGSMIT-H", "KINGSMITH-" }); } @Test public void testEncodeIgnoreTrimmable() { Assert.assertEquals("746536", encode(" \t\n\r Washington \t\n\r ")); Assert.assertEquals("746536", encode("Washington")); } /** * Examples from http://www.jewishgen.org/infofiles/soundex.html */ @Test public void testSoundexBasic() { Assert.assertEquals("583600", soundex("GOLDEN")); Assert.assertEquals("087930", soundex("Alpert")); Assert.assertEquals("791900", soundex("Breuer")); Assert.assertEquals("579000", soundex("Haber")); Assert.assertEquals("665600", soundex("Mannheim")); Assert.assertEquals("664000", soundex("Mintz")); Assert.assertEquals("370000", soundex("Topf")); Assert.assertEquals("586660", soundex("Kleinmann")); Assert.assertEquals("769600", soundex("Ben Aron")); Assert.assertEquals("097400|097500", soundex("AUERBACH")); Assert.assertEquals("097400|097500", soundex("OHRBACH")); Assert.assertEquals("874400", soundex("LIPSHITZ")); Assert.assertEquals("874400|874500", soundex("LIPPSZYC")); Assert.assertEquals("876450", soundex("LEWINSKY")); Assert.assertEquals("876450", soundex("LEVINSKI")); Assert.assertEquals("486740", soundex("SZLAMAWICZ")); Assert.assertEquals("486740", soundex("SHLAMOVITZ")); } /** * Examples from http://www.avotaynu.com/soundex.htm */ @Test public void testSoundexBasic2() { Assert.assertEquals("467000|567000", soundex("Ceniow")); Assert.assertEquals("467000", soundex("Tsenyuv")); Assert.assertEquals("587400|587500", soundex("Holubica")); Assert.assertEquals("587400", soundex("Golubitsa")); Assert.assertEquals("746480|794648", soundex("Przemysl")); Assert.assertEquals("746480", soundex("Pshemeshil")); Assert.assertEquals("944744|944745|944754|944755|945744|945745|945754|945755", soundex("Rosochowaciec")); Assert.assertEquals("945744", soundex("Rosokhovatsets")); } /** * Examples from http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex */ @Test public void testSoundexBasic3() { Assert.assertEquals("734000|739400", soundex("Peters")); Assert.assertEquals("734600|739460", soundex("Peterson")); Assert.assertEquals("645740", soundex("Moskowitz")); Assert.assertEquals("645740", soundex("Moskovitz")); Assert.assertEquals("154600|145460|454600|445460", soundex("Jackson")); Assert.assertEquals("154654|154645|154644|145465|145464|454654|454645|454644|445465|445464", soundex("Jackson-Jackson")); } @Test public void testSpecialRomanianCharacters() { Assert.assertEquals("364000|464000", soundex("ţamas")); // t-cedilla Assert.assertEquals("364000|464000", soundex("țamas")); // t-comma } }