/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ // (FYI: Formatted and sorted with Eclipse) package org.apache.commons.codec.language; import org.junit.Assert; import org.apache.commons.codec.EncoderException; import org.apache.commons.codec.StringEncoderAbstractTest; import org.junit.Test; /** * Tests {@link Soundex}. * * <p>Keep this file in UTF-8 encoding for proper Javadoc processing.</p> * * @version $Id: SoundexTest.java 1437501 2013-01-23 15:51:21Z ggregory $ */ public class SoundexTest extends StringEncoderAbstractTest<Soundex> { @Override protected Soundex createStringEncoder() { return new Soundex(); } @Test public void testB650() throws EncoderException { this.checkEncodingVariations("B650", new String[]{ "BARHAM", "BARONE", "BARRON", "BERNA", "BIRNEY", "BIRNIE", "BOOROM", "BOREN", "BORN", "BOURN", "BOURNE", "BOWRON", "BRAIN", "BRAME", "BRANN", "BRAUN", "BREEN", "BRIEN", "BRIM", "BRIMM", "BRINN", "BRION", "BROOM", "BROOME", "BROWN", "BROWNE", "BRUEN", "BRUHN", "BRUIN", "BRUMM", "BRUN", "BRUNO", "BRYAN", "BURIAN", "BURN", "BURNEY", "BYRAM", "BYRNE", "BYRON", "BYRUM"}); } @Test public void testBadCharacters() { Assert.assertEquals("H452", this.getStringEncoder().encode("HOL>MES")); } @Test public void testDifference() throws EncoderException { // Edge cases Assert.assertEquals(0, this.getStringEncoder().difference(null, null)); Assert.assertEquals(0, this.getStringEncoder().difference("", "")); Assert.assertEquals(0, this.getStringEncoder().difference(" ", " ")); // Normal cases Assert.assertEquals(4, this.getStringEncoder().difference("Smith", "Smythe")); Assert.assertEquals(2, this.getStringEncoder().difference("Ann", "Andrew")); Assert.assertEquals(1, this.getStringEncoder().difference("Margaret", "Andrew")); Assert.assertEquals(0, this.getStringEncoder().difference("Janet", "Margaret")); // Examples from http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp Assert.assertEquals(4, this.getStringEncoder().difference("Green", "Greene")); Assert.assertEquals(0, this.getStringEncoder().difference("Blotchet-Halls", "Greene")); // Examples from http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_setu-sus_3o6w.asp Assert.assertEquals(4, this.getStringEncoder().difference("Smith", "Smythe")); Assert.assertEquals(4, this.getStringEncoder().difference("Smithers", "Smythers")); Assert.assertEquals(2, this.getStringEncoder().difference("Anothers", "Brothers")); } @Test public void testEncodeBasic() { Assert.assertEquals("T235", this.getStringEncoder().encode("testing")); Assert.assertEquals("T000", this.getStringEncoder().encode("The")); Assert.assertEquals("Q200", this.getStringEncoder().encode("quick")); Assert.assertEquals("B650", this.getStringEncoder().encode("brown")); Assert.assertEquals("F200", this.getStringEncoder().encode("fox")); Assert.assertEquals("J513", this.getStringEncoder().encode("jumped")); Assert.assertEquals("O160", this.getStringEncoder().encode("over")); Assert.assertEquals("T000", this.getStringEncoder().encode("the")); Assert.assertEquals("L200", this.getStringEncoder().encode("lazy")); Assert.assertEquals("D200", this.getStringEncoder().encode("dogs")); } /** * Examples from http://www.bradandkathy.com/genealogy/overviewofsoundex.html */ @Test public void testEncodeBatch2() { Assert.assertEquals("A462", this.getStringEncoder().encode("Allricht")); Assert.assertEquals("E166", this.getStringEncoder().encode("Eberhard")); Assert.assertEquals("E521", this.getStringEncoder().encode("Engebrethson")); Assert.assertEquals("H512", this.getStringEncoder().encode("Heimbach")); Assert.assertEquals("H524", this.getStringEncoder().encode("Hanselmann")); Assert.assertEquals("H431", this.getStringEncoder().encode("Hildebrand")); Assert.assertEquals("K152", this.getStringEncoder().encode("Kavanagh")); Assert.assertEquals("L530", this.getStringEncoder().encode("Lind")); Assert.assertEquals("L222", this.getStringEncoder().encode("Lukaschowsky")); Assert.assertEquals("M235", this.getStringEncoder().encode("McDonnell")); Assert.assertEquals("M200", this.getStringEncoder().encode("McGee")); Assert.assertEquals("O155", this.getStringEncoder().encode("Opnian")); Assert.assertEquals("O155", this.getStringEncoder().encode("Oppenheimer")); Assert.assertEquals("R355", this.getStringEncoder().encode("Riedemanas")); Assert.assertEquals("Z300", this.getStringEncoder().encode("Zita")); Assert.assertEquals("Z325", this.getStringEncoder().encode("Zitzmeinn")); } /** * Examples from http://www.archives.gov/research_room/genealogy/census/soundex.html */ @Test public void testEncodeBatch3() { Assert.assertEquals("W252", this.getStringEncoder().encode("Washington")); Assert.assertEquals("L000", this.getStringEncoder().encode("Lee")); Assert.assertEquals("G362", this.getStringEncoder().encode("Gutierrez")); Assert.assertEquals("P236", this.getStringEncoder().encode("Pfister")); Assert.assertEquals("J250", this.getStringEncoder().encode("Jackson")); Assert.assertEquals("T522", this.getStringEncoder().encode("Tymczak")); // For VanDeusen: D-250 (D, 2 for the S, 5 for the N, 0 added) is also // possible. Assert.assertEquals("V532", this.getStringEncoder().encode("VanDeusen")); } /** * Examples from: http://www.myatt.demon.co.uk/sxalg.htm */ @Test public void testEncodeBatch4() { Assert.assertEquals("H452", this.getStringEncoder().encode("HOLMES")); Assert.assertEquals("A355", this.getStringEncoder().encode("ADOMOMI")); Assert.assertEquals("V536", this.getStringEncoder().encode("VONDERLEHR")); Assert.assertEquals("B400", this.getStringEncoder().encode("BALL")); Assert.assertEquals("S000", this.getStringEncoder().encode("SHAW")); Assert.assertEquals("J250", this.getStringEncoder().encode("JACKSON")); Assert.assertEquals("S545", this.getStringEncoder().encode("SCANLON")); Assert.assertEquals("S532", this.getStringEncoder().encode("SAINTJOHN")); } @Test public void testEncodeIgnoreApostrophes() throws EncoderException { this.checkEncodingVariations("O165", new String[]{ "OBrien", "'OBrien", "O'Brien", "OB'rien", "OBr'ien", "OBri'en", "OBrie'n", "OBrien'"}); } /** * Test data from http://www.myatt.demon.co.uk/sxalg.htm * * @throws EncoderException */ @Test public void testEncodeIgnoreHyphens() throws EncoderException { this.checkEncodingVariations("K525", new String[]{ "KINGSMITH", "-KINGSMITH", "K-INGSMITH", "KI-NGSMITH", "KIN-GSMITH", "KING-SMITH", "KINGS-MITH", "KINGSM-ITH", "KINGSMI-TH", "KINGSMIT-H", "KINGSMITH-"}); } @Test public void testEncodeIgnoreTrimmable() { Assert.assertEquals("W252", this.getStringEncoder().encode(" \t\n\r Washington \t\n\r ")); } /** * Consonants from the same code group separated by W or H are treated as one. */ @Test public void testHWRuleEx1() { // From // http://www.archives.gov/research_room/genealogy/census/soundex.html: // Ashcraft is coded A-261 (A, 2 for the S, C ignored, 6 for the R, 1 // for the F). It is not coded A-226. Assert.assertEquals("A261", this.getStringEncoder().encode("Ashcraft")); } /** * Consonants from the same code group separated by W or H are treated as one. * * Test data from http://www.myatt.demon.co.uk/sxalg.htm */ @Test public void testHWRuleEx2() { Assert.assertEquals("B312", this.getStringEncoder().encode("BOOTHDAVIS")); Assert.assertEquals("B312", this.getStringEncoder().encode("BOOTH-DAVIS")); } /** * Consonants from the same code group separated by W or H are treated as one. * * @throws EncoderException */ @Test public void testHWRuleEx3() throws EncoderException { Assert.assertEquals("S460", this.getStringEncoder().encode("Sgler")); Assert.assertEquals("S460", this.getStringEncoder().encode("Swhgler")); // Also S460: this.checkEncodingVariations("S460", new String[]{ "SAILOR", "SALYER", "SAYLOR", "SCHALLER", "SCHELLER", "SCHILLER", "SCHOOLER", "SCHULER", "SCHUYLER", "SEILER", "SEYLER", "SHOLAR", "SHULER", "SILAR", "SILER", "SILLER"}); } /** * Examples for MS SQLServer from * http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_setu-sus_3o6w.asp */ @Test public void testMsSqlServer1() { Assert.assertEquals("S530", this.getStringEncoder().encode("Smith")); Assert.assertEquals("S530", this.getStringEncoder().encode("Smythe")); } /** * Examples for MS SQLServer from * http://support.microsoft.com/default.aspx?scid=http://support.microsoft.com:80/support * /kb/articles/Q100/3/65.asp&NoWebContent=1 * * @throws EncoderException */ @Test public void testMsSqlServer2() throws EncoderException { this.checkEncodingVariations("E625", new String[]{"Erickson", "Erickson", "Erikson", "Ericson", "Ericksen", "Ericsen"}); } /** * Examples for MS SQLServer from http://databases.about.com/library/weekly/aa042901a.htm */ @Test public void testMsSqlServer3() { Assert.assertEquals("A500", this.getStringEncoder().encode("Ann")); Assert.assertEquals("A536", this.getStringEncoder().encode("Andrew")); Assert.assertEquals("J530", this.getStringEncoder().encode("Janet")); Assert.assertEquals("M626", this.getStringEncoder().encode("Margaret")); Assert.assertEquals("S315", this.getStringEncoder().encode("Steven")); Assert.assertEquals("M240", this.getStringEncoder().encode("Michael")); Assert.assertEquals("R163", this.getStringEncoder().encode("Robert")); Assert.assertEquals("L600", this.getStringEncoder().encode("Laura")); Assert.assertEquals("A500", this.getStringEncoder().encode("Anne")); } /** * https://issues.apache.org/jira/browse/CODEC-54 https://issues.apache.org/jira/browse/CODEC-56 */ @Test public void testNewInstance() { Assert.assertEquals("W452", new Soundex().soundex("Williams")); } @Test public void testNewInstance2() { Assert.assertEquals("W452", new Soundex(Soundex.US_ENGLISH_MAPPING_STRING.toCharArray()).soundex("Williams")); } @Test public void testNewInstance3() { Assert.assertEquals("W452", new Soundex(Soundex.US_ENGLISH_MAPPING_STRING).soundex("Williams")); } @Test public void testSoundexUtilsConstructable() { new SoundexUtils(); } @Test public void testSoundexUtilsNullBehaviour() { Assert.assertEquals(null, SoundexUtils.clean(null)); Assert.assertEquals("", SoundexUtils.clean("")); Assert.assertEquals(0, SoundexUtils.differenceEncoded(null, "")); Assert.assertEquals(0, SoundexUtils.differenceEncoded("", null)); } /** * https://issues.apache.org/jira/browse/CODEC-54 https://issues.apache.org/jira/browse/CODEC-56 */ @Test public void testUsEnglishStatic() { Assert.assertEquals("W452", Soundex.US_ENGLISH.soundex("Williams")); } /** * Fancy characters are not mapped by the default US mapping. * * http://issues.apache.org/bugzilla/show_bug.cgi?id=29080 */ @Test public void testUsMappingEWithAcute() { Assert.assertEquals("E000", this.getStringEncoder().encode("e")); if (Character.isLetter('\u00e9')) { // e-acute try { // uppercase E-acute Assert.assertEquals("\u00c9000", this.getStringEncoder().encode("\u00e9")); Assert.fail("Expected IllegalArgumentException not thrown"); } catch (final IllegalArgumentException e) { // expected } } else { Assert.assertEquals("", this.getStringEncoder().encode("\u00e9")); } } /** * Fancy characters are not mapped by the default US mapping. * * http://issues.apache.org/bugzilla/show_bug.cgi?id=29080 */ @Test public void testUsMappingOWithDiaeresis() { Assert.assertEquals("O000", this.getStringEncoder().encode("o")); if (Character.isLetter('\u00f6')) { // o-umlaut try { // uppercase O-umlaut Assert.assertEquals("\u00d6000", this.getStringEncoder().encode("\u00f6")); Assert.fail("Expected IllegalArgumentException not thrown"); } catch (final IllegalArgumentException e) { // expected } } else { Assert.assertEquals("", this.getStringEncoder().encode("\u00f6")); } } }