/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.commons.codec.language; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import org.apache.commons.codec.EncoderException; import org.apache.commons.codec.StringEncoderAbstractTest; import org.junit.Test; /** * Tests {@link DoubleMetaphone}. * * <p>Keep this file in UTF-8 encoding for proper Javadoc processing.</p> * * @see "http://www.cuj.com/documents/s=8038/cuj0006philips/" * @version $Id: DoubleMetaphoneTest.java 1468715 2013-04-17 02:10:48Z ggregory $ */ public class DoubleMetaphoneTest extends StringEncoderAbstractTest<DoubleMetaphone> { /** * Test data from http://aspell.net/test/orig/batch0.tab. * * "Copyright (C) 2002 Kevin Atkinson (kevina@gnu.org). Verbatim copying * and distribution of this entire article is permitted in any medium, * provided this notice is preserved." * * Massaged the test data in the array below. 
*/ private static final String[][] FIXTURE = { { "Accosinly", "Occasionally" }, { "Ciculer", "Circler" }, { "Circue", "Circle" }, { "Maddness", "Madness" }, { "Occusionaly", "Occasionally" }, { "Steffen", "Stephen" }, { "Thw", "The" }, { "Unformanlly", "Unfortunately" }, { "Unfortally", "Unfortunately" }, { "abilitey", "ability" }, { "abouy", "about" }, { "absorbtion", "absorption" }, { "accidently", "accidentally" }, { "accomodate", "accommodate" }, { "acommadate", "accommodate" }, { "acord", "accord" }, { "adultry", "adultery" }, { "aggresive", "aggressive" }, { "alchohol", "alcohol" }, { "alchoholic", "alcoholic" }, { "allieve", "alive" }, { "alot", "a lot" }, { "alright", "all right" }, { "amature", "amateur" }, { "ambivilant", "ambivalent" }, { "amification", "amplification" }, { "amourfous", "amorphous" }, { "annoint", "anoint" }, { "annonsment", "announcement" }, { "annoyting", "anting" }, { "annuncio", "announce" }, { "anonomy", "anatomy" }, { "anotomy", "anatomy" }, { "antidesestablishmentarianism", "antidisestablishmentarianism" }, { "antidisestablishmentarism", "antidisestablishmentarianism" }, { "anynomous", "anonymous" }, { "appelet", "applet" }, { "appreceiated", "appreciated" }, { "appresteate", "appreciate" }, { "aquantance", "acquaintance" }, { "aratictature", "architecture" }, { "archeype", "archetype" }, { "aricticure", "architecture" }, { "artic", "arctic" }, { "asentote", "asymptote" }, { "ast", "at" }, { "asterick", "asterisk" }, { "asymetric", "asymmetric" }, { "atentively", "attentively" }, { "autoamlly", "automatically" }, { "bankrot", "bankrupt" }, { "basicly", "basically" }, { "batallion", "battalion" }, { "bbrose", "browse" }, { "beauro", "bureau" }, { "beaurocracy", "bureaucracy" }, { "beggining", "beginning" }, { "beging", "beginning" }, { "behaviour", "behavior" }, { "beleive", "believe" }, { "belive", "believe" }, { "benidifs", "benefits" }, { "bigginging", "beginning" }, { "blait", "bleat" }, { "bouyant", "buoyant" }, { "boygot", 
"boycott" }, { "brocolli", "broccoli" }, { "buch", "bush" }, { "buder", "butter" }, { "budr", "butter" }, { "budter", "butter" }, { "buracracy", "bureaucracy" }, { "burracracy", "bureaucracy" }, { "buton", "button" }, { "byby", "by by" }, { "cauler", "caller" }, { "ceasar", "caesar" }, { "cemetary", "cemetery" }, { "changeing", "changing" }, { "cheet", "cheat" }, { "cicle", "circle" }, { "cimplicity", "simplicity" }, { "circumstaces", "circumstances" }, { "clob", "club" }, { "coaln", "colon" }, { "cocamena", "cockamamie" }, { "colleaque", "colleague" }, { "colloquilism", "colloquialism" }, { "columne", "column" }, { "comiler", "compiler" }, { "comitmment", "commitment" }, { "comitte", "committee" }, { "comittmen", "commitment" }, { "comittmend", "commitment" }, { "commerciasl", "commercials" }, { "commited", "committed" }, { "commitee", "committee" }, { "companys", "companies" }, { "compicated", "complicated" }, { "comupter", "computer" }, { "concensus", "consensus" }, { "confusionism", "confucianism" }, { "congradulations", "congratulations" }, { "conibation", "contribution" }, { "consident", "consistent" }, { "consident", "consonant" }, { "contast", "constant" }, { "contastant", "constant" }, { "contunie", "continue" }, { "cooly", "coolly" }, { "copping", "coping" }, { "cosmoplyton", "cosmopolitan" }, { "courst", "court" }, { "crasy", "crazy" }, { "cravets", "caveats" }, { "credetability", "credibility" }, { "criqitue", "critique" }, { "croke", "croak" }, { "crucifiction", "crucifixion" }, { "crusifed", "crucified" }, { "ctitique", "critique" }, { "cumba", "combo" }, { "custamisation", "customization" }, { "dag", "dog" }, { "daly", "daily" }, { "danguages", "dangerous" }, { "deaft", "draft" }, { "defence", "defense" }, { "defenly", "defiantly" }, { "definate", "definite" }, { "definately", "definitely" }, { "dependeble", "dependable" }, { "descrption", "description" }, { "descrptn", "description" }, { "desparate", "desperate" }, { "dessicate", "desiccate" }, { 
"destint", "distant" }, { "develepment", "developments" }, { "developement", "development" }, { "develpond", "development" }, { "devulge", "divulge" }, { "diagree", "disagree" }, { "dieties", "deities" }, { "dinasaur", "dinosaur" }, { "dinasour", "dinosaur" }, { "direcyly", "directly" }, { "discuess", "discuss" }, { "disect", "dissect" }, { "disippate", "dissipate" }, { "disition", "decision" }, { "dispair", "despair" }, { "disssicion", "discussion" }, { "distarct", "distract" }, { "distart", "distort" }, { "distroy", "destroy" }, { "documtations", "documentation" }, { "doenload", "download" }, { "dongle", "dangle" }, { "doog", "dog" }, { "dramaticly", "dramatically" }, { "drunkeness", "drunkenness" }, { "ductioneery", "dictionary" }, { "dur", "due" }, { "duren", "during" }, { "dymatic", "dynamic" }, { "dynaic", "dynamic" }, { "ecstacy", "ecstasy" }, { "efficat", "efficient" }, { "efficity", "efficacy" }, { "effots", "efforts" }, { "egsistence", "existence" }, { "eitiology", "etiology" }, { "elagent", "elegant" }, { "elligit", "elegant" }, { "embarass", "embarrass" }, { "embarassment", "embarrassment" }, { "embaress", "embarrass" }, { "encapsualtion", "encapsulation" }, { "encyclapidia", "encyclopedia" }, { "encyclopia", "encyclopedia" }, { "engins", "engine" }, { "enhence", "enhance" }, { "enligtment", "Enlightenment" }, { "ennuui", "ennui" }, { "enought", "enough" }, { "enventions", "inventions" }, { "envireminakl", "environmental" }, { "enviroment", "environment" }, { "epitomy", "epitome" }, { "equire", "acquire" }, { "errara", "error" }, { "erro", "error" }, { "evaualtion", "evaluation" }, { "evething", "everything" }, { "evtually", "eventually" }, { "excede", "exceed" }, { "excercise", "exercise" }, { "excpt", "except" }, { "excution", "execution" }, { "exhileration", "exhilaration" }, { "existance", "existence" }, { "expleyly", "explicitly" }, { "explity", "explicitly" }, { "expresso", "espresso" }, { "exspidient", "expedient" }, { "extions", "extensions" }, 
{ "factontion", "factorization" }, { "failer", "failure" }, { "famdasy", "fantasy" }, { "faver", "favor" }, { "faxe", "fax" }, { "febuary", "february" }, { "firey", "fiery" }, { "fistival", "festival" }, { "flatterring", "flattering" }, { "fluk", "flux" }, { "flukse", "flux" }, { "fone", "phone" }, { "forsee", "foresee" }, { "frustartaion", "frustrating" }, { "fuction", "function" }, { "funetik", "phonetic" }, { "futs", "guts" }, { "gamne", "came" }, { "gaurd", "guard" }, { "generly", "generally" }, { "ghandi", "gandhi" }, { "goberment", "government" }, { "gobernement", "government" }, { "gobernment", "government" }, { "gotton", "gotten" }, { "gracefull", "graceful" }, { "gradualy", "gradually" }, { "grammer", "grammar" }, { "hallo", "hello" }, { "hapily", "happily" }, { "harrass", "harass" }, { "havne", "have" }, { "heellp", "help" }, { "heighth", "height" }, { "hellp", "help" }, { "helo", "hello" }, { "herlo", "hello" }, { "hifin", "hyphen" }, { "hifine", "hyphen" }, { "higer", "higher" }, { "hiphine", "hyphen" }, { "hippie", "hippy" }, { "hippopotamous", "hippopotamus" }, { "hlp", "help" }, { "hourse", "horse" }, { "houssing", "housing" }, { "howaver", "however" }, { "howver", "however" }, { "humaniti", "humanity" }, { "hyfin", "hyphen" }, { "hypotathes", "hypothesis" }, { "hypotathese", "hypothesis" }, { "hystrical", "hysterical" }, { "ident", "indent" }, { "illegitament", "illegitimate" }, { "imbed", "embed" }, { "imediaetly", "immediately" }, { "imfamy", "infamy" }, { "immenant", "immanent" }, { "implemtes", "implements" }, { "inadvertant", "inadvertent" }, { "incase", "in case" }, { "incedious", "insidious" }, { "incompleet", "incomplete" }, { "incomplot", "incomplete" }, { "inconvenant", "inconvenient" }, { "inconvience", "inconvenience" }, { "independant", "independent" }, { "independenent", "independent" }, { "indepnends", "independent" }, { "indepth", "in depth" }, { "indispensible", "indispensable" }, { "inefficite", "inefficient" }, { "inerface", 
"interface" }, { "infact", "in fact" }, { "influencial", "influential" }, { "inital", "initial" }, { "initinized", "initialized" }, { "initized", "initialized" }, { "innoculate", "inoculate" }, { "insistant", "insistent" }, { "insistenet", "insistent" }, { "instulation", "installation" }, { "intealignt", "intelligent" }, { "intejilent", "intelligent" }, { "intelegent", "intelligent" }, { "intelegnent", "intelligent" }, { "intelejent", "intelligent" }, { "inteligent", "intelligent" }, { "intelignt", "intelligent" }, { "intellagant", "intelligent" }, { "intellegent", "intelligent" }, { "intellegint", "intelligent" }, { "intellgnt", "intelligent" }, { "intensionality", "intensionally" }, { "interate", "iterate" }, { "internation", "international" }, { "interpretate", "interpret" }, { "interpretter", "interpreter" }, { "intertes", "interested" }, { "intertesd", "interested" }, { "invermeantial", "environmental" }, { "irregardless", "regardless" }, { "irresistable", "irresistible" }, { "irritible", "irritable" }, { "islams", "muslims" }, { "isotrop", "isotope" }, { "isreal", "israel" }, { "johhn", "john" }, { "judgement", "judgment" }, { "kippur", "kipper" }, { "knawing", "knowing" }, { "latext", "latest" }, { "leasve", "leave" }, { "lesure", "leisure" }, { "liasion", "lesion" }, { "liason", "liaison" }, { "libary", "library" }, { "likly", "likely" }, { "lilometer", "kilometer" }, { "liquify", "liquefy" }, { "lloyer", "layer" }, { "lossing", "losing" }, { "luser", "laser" }, { "maintanence", "maintenance" }, { "majaerly", "majority" }, { "majoraly", "majority" }, { "maks", "masks" }, { "mandelbrot", "Mandelbrot" }, { "mant", "want" }, { "marshall", "marshal" }, { "maxium", "maximum" }, { "meory", "memory" }, { "metter", "better" }, { "mic", "mike" }, { "midia", "media" }, { "millenium", "millennium" }, { "miniscule", "minuscule" }, { "minkay", "monkey" }, { "minum", "minimum" }, { "mischievious", "mischievous" }, { "misilous", "miscellaneous" }, { "momento", "memento" 
}, { "monkay", "monkey" }, { "mosaik", "mosaic" }, { "mostlikely", "most likely" }, { "mousr", "mouser" }, { "mroe", "more" }, { "neccessary", "necessary" }, { "necesary", "necessary" }, { "necesser", "necessary" }, { "neice", "niece" }, { "neighbour", "neighbor" }, { "nemonic", "pneumonic" }, { "nevade", "Nevada" }, { "nickleodeon", "nickelodeon" }, { "nieve", "naive" }, { "noone", "no one" }, { "noticably", "noticeably" }, { "notin", "not in" }, { "nozled", "nuzzled" }, { "objectsion", "objects" }, { "obsfuscate", "obfuscate" }, { "ocassion", "occasion" }, { "occuppied", "occupied" }, { "occurence", "occurrence" }, { "octagenarian", "octogenarian" }, { "olf", "old" }, { "opposim", "opossum" }, { "organise", "organize" }, { "organiz", "organize" }, { "orientate", "orient" }, { "oscilascope", "oscilloscope" }, { "oving", "moving" }, { "paramers", "parameters" }, { "parametic", "parameter" }, { "paranets", "parameters" }, { "partrucal", "particular" }, { "pataphysical", "metaphysical" }, { "patten", "pattern" }, { "permissable", "permissible" }, { "permition", "permission" }, { "permmasivie", "permissive" }, { "perogative", "prerogative" }, { "persue", "pursue" }, { "phantasia", "fantasia" }, { "phenominal", "phenomenal" }, { "picaresque", "picturesque" }, { "playwrite", "playwright" }, { "poeses", "poesies" }, { "polation", "politician" }, { "poligamy", "polygamy" }, { "politict", "politic" }, { "pollice", "police" }, { "polypropalene", "polypropylene" }, { "pompom", "pompon" }, { "possable", "possible" }, { "practicle", "practical" }, { "pragmaticism", "pragmatism" }, { "preceeding", "preceding" }, { "precion", "precision" }, { "precios", "precision" }, { "preemptory", "peremptory" }, { "prefices", "prefixes" }, { "prefixt", "prefixed" }, { "presbyterian", "Presbyterian" }, { "presue", "pursue" }, { "presued", "pursued" }, { "privielage", "privilege" }, { "priviledge", "privilege" }, { "proceedures", "procedures" }, { "pronensiation", "pronunciation" }, { 
"pronisation", "pronunciation" }, { "pronounciation", "pronunciation" }, { "properally", "properly" }, { "proplematic", "problematic" }, { "protray", "portray" }, { "pscolgst", "psychologist" }, { "psicolagest", "psychologist" }, { "psycolagest", "psychologist" }, { "quoz", "quiz" }, { "radious", "radius" }, { "ramplily", "rampantly" }, { "reccomend", "recommend" }, { "reccona", "raccoon" }, { "recieve", "receive" }, { "reconise", "recognize" }, { "rectangeles", "rectangle" }, { "redign", "redesign" }, { "reoccurring", "recurring" }, { "repitition", "repetition" }, { "replasments", "replacement" }, { "reposable", "responsible" }, { "reseblence", "resemblance" }, { "respct", "respect" }, { "respecally", "respectfully" }, { "roon", "room" }, { "rought", "roughly" }, { "rsx", "RSX" }, { "rudemtry", "rudimentary" }, { "runnung", "running" }, { "sacreligious", "sacrilegious" }, { "saftly", "safely" }, { "salut", "salute" }, { "satifly", "satisfy" }, { "scrabdle", "scrabble" }, { "searcheable", "searchable" }, { "secion", "section" }, { "seferal", "several" }, { "segements", "segments" }, { "sence", "sense" }, { "seperate", "separate" }, { "sherbert", "sherbet" }, { "sicolagest", "psychologist" }, { "sieze", "seize" }, { "simpfilty", "simplicity" }, { "simplye", "simply" }, { "singal", "signal" }, { "sitte", "site" }, { "situration", "situation" }, { "slyph", "sylph" }, { "smil", "smile" }, { "snuck", "sneaked" }, { "sometmes", "sometimes" }, { "soonec", "sonic" }, { "specificialy", "specifically" }, { "spel", "spell" }, { "spoak", "spoke" }, { "sponsered", "sponsored" }, { "stering", "steering" }, { "straightjacket", "straitjacket" }, { "stumach", "stomach" }, { "stutent", "student" }, { "styleguide", "style guide" }, { "subisitions", "substitutions" }, { "subjecribed", "subscribed" }, { "subpena", "subpoena" }, { "substations", "substitutions" }, { "suger", "sugar" }, { "supercede", "supersede" }, { "superfulous", "superfluous" }, { "susan", "Susan" }, { "swimwear", 
"swim wear" }, { "syncorization", "synchronization" }, { "taff", "tough" }, { "taht", "that" }, { "tattos", "tattoos" }, { "techniquely", "technically" }, { "teh", "the" }, { "tem", "team" }, { "teo", "two" }, { "teridical", "theoretical" }, { "tesst", "test" }, { "tets", "tests" }, { "thanot", "than or" }, { "theirselves", "themselves" }, { "theridically", "theoretical" }, { "thredically", "theoretically" }, { "thruout", "throughout" }, { "ths", "this" }, { "titalate", "titillate" }, { "tobagan", "tobaggon" }, { "tommorrow", "tomorrow" }, { "tomorow", "tomorrow" }, { "tradegy", "tragedy" }, { "trubbel", "trouble" }, { "ttest", "test" }, { "tunnellike", "tunnel like" }, { "tured", "turned" }, { "tyrrany", "tyranny" }, { "unatourral", "unnatural" }, { "unaturral", "unnatural" }, { "unconisitional", "unconstitutional" }, { "unconscience", "unconscious" }, { "underladder", "under ladder" }, { "unentelegible", "unintelligible" }, { "unfortunently", "unfortunately" }, { "unnaturral", "unnatural" }, { "upcast", "up cast" }, { "upmost", "utmost" }, { "uranisium", "uranium" }, { "verison", "version" }, { "vinagarette", "vinaigrette" }, { "volumptuous", "voluptuous" }, { "volunteerism", "voluntarism" }, { "volye", "volley" }, { "wadting", "wasting" }, { "waite", "wait" }, { "wan't", "won't" }, { "warloord", "warlord" }, { "whaaat", "what" }, { "whard", "ward" }, { "whimp", "wimp" }, { "wicken", "weaken" }, { "wierd", "weird" }, { "wrank", "rank" }, { "writeen", "righten" }, { "writting", "writing" }, { "wundeews", "windows" }, { "yeild", "yield" }, { "youe", "your" } }; /** * A subset of FIXTURE generated by this test. 
*/ private static final String[][] MATCHES = { { "Accosinly", "Occasionally" }, { "Maddness", "Madness" }, { "Occusionaly", "Occasionally" }, { "Steffen", "Stephen" }, { "Thw", "The" }, { "Unformanlly", "Unfortunately" }, { "Unfortally", "Unfortunately" }, { "abilitey", "ability" }, { "absorbtion", "absorption" }, { "accidently", "accidentally" }, { "accomodate", "accommodate" }, { "acommadate", "accommodate" }, { "acord", "accord" }, { "adultry", "adultery" }, { "aggresive", "aggressive" }, { "alchohol", "alcohol" }, { "alchoholic", "alcoholic" }, { "allieve", "alive" }, { "alot", "a lot" }, { "alright", "all right" }, { "amature", "amateur" }, { "ambivilant", "ambivalent" }, { "amourfous", "amorphous" }, { "annoint", "anoint" }, { "annonsment", "announcement" }, { "annoyting", "anting" }, { "annuncio", "announce" }, { "anotomy", "anatomy" }, { "antidesestablishmentarianism", "antidisestablishmentarianism" }, { "antidisestablishmentarism", "antidisestablishmentarianism" }, { "anynomous", "anonymous" }, { "appelet", "applet" }, { "appreceiated", "appreciated" }, { "appresteate", "appreciate" }, { "aquantance", "acquaintance" }, { "aricticure", "architecture" }, { "asterick", "asterisk" }, { "asymetric", "asymmetric" }, { "atentively", "attentively" }, { "bankrot", "bankrupt" }, { "basicly", "basically" }, { "batallion", "battalion" }, { "bbrose", "browse" }, { "beauro", "bureau" }, { "beaurocracy", "bureaucracy" }, { "beggining", "beginning" }, { "behaviour", "behavior" }, { "beleive", "believe" }, { "belive", "believe" }, { "blait", "bleat" }, { "bouyant", "buoyant" }, { "boygot", "boycott" }, { "brocolli", "broccoli" }, { "buder", "butter" }, { "budr", "butter" }, { "budter", "butter" }, { "buracracy", "bureaucracy" }, { "burracracy", "bureaucracy" }, { "buton", "button" }, { "byby", "by by" }, { "cauler", "caller" }, { "ceasar", "caesar" }, { "cemetary", "cemetery" }, { "changeing", "changing" }, { "cheet", "cheat" }, { "cimplicity", "simplicity" }, { 
"circumstaces", "circumstances" }, { "clob", "club" }, { "coaln", "colon" }, { "colleaque", "colleague" }, { "colloquilism", "colloquialism" }, { "columne", "column" }, { "comitmment", "commitment" }, { "comitte", "committee" }, { "comittmen", "commitment" }, { "comittmend", "commitment" }, { "commerciasl", "commercials" }, { "commited", "committed" }, { "commitee", "committee" }, { "companys", "companies" }, { "comupter", "computer" }, { "concensus", "consensus" }, { "confusionism", "confucianism" }, { "congradulations", "congratulations" }, { "contunie", "continue" }, { "cooly", "coolly" }, { "copping", "coping" }, { "cosmoplyton", "cosmopolitan" }, { "crasy", "crazy" }, { "croke", "croak" }, { "crucifiction", "crucifixion" }, { "crusifed", "crucified" }, { "cumba", "combo" }, { "custamisation", "customization" }, { "dag", "dog" }, { "daly", "daily" }, { "defence", "defense" }, { "definate", "definite" }, { "definately", "definitely" }, { "dependeble", "dependable" }, { "descrption", "description" }, { "descrptn", "description" }, { "desparate", "desperate" }, { "dessicate", "desiccate" }, { "destint", "distant" }, { "develepment", "developments" }, { "developement", "development" }, { "develpond", "development" }, { "devulge", "divulge" }, { "dieties", "deities" }, { "dinasaur", "dinosaur" }, { "dinasour", "dinosaur" }, { "discuess", "discuss" }, { "disect", "dissect" }, { "disippate", "dissipate" }, { "disition", "decision" }, { "dispair", "despair" }, { "distarct", "distract" }, { "distart", "distort" }, { "distroy", "destroy" }, { "doenload", "download" }, { "dongle", "dangle" }, { "doog", "dog" }, { "dramaticly", "dramatically" }, { "drunkeness", "drunkenness" }, { "ductioneery", "dictionary" }, { "ecstacy", "ecstasy" }, { "egsistence", "existence" }, { "eitiology", "etiology" }, { "elagent", "elegant" }, { "embarass", "embarrass" }, { "embarassment", "embarrassment" }, { "embaress", "embarrass" }, { "encapsualtion", "encapsulation" }, { "encyclapidia", 
"encyclopedia" }, { "encyclopia", "encyclopedia" }, { "engins", "engine" }, { "enhence", "enhance" }, { "ennuui", "ennui" }, { "enventions", "inventions" }, { "envireminakl", "environmental" }, { "enviroment", "environment" }, { "epitomy", "epitome" }, { "equire", "acquire" }, { "errara", "error" }, { "evaualtion", "evaluation" }, { "excede", "exceed" }, { "excercise", "exercise" }, { "excpt", "except" }, { "exhileration", "exhilaration" }, { "existance", "existence" }, { "expleyly", "explicitly" }, { "explity", "explicitly" }, { "failer", "failure" }, { "faver", "favor" }, { "faxe", "fax" }, { "firey", "fiery" }, { "fistival", "festival" }, { "flatterring", "flattering" }, { "flukse", "flux" }, { "fone", "phone" }, { "forsee", "foresee" }, { "frustartaion", "frustrating" }, { "funetik", "phonetic" }, { "gaurd", "guard" }, { "generly", "generally" }, { "ghandi", "gandhi" }, { "gotton", "gotten" }, { "gracefull", "graceful" }, { "gradualy", "gradually" }, { "grammer", "grammar" }, { "hallo", "hello" }, { "hapily", "happily" }, { "harrass", "harass" }, { "heellp", "help" }, { "heighth", "height" }, { "hellp", "help" }, { "helo", "hello" }, { "hifin", "hyphen" }, { "hifine", "hyphen" }, { "hiphine", "hyphen" }, { "hippie", "hippy" }, { "hippopotamous", "hippopotamus" }, { "hourse", "horse" }, { "houssing", "housing" }, { "howaver", "however" }, { "howver", "however" }, { "humaniti", "humanity" }, { "hyfin", "hyphen" }, { "hystrical", "hysterical" }, { "illegitament", "illegitimate" }, { "imbed", "embed" }, { "imediaetly", "immediately" }, { "immenant", "immanent" }, { "implemtes", "implements" }, { "inadvertant", "inadvertent" }, { "incase", "in case" }, { "incedious", "insidious" }, { "incompleet", "incomplete" }, { "incomplot", "incomplete" }, { "inconvenant", "inconvenient" }, { "inconvience", "inconvenience" }, { "independant", "independent" }, { "independenent", "independent" }, { "indepnends", "independent" }, { "indepth", "in depth" }, { "indispensible", 
"indispensable" }, { "inefficite", "inefficient" }, { "infact", "in fact" }, { "influencial", "influential" }, { "innoculate", "inoculate" }, { "insistant", "insistent" }, { "insistenet", "insistent" }, { "instulation", "installation" }, { "intealignt", "intelligent" }, { "intelegent", "intelligent" }, { "intelegnent", "intelligent" }, { "intelejent", "intelligent" }, { "inteligent", "intelligent" }, { "intelignt", "intelligent" }, { "intellagant", "intelligent" }, { "intellegent", "intelligent" }, { "intellegint", "intelligent" }, { "intellgnt", "intelligent" }, { "intensionality", "intensionally" }, { "internation", "international" }, { "interpretate", "interpret" }, { "interpretter", "interpreter" }, { "intertes", "interested" }, { "intertesd", "interested" }, { "invermeantial", "environmental" }, { "irresistable", "irresistible" }, { "irritible", "irritable" }, { "isreal", "israel" }, { "johhn", "john" }, { "kippur", "kipper" }, { "knawing", "knowing" }, { "lesure", "leisure" }, { "liasion", "lesion" }, { "liason", "liaison" }, { "likly", "likely" }, { "liquify", "liquefy" }, { "lloyer", "layer" }, { "lossing", "losing" }, { "luser", "laser" }, { "maintanence", "maintenance" }, { "mandelbrot", "Mandelbrot" }, { "marshall", "marshal" }, { "maxium", "maximum" }, { "mic", "mike" }, { "midia", "media" }, { "millenium", "millennium" }, { "miniscule", "minuscule" }, { "minkay", "monkey" }, { "mischievious", "mischievous" }, { "momento", "memento" }, { "monkay", "monkey" }, { "mosaik", "mosaic" }, { "mostlikely", "most likely" }, { "mousr", "mouser" }, { "mroe", "more" }, { "necesary", "necessary" }, { "necesser", "necessary" }, { "neice", "niece" }, { "neighbour", "neighbor" }, { "nemonic", "pneumonic" }, { "nevade", "Nevada" }, { "nickleodeon", "nickelodeon" }, { "nieve", "naive" }, { "noone", "no one" }, { "notin", "not in" }, { "nozled", "nuzzled" }, { "objectsion", "objects" }, { "ocassion", "occasion" }, { "occuppied", "occupied" }, { "occurence", "occurrence" 
}, { "octagenarian", "octogenarian" }, { "opposim", "opossum" }, { "organise", "organize" }, { "organiz", "organize" }, { "orientate", "orient" }, { "oscilascope", "oscilloscope" }, { "parametic", "parameter" }, { "permissable", "permissible" }, { "permmasivie", "permissive" }, { "persue", "pursue" }, { "phantasia", "fantasia" }, { "phenominal", "phenomenal" }, { "playwrite", "playwright" }, { "poeses", "poesies" }, { "poligamy", "polygamy" }, { "politict", "politic" }, { "pollice", "police" }, { "polypropalene", "polypropylene" }, { "possable", "possible" }, { "practicle", "practical" }, { "pragmaticism", "pragmatism" }, { "preceeding", "preceding" }, { "precios", "precision" }, { "preemptory", "peremptory" }, { "prefixt", "prefixed" }, { "presbyterian", "Presbyterian" }, { "presue", "pursue" }, { "presued", "pursued" }, { "privielage", "privilege" }, { "priviledge", "privilege" }, { "proceedures", "procedures" }, { "pronensiation", "pronunciation" }, { "pronounciation", "pronunciation" }, { "properally", "properly" }, { "proplematic", "problematic" }, { "protray", "portray" }, { "pscolgst", "psychologist" }, { "psicolagest", "psychologist" }, { "psycolagest", "psychologist" }, { "quoz", "quiz" }, { "radious", "radius" }, { "reccomend", "recommend" }, { "reccona", "raccoon" }, { "recieve", "receive" }, { "reconise", "recognize" }, { "rectangeles", "rectangle" }, { "reoccurring", "recurring" }, { "repitition", "repetition" }, { "replasments", "replacement" }, { "respct", "respect" }, { "respecally", "respectfully" }, { "rsx", "RSX" }, { "runnung", "running" }, { "sacreligious", "sacrilegious" }, { "salut", "salute" }, { "searcheable", "searchable" }, { "seferal", "several" }, { "segements", "segments" }, { "sence", "sense" }, { "seperate", "separate" }, { "sicolagest", "psychologist" }, { "sieze", "seize" }, { "simplye", "simply" }, { "sitte", "site" }, { "slyph", "sylph" }, { "smil", "smile" }, { "sometmes", "sometimes" }, { "soonec", "sonic" }, { "specificialy", 
"specifically" }, { "spel", "spell" }, { "spoak", "spoke" }, { "sponsered", "sponsored" }, { "stering", "steering" }, { "straightjacket", "straitjacket" }, { "stumach", "stomach" }, { "stutent", "student" }, { "styleguide", "style guide" }, { "subpena", "subpoena" }, { "substations", "substitutions" }, { "supercede", "supersede" }, { "superfulous", "superfluous" }, { "susan", "Susan" }, { "swimwear", "swim wear" }, { "syncorization", "synchronization" }, { "taff", "tough" }, { "taht", "that" }, { "tattos", "tattoos" }, { "techniquely", "technically" }, { "teh", "the" }, { "tem", "team" }, { "teo", "two" }, { "teridical", "theoretical" }, { "tesst", "test" }, { "theridically", "theoretical" }, { "thredically", "theoretically" }, { "thruout", "throughout" }, { "ths", "this" }, { "titalate", "titillate" }, { "tobagan", "tobaggon" }, { "tommorrow", "tomorrow" }, { "tomorow", "tomorrow" }, { "trubbel", "trouble" }, { "ttest", "test" }, { "tyrrany", "tyranny" }, { "unatourral", "unnatural" }, { "unaturral", "unnatural" }, { "unconisitional", "unconstitutional" }, { "unconscience", "unconscious" }, { "underladder", "under ladder" }, { "unentelegible", "unintelligible" }, { "unfortunently", "unfortunately" }, { "unnaturral", "unnatural" }, { "upcast", "up cast" }, { "verison", "version" }, { "vinagarette", "vinaigrette" }, { "volunteerism", "voluntarism" }, { "volye", "volley" }, { "waite", "wait" }, { "wan't", "won't" }, { "warloord", "warlord" }, { "whaaat", "what" }, { "whard", "ward" }, { "whimp", "wimp" }, { "wicken", "weaken" }, { "wierd", "weird" }, { "wrank", "rank" }, { "writeen", "righten" }, { "writting", "writing" }, { "wundeews", "windows" }, { "yeild", "yield" }, }; /** * Tests encoding APIs in one place. 
*/ private void assertDoubleMetaphone(final String expected, final String source) { assertEquals(expected, this.getStringEncoder().encode(source)); try { assertEquals(expected, this.getStringEncoder().encode((Object) source)); } catch (final EncoderException e) { fail("Unexpected expection: " + e); } assertEquals(expected, this.getStringEncoder().doubleMetaphone(source)); assertEquals(expected, this.getStringEncoder().doubleMetaphone(source, false)); } /** * Tests encoding APIs in one place. */ public void assertDoubleMetaphoneAlt(final String expected, final String source) { assertEquals(expected, this.getStringEncoder().doubleMetaphone(source, true)); } public void doubleMetaphoneEqualTest(final String[][] pairs, final boolean useAlternate) { this.validateFixture(pairs); for (final String[] pair : pairs) { final String name0 = pair[0]; final String name1 = pair[1]; final String failMsg = "Expected match between " + name0 + " and " + name1 + " (use alternate: " + useAlternate + ")"; assertTrue(failMsg, this.getStringEncoder().isDoubleMetaphoneEqual(name0, name1, useAlternate)); assertTrue(failMsg, this.getStringEncoder().isDoubleMetaphoneEqual(name1, name0, useAlternate)); if (!useAlternate) { assertTrue(failMsg, this.getStringEncoder().isDoubleMetaphoneEqual(name0, name1)); assertTrue(failMsg, this.getStringEncoder().isDoubleMetaphoneEqual(name1, name0)); } } } public void doubleMetaphoneNotEqualTest(final boolean alternate) { assertFalse(this.getStringEncoder().isDoubleMetaphoneEqual("Brain", "Band", alternate)); assertFalse(this.getStringEncoder().isDoubleMetaphoneEqual("Band", "Brain", alternate)); if (!alternate) { assertFalse(this.getStringEncoder().isDoubleMetaphoneEqual("Brain", "Band")); assertFalse(this.getStringEncoder().isDoubleMetaphoneEqual("Band", "Brain")); } } @Override protected DoubleMetaphone createStringEncoder() { return new DoubleMetaphone(); } @Test public void testDoubleMetaphone() { assertDoubleMetaphone("TSTN", "testing"); 
assertDoubleMetaphone("0", "The"); assertDoubleMetaphone("KK", "quick"); assertDoubleMetaphone("PRN", "brown"); assertDoubleMetaphone("FKS", "fox"); assertDoubleMetaphone("JMPT", "jumped"); assertDoubleMetaphone("AFR", "over"); assertDoubleMetaphone("0", "the"); assertDoubleMetaphone("LS", "lazy"); assertDoubleMetaphone("TKS", "dogs"); assertDoubleMetaphone("MKFR", "MacCafferey"); assertDoubleMetaphone("STFN", "Stephan"); assertDoubleMetaphone("KSSK", "Kuczewski"); assertDoubleMetaphone("MKLL", "McClelland"); assertDoubleMetaphone("SNHS", "san jose"); assertDoubleMetaphone("SNFP", "xenophobia"); assertDoubleMetaphoneAlt("TSTN", "testing"); assertDoubleMetaphoneAlt("T", "The"); assertDoubleMetaphoneAlt("KK", "quick"); assertDoubleMetaphoneAlt("PRN", "brown"); assertDoubleMetaphoneAlt("FKS", "fox"); assertDoubleMetaphoneAlt("AMPT", "jumped"); assertDoubleMetaphoneAlt("AFR", "over"); assertDoubleMetaphoneAlt("T", "the"); assertDoubleMetaphoneAlt("LS", "lazy"); assertDoubleMetaphoneAlt("TKS", "dogs"); assertDoubleMetaphoneAlt("MKFR", "MacCafferey"); assertDoubleMetaphoneAlt("STFN", "Stephan"); assertDoubleMetaphoneAlt("KXFS", "Kutchefski"); assertDoubleMetaphoneAlt("MKLL", "McClelland"); assertDoubleMetaphoneAlt("SNHS", "san jose"); assertDoubleMetaphoneAlt("SNFP", "xenophobia"); assertDoubleMetaphoneAlt("FKR", "Fokker"); assertDoubleMetaphoneAlt("AK", "Joqqi"); assertDoubleMetaphoneAlt("HF", "Hovvi"); assertDoubleMetaphoneAlt("XRN", "Czerny"); } @Test public void testEmpty() { assertEquals(null, this.getStringEncoder().doubleMetaphone(null)); assertEquals(null, this.getStringEncoder().doubleMetaphone("")); assertEquals(null, this.getStringEncoder().doubleMetaphone(" ")); assertEquals(null, this.getStringEncoder().doubleMetaphone("\t\n\r ")); } /** * Test setting maximum length */ @Test public void testSetMaxCodeLength() { final String value = "jumped"; final DoubleMetaphone doubleMetaphone = new DoubleMetaphone(); // Sanity check of default settings 
assertEquals("Default Max Code Length", 4, doubleMetaphone.getMaxCodeLen()); assertEquals("Default Primary", "JMPT", doubleMetaphone.doubleMetaphone(value, false)); assertEquals("Default Alternate", "AMPT", doubleMetaphone.doubleMetaphone(value, true)); // Check setting Max Code Length doubleMetaphone.setMaxCodeLen(3); assertEquals("Set Max Code Length", 3, doubleMetaphone.getMaxCodeLen()); assertEquals("Max=3 Primary", "JMP", doubleMetaphone.doubleMetaphone(value, false)); assertEquals("Max=3 Alternate", "AMP", doubleMetaphone.doubleMetaphone(value, true)); } @Test public void testIsDoubleMetaphoneEqualBasic() { final String[][] testFixture = new String[][] { { "Case", "case" }, { "CASE", "Case" }, { "caSe", "cAsE" }, { "cookie", "quick" }, { "quick", "cookie" }, { "Brian", "Bryan" }, { "Auto", "Otto" }, { "Steven", "Stefan" }, { "Philipowitz", "Filipowicz" } }; doubleMetaphoneEqualTest(testFixture, false); doubleMetaphoneEqualTest(testFixture, true); } /** * Example in the original article but failures in this Java impl: */ @Test public void testIsDoubleMetaphoneEqualExtended1() { // String[][] testFixture = new String[][] { { "Smith", "Schmidt" } // }; // doubleMetaphoneEqualTest(testFixture, false); // doubleMetaphoneEqualTest(testFixture, true); } @Test public void testIsDoubleMetaphoneEqualExtended2() { final String[][] testFixture = new String[][] { { "Jablonski", "Yablonsky" } }; //doubleMetaphoneEqualTest(testFixture, false); doubleMetaphoneEqualTest(testFixture, true); } /** * Used to generate the MATCHES array and test possible matches from the * FIXTURE array. 
*/
    @Test
    public void testIsDoubleMetaphoneEqualExtended3() {
        this.validateFixture(FIXTURE);
        final DoubleMetaphone encoder = this.getStringEncoder();
        final StringBuilder failures = new StringBuilder();
        final StringBuilder matches = new StringBuilder();
        final String cr = System.getProperty("line.separator");
        matches.append("private static final String[][] MATCHES = {" + cr);
        int failCount = 0;
        for (int i = 0; i < FIXTURE.length; i++) {
            final String name0 = FIXTURE[i][0];
            final String name1 = FIXTURE[i][1];
            final boolean match1 = encoder.isDoubleMetaphoneEqual(name0, name1, false);
            final boolean match2 = encoder.isDoubleMetaphoneEqual(name0, name1, true);
            if (!match1 && !match2) {
                // Neither the primary nor the alternate encoding matched.
                final String failMsg = "[" + i + "] " + name0 + " and " + name1 + cr;
                failures.append(failMsg);
                failCount++;
            } else {
                // Emit the pair as a Java array-initializer row.
                matches.append("{\"" + name0 + "\", \"" + name1 + "\"}," + cr);
            }
        }
        matches.append("};");
        // Turn on to print a new MATCH array
        //System.out.println(matches.toString());
        if (failCount > 0) {
            // Turn on to see which pairs do NOT match.
            // String msg = failures.toString();
            //fail(failCount + " failures out of " + FIXTURE.length
            //        + ". The following could be made to match: " + cr + msg);
        }
    }

    /**
     * Checks that every pair in the MATCHES array is Double Metaphone equal
     * with the primary encoding, the alternate encoding, or both.
     */
    @Test
    public void testIsDoubleMetaphoneEqualWithMATCHES() {
        this.validateFixture(MATCHES);
        final DoubleMetaphone encoder = this.getStringEncoder();
        for (int i = 0; i < MATCHES.length; i++) {
            final String name0 = MATCHES[i][0];
            final String name1 = MATCHES[i][1];
            final boolean match1 = encoder.isDoubleMetaphoneEqual(name0, name1, false);
            final boolean match2 = encoder.isDoubleMetaphoneEqual(name0, name1, true);
            if (!match1 && !match2) {
                fail("Expected match [" + i + "] " + name0 + " and " + name1);
            }
        }
    }

    /**
     * Runs the not-equal check with the primary and then the alternate encoding.
     */
    @Test
    public void testIsDoubleMetaphoneNotEqual() {
        doubleMetaphoneNotEqualTest(false);
        doubleMetaphoneNotEqualTest(true);
    }

    /**
     * Checks that a lower-case c-cedilla is Double Metaphone equal to "S".
     */
    @Test
    public void testCCedilla() {
        assertTrue(this.getStringEncoder().isDoubleMetaphoneEqual("\u00e7", "S")); // c-cedilla
    }

    /**
     * Checks that a lower-case n-tilde is Double Metaphone equal to "N".
     */
    @Test
    public void testNTilde() {
        assertTrue(this.getStringEncoder().isDoubleMetaphoneEqual("\u00f1", "N")); // n-tilde
    }

    /**
     * Fails the running test if the fixture is empty or if any row does not
     * contain exactly two entries.
     *
     * @param pairs the fixture rows to validate
     */
    public void validateFixture(final String[][] pairs) {
        if (pairs.length == 0) {
            fail("Test fixture is empty");
        }
        for (int i = 0; i < pairs.length; i++) {
            if (pairs[i].length != 2) {
                fail("Error in test fixture in the data array at index " + i);
            }
        }
    }
}