/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.commons.codec.language.bm; import org.apache.commons.codec.EncoderException; import org.apache.commons.codec.StringEncoder; /** * Encodes strings into their Beider-Morse phonetic encoding. * <p> * Beider-Morse phonetic encodings are optimised for family names. However, they * may be useful for a wide range of words. * <p> * This encoder is intentionally mutable to allow dynamic configuration through * bean properties. As such, it is mutable, and may not be thread-safe. If you * require a guaranteed thread-safe encoding then use {@link PhoneticEngine} * directly. * <p> * <b>Encoding overview</b> * <p> * Beider-Morse phonetic encodings is a multi-step process. Firstly, a table of * rules is consulted to guess what language the word comes from. For example, * if it ends in "<code>ault</code>" then it infers that the word is French. * Next, the word is translated into a phonetic representation using a * language-specific phonetics table. Some runs of letters can be pronounced in * multiple ways, and a single run of letters may be potentially broken up into * phonemes at different places, so this stage results in a set of possible * language-specific phonetic representations. 
Lastly, this language-specific phonetic representation is processed by a
 * table of rules that re-writes it phonetically, taking into account
 * systematic pronunciation differences between languages, to move it towards
 * a pan-Indo-European phonetic representation. Again, sometimes there are
 * multiple ways this could be done and sometimes things that can be
 * pronounced in several ways in the source language have only one way to
 * represent them in this average phonetic language, so the result is again a
 * set of phonetic spellings.
 * <p>
 * Some names are treated as having multiple parts. This can be due to two
 * things. Firstly, they may be hyphenated. In this case, each individual
 * hyphenated word is encoded, and then these are combined end-to-end for the
 * final encoding. Secondly, some names have standard prefixes, for example,
 * "<code>Mac/Mc</code>" in Scottish (English) names. As sometimes it is
 * ambiguous whether the prefix is intended or is an accident of the spelling,
 * the word is encoded once with the prefix and once without it. The resulting
 * encoding contains one and then the other result.
 * <p>
 * <b>Encoding format</b>
 * <p>
 * Individual phonetic spellings of an input word are represented in upper- and
 * lower-case Roman characters. Where there are multiple possible phonetic
 * representations, these are joined with a pipe (<code>|</code>) character. If
 * multiple hyphenated words were found, or if the word may contain a name
 * prefix, each encoded word is placed in parentheses and these blocks are then
 * joined with hyphens. For example, "<code>d'ortley</code>" has a possible
 * prefix. The form without prefix encodes to "<code>ortlaj|ortlej</code>",
 * while the form with prefix encodes to "<code>dortlaj|dortlej</code>".
 * Thus, the full, combined encoding is
 * "{@code (ortlaj|ortlej)-(dortlaj|dortlej)}".
 * <p>
 * The encoded forms are often quite a bit longer than the input strings.
This is because a single input may have many potential phonetic interpretations.
 * For example, "<code>Renault</code>" encodes to
 * "<code>rYnDlt|rYnalt|rYnult|rinDlt|rinalt|rinult</code>". The
 * <code>APPROX</code> rules will tend to produce larger encodings as they
 * consider a wider range of possible, approximate phonetic interpretations of
 * the original word. Down-stream applications may wish to further process the
 * encoding for indexing or lookup purposes, for example, by splitting on pipe
 * (<code>|</code>) and indexing under each of these alternatives.
 *
 * @since 1.6
 * @version $Id: BeiderMorseEncoder.java 1378746 2012-08-29 21:29:49Z tn $
 */
public class BeiderMorseEncoder implements StringEncoder {

    // Implementation note: this class is a spring-friendly facade to
    // PhoneticEngine. It offers read/write bean properties on top of an
    // immutable PhoneticEngine instance; every setter swaps in a freshly
    // built engine, and all encoding work is delegated to whichever engine
    // is current.

    /** The engine currently delegated to; replaced as a whole by each setter. */
    private PhoneticEngine engine = new PhoneticEngine(NameType.GENERIC, RuleType.APPROX, true);

    /**
     * Encodes an object, which must be a {@link String}, by delegating to
     * {@link #encode(String)}.
     *
     * @param source
     *            the object to encode; must be a String
     * @return the value returned by {@link #encode(String)} for the input
     * @throws EncoderException
     *             if {@code source} is not a String (a {@code null} reference
     *             is not a String and is rejected as well)
     */
    @Override
    public Object encode(Object source) throws EncoderException {
        if (source instanceof String) {
            return encode((String) source);
        }
        throw new EncoderException(
                "BeiderMorseEncoder encode parameter is not of type String");
    }

    /**
     * Encodes a string using the currently configured engine.
     *
     * @param source
     *            the string to encode, may be {@code null}
     * @return the engine's encoding of {@code source}, or {@code null} if
     *         {@code source} is {@code null}
     * @throws EncoderException
     *             declared by the {@link StringEncoder} contract
     */
    @Override
    public String encode(String source) throws EncoderException {
        return source == null ? null : engine.encode(source);
    }

    /**
     * Reports the name type the current engine was built with.
     *
     * @return the NameType currently being used
     */
    public NameType getNameType() {
        return engine.getNameType();
    }

    /**
     * Reports the rule type the current engine was built with.
     *
     * @return the RuleType currently being used
     */
    public RuleType getRuleType() {
        return engine.getRuleType();
    }

    /**
     * Reports whether multiple possible encodings are concatenated.
     *
     * @return true if multiple encodings are concatenated, false if just the
     *         first one is returned
     */
    public boolean isConcat() {
        return engine.isConcat();
    }

    /**
     * Chooses how multiple possible phonetic encodings are combined. The
     * engine is rebuilt with the new flag; name type, rule type and maximum
     * phoneme count are carried over from the engine being replaced.
     *
     * @param concat
     *            true if multiple encodings are to be combined with a '|',
     *            false if just the first one is to be considered
     */
    public void setConcat(boolean concat) {
        final PhoneticEngine current = engine;
        engine = new PhoneticEngine(current.getNameType(),
                                    current.getRuleType(),
                                    concat,
                                    current.getMaxPhonemes());
    }

    /**
     * Chooses the type of name to encode. Use {@link NameType#GENERIC} unless
     * you specifically want phonetic encodings optimized for Ashkenazi or
     * Sephardic Jewish family names. The engine is rebuilt with the new name
     * type; the remaining settings are carried over from the engine being
     * replaced.
     *
     * @param nameType
     *            the NameType in use
     */
    public void setNameType(NameType nameType) {
        final PhoneticEngine current = engine;
        engine = new PhoneticEngine(nameType,
                                    current.getRuleType(),
                                    current.isConcat(),
                                    current.getMaxPhonemes());
    }

    /**
     * Chooses the rule type to apply. This will widen or narrow the range of
     * phonetic encodings considered. The engine is rebuilt with the new rule
     * type; the remaining settings are carried over from the engine being
     * replaced.
     *
     * @param ruleType
     *            {@link RuleType#APPROX} or {@link RuleType#EXACT} for
     *            approximate or exact phonetic matches
     */
    public void setRuleType(RuleType ruleType) {
        final PhoneticEngine current = engine;
        engine = new PhoneticEngine(current.getNameType(),
                                    ruleType,
                                    current.isConcat(),
                                    current.getMaxPhonemes());
    }

    /**
     * Sets the maximum number of phonemes that shall be considered by the
     * engine. The engine is rebuilt with the new limit; the remaining
     * settings are carried over from the engine being replaced.
     *
     * @param maxPhonemes
     *            the maximum number of phonemes returned by the engine
     * @since 1.7
     */
    public void setMaxPhonemes(int maxPhonemes) {
        final PhoneticEngine current = engine;
        engine = new PhoneticEngine(current.getNameType(),
                                    current.getRuleType(),
                                    current.isConcat(),
                                    maxPhonemes);
    }
}