BeiderMorseEncoder.java example

Explorer
Area-master
- HorizontialListView.java
- src
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.codec.language.bm;

import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.StringEncoder;

/**
 * Encodes strings into their Beider-Morse phonetic encoding.
 * <p>
 * Beider-Morse phonetic encodings are optimised for family names. However, they
 * may be useful for a wide range of words.
 * <p>
 * This encoder is intentionally mutable to allow dynamic configuration through
 * bean properties. As a consequence, it may not be thread-safe. If you require
 * a guaranteed thread-safe encoding then use {@link PhoneticEngine} directly.
 * <p>
 * <b>Encoding overview</b>
 * <p>
 * Beider-Morse phonetic encoding is a multi-step process. Firstly, a table of
 * rules is consulted to guess what language the word comes from. For example,
 * if it ends in "<code>ault</code>" then it infers that the word is French.
 * Next, the word is translated into a phonetic representation using a
 * language-specific phonetics table. Some runs of letters can be pronounced in
 * multiple ways, and a single run of letters may be potentially broken up into
 * phonemes at different places, so this stage results in a set of possible
 * language-specific phonetic representations. Lastly, this language-specific
 * phonetic representation is processed by a table of rules that re-writes it
 * phonetically taking into account systematic pronunciation differences between
 * languages, to move it towards a pan-indo-european phonetic representation.
 * Again, sometimes there are multiple ways this could be done and sometimes
 * things that can be pronounced in several ways in the source language have
 * only one way to represent them in this average phonetic language, so the
 * result is again a set of phonetic spellings.
 * <p>
 * Some names are treated as having multiple parts. This can be due to two
 * things. Firstly, they may be hyphenated. In this case, each individual
 * hyphenated word is encoded, and then these are combined end-to-end for the
 * final encoding. Secondly, some names have standard prefixes, for example, "
 * <code>Mac/Mc</code>" in Scottish (English) names. As sometimes it is
 * ambiguous whether the prefix is intended or is an accident of the spelling,
 * the word is encoded once with the prefix and once without it. The resulting
 * encoding contains one and then the other result.
 * <p>
 * <b>Encoding format</b>
 * <p>
 * Individual phonetic spellings of an input word are represented in upper- and
 * lower-case roman characters. Where there are multiple possible phonetic
 * representations, these are joined with a pipe (<code>|</code>) character. If
 * multiple hyphenated words were found, or if the word may contain a name
 * prefix, each encoded word is placed in parentheses and these blocks are then
 * joined with hyphens. For example, "<code>d'ortley</code>" has a possible
 * prefix. The form without prefix encodes to "<code>ortlaj|ortlej</code>",
 * while the form with prefix encodes to "<code>dortlaj|dortlej</code>
 * ". Thus, the full, combined encoding is "
 * {@code (ortlaj|ortlej)-(dortlaj|dortlej)}".
 * <p>
 * The encoded forms are often quite a bit longer than the input strings. This
 * is because a single input may have many potential phonetic interpretations.
 * For example, "<code>Renault</code>" encodes to "
 * <code>rYnDlt|rYnalt|rYnult|rinDlt|rinalt|rinult</code>". The
 * <code>APPROX</code> rules will tend to produce larger encodings as they
 * consider a wider range of possible, approximate phonetic interpretations of
 * the original word. Down-stream applications may wish to further process the
 * encoding for indexing or lookup purposes, for example, by splitting on pipe (
 * <code>|</code>) and indexing under each of these alternatives.
 * 
 * @since 1.6
 * @version $Id: BeiderMorseEncoder.java 1378746 2012-08-29 21:29:49Z tn $
 */
public class BeiderMorseEncoder implements StringEncoder {
	// Implementation note: this class is a bean-style (Spring-friendly) facade
	// over PhoneticEngine. The engine is treated as immutable, so each setter
	// swaps in a freshly constructed engine carrying the changed setting, and
	// all encoding work is delegated to whichever engine is current.

	// The engine currently delegated to. Initially configured as
	// NameType.GENERIC / RuleType.APPROX with concatenation switched on.
	private PhoneticEngine engine = new PhoneticEngine(NameType.GENERIC,
			RuleType.APPROX, true);

	/**
	 * Encodes an object, which must be a {@link String}.
	 * 
	 * @param source
	 *            the value to encode
	 * @return the result of {@link #encode(String)} for the given string
	 * @throws EncoderException
	 *             if <code>source</code> is not a String (this includes
	 *             <code>null</code>)
	 */
	@Override
	public Object encode(Object source) throws EncoderException {
		if (source instanceof String) {
			return encode((String) source);
		}
		throw new EncoderException(
				"BeiderMorseEncoder encode parameter is not of type String");
	}

	/**
	 * Encodes a string by handing it to the currently configured engine.
	 * 
	 * @param source
	 *            the string to encode, may be <code>null</code>
	 * @return the engine's encoding of <code>source</code>, or
	 *         <code>null</code> if <code>source</code> is <code>null</code>
	 * @throws EncoderException
	 *             declared by the {@link StringEncoder} contract
	 */
	@Override
	public String encode(String source) throws EncoderException {
		return source == null ? null : this.engine.encode(source);
	}

	/**
	 * Reports the name type the current engine was built with.
	 * 
	 * @return the NameType currently being used
	 */
	public NameType getNameType() {
		return this.engine.getNameType();
	}

	/**
	 * Reports the rule type the current engine was built with.
	 * 
	 * @return the RuleType currently being used
	 */
	public RuleType getRuleType() {
		return this.engine.getRuleType();
	}

	/**
	 * Reports whether multiple possible encodings are concatenated.
	 * 
	 * @return true if multiple encodings are concatenated, false if just the
	 *         first one is returned
	 */
	public boolean isConcat() {
		return this.engine.isConcat();
	}

	/**
	 * Chooses how multiple possible phonetic encodings are combined. All other
	 * settings are carried over from the current engine.
	 * 
	 * @param concat
	 *            true if multiple encodings are to be combined with a '|',
	 *            false if just the first one is to be considered
	 */
	public void setConcat(boolean concat) {
		final PhoneticEngine current = this.engine;
		this.engine = new PhoneticEngine(current.getNameType(),
				current.getRuleType(), concat, current.getMaxPhonemes());
	}

	/**
	 * Chooses the type of name to encode. Use {@link NameType#GENERIC} unless
	 * you specifically want phonetic encodings optimized for Ashkenazi or
	 * Sephardic Jewish family names. All other settings are carried over from
	 * the current engine.
	 * 
	 * @param nameType
	 *            the NameType in use
	 */
	public void setNameType(NameType nameType) {
		final PhoneticEngine current = this.engine;
		this.engine = new PhoneticEngine(nameType, current.getRuleType(),
				current.isConcat(), current.getMaxPhonemes());
	}

	/**
	 * Chooses the rule type to apply, which widens or narrows the range of
	 * phonetic encodings considered. All other settings are carried over from
	 * the current engine.
	 * 
	 * @param ruleType
	 *            {@link RuleType#APPROX} or {@link RuleType#EXACT} for
	 *            approximate or exact phonetic matches
	 */
	public void setRuleType(RuleType ruleType) {
		final PhoneticEngine current = this.engine;
		this.engine = new PhoneticEngine(current.getNameType(), ruleType,
				current.isConcat(), current.getMaxPhonemes());
	}

	/**
	 * Sets the maximum number of phonemes that shall be considered by the
	 * engine. All other settings are carried over from the current engine.
	 * 
	 * @param maxPhonemes
	 *            the maximum number of phonemes returned by the engine
	 * @since 1.7
	 */
	public void setMaxPhonemes(int maxPhonemes) {
		final PhoneticEngine current = this.engine;
		this.engine = new PhoneticEngine(current.getNameType(),
				current.getRuleType(), current.isConcat(), maxPhonemes);
	}

}