/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.commons.codec.language; import java.io.InputStream; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Scanner; import java.util.Set; import org.apache.commons.codec.CharEncoding; import org.apache.commons.codec.EncoderException; import org.apache.commons.codec.StringEncoder; /** * Encodes a string into a Daitch-Mokotoff Soundex value. * <p> * The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, yielding greater * accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation but differences in spelling. * </p> * <p> * The main differences compared to the other soundex variants are: * </p> * <ul> * <li>coded names are 6 digits long * <li>the initial character of the name is coded * <li>rules to encoded multi-character n-grams * <li>multiple possible encodings for the same name (branching) * </ul> * <p> * This implementation supports branching, depending on the used method: * <ul> * <li>{@link #encode(String)} - branching disabled, only the first code will be returned * <li>{@link #soundex(String)} - branching enabled, all codes will be returned, separated by '|' * </ul> * <p> * Note: this implementation has additional branching rules compared to the original description of the algorithm. The * rules can be customized by overriding the default rules contained in the resource file * {@code org/apache/commons/codec/language/dmrules.txt}. * </p> * <p> * This class is thread-safe. * </p> * * @see Soundex * @see <a href="http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a> * @see <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a> * * @version $Id: DaitchMokotoffSoundex.java 1636515 2014-11-04 07:32:24Z tn $ * @since 1.10 */ public class DaitchMokotoffSoundex implements StringEncoder { /** * Inner class representing a branch during DM soundex encoding. */ private static final class Branch { private final StringBuilder builder; private String cachedString; private String lastReplacement; private Branch() { builder = new StringBuilder(); lastReplacement = null; cachedString = null; } /** * Creates a new branch, identical to this branch. * * @return a new, identical branch */ public Branch createBranch() { final Branch branch = new Branch(); branch.builder.append(toString()); branch.lastReplacement = this.lastReplacement; return branch; } @Override public boolean equals(final Object other) { if (this == other) { return true; } if (!(other instanceof Branch)) { return false; } return toString().equals(((Branch) other).toString()); } /** * Finish this branch by appending '0's until the maximum code length has been reached. */ public void finish() { while (builder.length() < MAX_LENGTH) { builder.append('0'); cachedString = null; } } @Override public int hashCode() { return toString().hashCode(); } /** * Process the next replacement to be added to this branch. * * @param replacement * the next replacement to append * @param forceAppend * indicates if the default processing shall be overridden */ public void processNextReplacement(final String replacement, final boolean forceAppend) { final boolean append = lastReplacement == null || !lastReplacement.endsWith(replacement) || forceAppend; if (append && builder.length() < MAX_LENGTH) { builder.append(replacement); // remove all characters after the maximum length if (builder.length() > MAX_LENGTH) { builder.delete(MAX_LENGTH, builder.length()); } cachedString = null; } lastReplacement = replacement; } @Override public String toString() { if (cachedString == null) { cachedString = builder.toString(); } return cachedString; } } /** * Inner class for storing rules. */ private static final class Rule { private final String pattern; private final String[] replacementAtStart; private final String[] replacementBeforeVowel; private final String[] replacementDefault; protected Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel, final String replacementDefault) { this.pattern = pattern; this.replacementAtStart = replacementAtStart.split("\\|"); this.replacementBeforeVowel = replacementBeforeVowel.split("\\|"); this.replacementDefault = replacementDefault.split("\\|"); } public int getPatternLength() { return pattern.length(); } public String[] getReplacements(final String context, final boolean atStart) { if (atStart) { return replacementAtStart; } final int nextIndex = getPatternLength(); final boolean nextCharIsVowel = nextIndex < context.length() ? isVowel(context.charAt(nextIndex)) : false; if (nextCharIsVowel) { return replacementBeforeVowel; } return replacementDefault; } private boolean isVowel(final char ch) { return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u'; } public boolean matches(final String context) { return context.startsWith(pattern); } @Override public String toString() { return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart), Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault)); } } private static final String COMMENT = "//"; private static final String DOUBLE_QUOTE = "\""; private static final String MULTILINE_COMMENT_END = "*/"; private static final String MULTILINE_COMMENT_START = "/*"; /** The resource file containing the replacement and folding rules */ private static final String RESOURCE_FILE = "org/apache/commons/codec/language/dmrules.txt"; /** The code length of a DM soundex value. */ private static final int MAX_LENGTH = 6; /** Transformation rules indexed by the first character of their pattern. */ private static final Map<Character, List<Rule>> RULES = new HashMap<Character, List<Rule>>(); /** Folding rules. */ private static final Map<Character, Character> FOLDINGS = new HashMap<Character, Character>(); static { final InputStream rulesIS = DaitchMokotoffSoundex.class.getClassLoader().getResourceAsStream(RESOURCE_FILE); if (rulesIS == null) { throw new IllegalArgumentException("Unable to load resource: " + RESOURCE_FILE); } final Scanner scanner = new Scanner(rulesIS, CharEncoding.UTF_8); parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS); scanner.close(); // sort RULES by pattern length in descending order for (final Map.Entry<Character, List<Rule>> rule : RULES.entrySet()) { final List<Rule> ruleList = rule.getValue(); Collections.sort(ruleList, new Comparator<Rule>() { @Override public int compare(final Rule rule1, final Rule rule2) { return rule2.getPatternLength() - rule1.getPatternLength(); } }); } } private static void parseRules(final Scanner scanner, final String location, final Map<Character, List<Rule>> ruleMapping, final Map<Character, Character> asciiFoldings) { int currentLine = 0; boolean inMultilineComment = false; while (scanner.hasNextLine()) { currentLine++; final String rawLine = scanner.nextLine(); String line = rawLine; if (inMultilineComment) { if (line.endsWith(MULTILINE_COMMENT_END)) { inMultilineComment = false; } continue; } if (line.startsWith(MULTILINE_COMMENT_START)) { inMultilineComment = true; } else { // discard comments final int cmtI = line.indexOf(COMMENT); if (cmtI >= 0) { line = line.substring(0, cmtI); } // trim leading-trailing whitespace line = line.trim(); if (line.length() == 0) { continue; // empty lines can be safely skipped } if (line.contains("=")) { // folding final String[] parts = line.split("="); if (parts.length != 2) { throw new IllegalArgumentException("Malformed folding statement split into " + parts.length + " parts: " + rawLine + " in " + location); } else { final String leftCharacter = parts[0]; final String rightCharacter = parts[1]; if (leftCharacter.length() != 1 || rightCharacter.length() != 1) { throw new IllegalArgumentException("Malformed folding statement - " + "patterns are not single characters: " + rawLine + " in " + location); } asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0)); } } else { // rule final String[] parts = line.split("\\s+"); if (parts.length != 4) { throw new IllegalArgumentException("Malformed rule statement split into " + parts.length + " parts: " + rawLine + " in " + location); } else { try { final String pattern = stripQuotes(parts[0]); final String replacement1 = stripQuotes(parts[1]); final String replacement2 = stripQuotes(parts[2]); final String replacement3 = stripQuotes(parts[3]); final Rule r = new Rule(pattern, replacement1, replacement2, replacement3); final char patternKey = r.pattern.charAt(0); List<Rule> rules = ruleMapping.get(patternKey); if (rules == null) { rules = new ArrayList<Rule>(); ruleMapping.put(patternKey, rules); } rules.add(r); } catch (final IllegalArgumentException e) { throw new IllegalStateException( "Problem parsing line '" + currentLine + "' in " + location, e); } } } } } } private static String stripQuotes(String str) { if (str.startsWith(DOUBLE_QUOTE)) { str = str.substring(1); } if (str.endsWith(DOUBLE_QUOTE)) { str = str.substring(0, str.length() - 1); } return str; } /** Whether to use ASCII folding prior to encoding. */ private final boolean folding; /** * Creates a new instance with ASCII-folding enabled. */ public DaitchMokotoffSoundex() { this(true); } /** * Creates a new instance. * <p> * With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, e.g. * รจ -> e. * </p> * * @param folding * if ASCII-folding shall be performed before encoding */ public DaitchMokotoffSoundex(final boolean folding) { this.folding = folding; } /** * Performs a cleanup of the input string before the actual soundex transformation. * <p> * Removes all whitespace characters and performs ASCII folding if enabled. * </p> * * @param input * the input string to cleanup * @return a cleaned up string */ private String cleanup(final String input) { final StringBuilder sb = new StringBuilder(); for (char ch : input.toCharArray()) { if (Character.isWhitespace(ch)) { continue; } ch = Character.toLowerCase(ch); if (folding && FOLDINGS.containsKey(ch)) { ch = FOLDINGS.get(ch); } sb.append(ch); } return sb.toString(); } /** * Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching. * <p> * This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an * EncoderException if the supplied object is not of type java.lang.String. * </p> * * @see #soundex(String) * * @param obj * Object to encode * @return An object (of type java.lang.String) containing the DM soundex code, which corresponds to the String * supplied. * @throws EncoderException * if the parameter supplied is not of type java.lang.String * @throws IllegalArgumentException * if a character is not mapped */ @Override public Object encode(final Object obj) throws EncoderException { if (!(obj instanceof String)) { throw new EncoderException( "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String"); } return encode((String) obj); } /** * Encodes a String using the Daitch-Mokotoff soundex algorithm without branching. * * @see #soundex(String) * * @param source * A String object to encode * @return A DM Soundex code corresponding to the String supplied * @throws IllegalArgumentException * if a character is not mapped */ @Override public String encode(final String source) { if (source == null) { return null; } return soundex(source, false)[0]; } /** * Encodes a String using the Daitch-Mokotoff soundex algorithm with branching. * <p> * In case a string is encoded into multiple codes (see branching rules), the result will contain all codes, * separated by '|'. * </p> * <p> * Example: the name "AUERBACH" is encoded as both * </p> * <ul> * <li>097400</li> * <li>097500</li> * </ul> * <p> * Thus the result will be "097400|097500". * </p> * * @param source * A String object to encode * @return A string containing a set of DM Soundex codes corresponding to the String supplied * @throws IllegalArgumentException * if a character is not mapped */ public String soundex(final String source) { final String[] branches = soundex(source, true); final StringBuilder sb = new StringBuilder(); int index = 0; for (final String branch : branches) { sb.append(branch); if (++index < branches.length) { sb.append('|'); } } return sb.toString(); } /** * Perform the actual DM Soundex algorithm on the input string. * * @param source * A String object to encode * @param branching * If branching shall be performed * @return A string array containing all DM Soundex codes corresponding to the String supplied depending on the * selected branching mode */ private String[] soundex(final String source, final boolean branching) { if (source == null) { return null; } final String input = cleanup(source); final Set<Branch> currentBranches = new LinkedHashSet<Branch>(); currentBranches.add(new Branch()); char lastChar = '\0'; for (int index = 0; index < input.length(); index++) { final char ch = input.charAt(index); // ignore whitespace inside a name if (Character.isWhitespace(ch)) { continue; } final String inputContext = input.substring(index); final List<Rule> rules = RULES.get(ch); if (rules == null) { continue; } // use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access @SuppressWarnings("unchecked") final List<Branch> nextBranches = branching ? new ArrayList<Branch>() : Collections.EMPTY_LIST; for (final Rule rule : rules) { if (rule.matches(inputContext)) { if (branching) { nextBranches.clear(); } final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0'); final boolean branchingRequired = replacements.length > 1 && branching; for (final Branch branch : currentBranches) { for (final String nextReplacement : replacements) { // if we have multiple replacements, always create a new branch final Branch nextBranch = branchingRequired ? branch.createBranch() : branch; // special rule: occurrences of mn or nm are treated differently final boolean force = (lastChar == 'm' && ch == 'n') || (lastChar == 'n' && ch == 'm'); nextBranch.processNextReplacement(nextReplacement, force); if (branching) { nextBranches.add(nextBranch); } else { break; } } } if (branching) { currentBranches.clear(); currentBranches.addAll(nextBranches); } index += rule.getPatternLength() - 1; break; } } lastChar = ch; } final String[] result = new String[currentBranches.size()]; int index = 0; for (final Branch branch : currentBranches) { branch.finish(); result[index++] = branch.toString(); } return result; } }