/* * Licensed to Elasticsearch under one or more contributor * license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright * ownership. Elasticsearch licenses this file to you under * the Apache License, Version 2.0 (the "License"); you may * not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.elasticsearch.index.analysis.phonetic; import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.Locale; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.codec.EncoderException; import org.apache.commons.codec.StringEncoder; /** * Kölner Phonetik * * H.J. Postel, Die Kölner Phonetik. Ein Verfahren zu Identifizierung * von Personennamen auf der Grundlage der Gestaltanalyse. IBM-Nachrichten 19 (1969), 925-931 * * Algorithmus aus der Matching Toolbox von Rainer Schnell * Java-Programmierung von Jörg Reiher * * mit Änderungen von Jörg Prante * */ public class KoelnerPhonetik implements StringEncoder { private static final String[] POSTEL_VARIATIONS_PATTERNS = {"AUN", "OWN", "RB", "RW", "WSK", "RSK"}; private static final String[] POSTEL_VARIATIONS_REPLACEMENTS = {"OWN", "AUN", "RW", "RB", "RSK", "WSK"}; private Pattern[] variationsPatterns; private boolean primary = false; private final Set<Character> csz = new HashSet<>(Arrays.asList( 'C', 'S', 'Z')); private final Set<Character> ckq = new HashSet<>(Arrays.asList( 'C', 'K', 'Q')); private final Set<Character> aouhkxq = new HashSet<>(Arrays.asList( 'A', 'O', 'U', 'H', 'K', 'X', 'Q')); private final Set<Character> ahkloqrux = new HashSet<>(Arrays.asList( 'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X')); /** * Constructor for Kölner Phonetik */ public KoelnerPhonetik() { init(); } public KoelnerPhonetik(boolean useOnlyPrimaryCode) { this(); this.primary = useOnlyPrimaryCode; } /** * Get variation patterns * * @return string array of variations */ protected String[] getPatterns() { return POSTEL_VARIATIONS_PATTERNS; } protected String[] getReplacements() { return POSTEL_VARIATIONS_REPLACEMENTS; } protected char getCode() { return '0'; } public double getRelativeValue(Object o1, Object o2) { String[] kopho1 = code(expandUmlauts(o1.toString().toUpperCase(Locale.GERMANY))); String[] kopho2 = code(expandUmlauts(o2.toString().toUpperCase(Locale.GERMANY))); for (int i = 0; i < kopho1.length; i++) { for (int ii = 0; ii < kopho2.length; ii++) { if (kopho1[i].equals(kopho2[ii])) { return 1; } } } return 0; } @Override public Object encode(Object str) throws EncoderException { return encode((String) str); } @Override public String encode(String str) throws EncoderException { if (str == null) return null; String[] s = code(str.toString()); StringBuilder sb = new StringBuilder(); for (int i = 0; i < s.length; i++) { sb.append(s[i]); if (i < s.length - 1) { sb.append('_'); } } return sb.toString(); } private void init() { this.variationsPatterns = new Pattern[getPatterns().length]; for (int i = 0; i < getPatterns().length; i++) { this.variationsPatterns[i] = Pattern.compile(getPatterns()[i]); } } private String[] code(String str) { List<String> parts = partition(str); String[] codes = new String[parts.size()]; int i = 0; for (String s : parts) { codes[i++] = substitute(s); } return codes; } private List<String> partition(String str) { String primaryForm = str; List<String> parts = new ArrayList<>(); parts.add(primaryForm.replaceAll("[^\\p{L}\\p{N}]", "")); if (!primary) { List<String> tmpParts = new ArrayList<>(Arrays.asList(str.split("[\\p{Z}\\p{C}\\p{P}]"))); int numberOfParts = tmpParts.size(); while (tmpParts.size() > 0) { StringBuilder part = new StringBuilder(); for (int i = 0; i < tmpParts.size(); i++) { part.append(tmpParts.get(i)); if (!(i + 1 == numberOfParts)) { parts.add(part.toString()); } } tmpParts.remove(0); } } List<String> variations = new ArrayList<>(); for (int i = 0; i < parts.size(); i++) { List<String> variation = getVariations(parts.get(i)); if (variation != null) { variations.addAll(variation); } } return variations; } private List<String> getVariations(String str) { int position = 0; List<String> variations = new ArrayList<>(); variations.add(""); while (position < str.length()) { int i = 0; int substPos = -1; while (substPos < position && i < getPatterns().length) { Matcher m = variationsPatterns[i].matcher(str); while (substPos < position && m.find()) { substPos = m.start(); } i++; } if (substPos >= position) { i--; List<String> varNew = new ArrayList<>(); String prevPart = str.substring(position, substPos); for (int ii = 0; ii < variations.size(); ii++) { String tmp = variations.get(ii); varNew.add(tmp.concat(prevPart + getReplacements()[i])); variations.set(ii, variations.get(ii) + prevPart + getPatterns()[i]); } variations.addAll(varNew); position = substPos + getPatterns()[i].length(); } else { for (int ii = 0; ii < variations.size(); ii++) { variations.set(ii, variations.get(ii) + str.substring(position, str.length())); } position = str.length(); } } return variations; } private String substitute(String str) { String s = expandUmlauts(str.toUpperCase(Locale.GERMAN)); s = removeSequences(s); StringBuilder sb = new StringBuilder(); for (int i = 0; i < s.length(); i++) { char current = s.charAt(i); char next = i + 1 < s.length() ? s.charAt(i + 1) : '_'; char prev = i > 0 ? s.charAt(i - 1) : '_'; switch (current) { case 'A': case 'E': case 'I': case 'J': case 'Y': case 'O': case 'U': if (i == 0 || ((i == 1) && prev == 'H')) { sb.append(getCode()); } break; case 'P': sb.append(next == 'H' ? "33" : '1'); break; case 'B': sb.append('1'); break; case 'D': case 'T': sb.append(csz.contains(next) ? '8' : '2'); break; case 'F': case 'V': case 'W': sb.append('3'); break; case 'G': case 'K': case 'Q': sb.append('4'); break; case 'C': if (i == 0) { sb.append(ahkloqrux.contains(next) ? '4' : '8'); } else { sb.append(aouhkxq.contains(next) ? '4' : '8'); } if (sb.length() >= 2 && sb.charAt(sb.length() - 2) == '8') { sb.setCharAt(sb.length() - 1, '8'); } break; case 'X': sb.append(i < 1 || !ckq.contains(prev) ? "48" : '8'); break; case 'L': sb.append('5'); break; case 'M': case 'N': sb.append('6'); break; case 'R': sb.append('7'); break; case 'S': case 'Z': sb.append('8'); break; case 'H': break; } } s = sb.toString(); s = removeSequences(s); return s; } private String expandUmlauts(String str) { return str.replaceAll("\u00C4", "AE").replaceAll("\u00D6", "OE").replaceAll("\u00DC", "UE"); } private String removeSequences(String str) { if (str == null || str.length() == 0) { return ""; } int i = 0, j = 0; StringBuilder sb = new StringBuilder().append(str.charAt(i++)); char c; while (i < str.length()) { c = str.charAt(i); if (c != sb.charAt(j)) { sb.append(c); j++; } i++; } return sb.toString(); } }