/*******************************************************************************
 * Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique)
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 *******************************************************************************/
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package eu.project.ttc.metrics;

import java.text.Collator;
import java.util.Locale;

/**
 * The {@link Levenshtein} {@link EditDistance} insensitive to diacritics, i.e.
 * pairs of words such as <code>café</code> and <code>cafe</code>,
 * <code>joão</code> and <code>joao</code> will be considered to have a
 * <code>0</code> edit distance or <code>1</code> similarity.
 * <p>
 * Characters are compared one <code>char</code> at a time through a
 * locale-specific {@link Collator} set to {@link Collator#PRIMARY} strength.
 *
 * @author Sebastián Peña Saldarriaga
 */
public class DiacriticInsensitiveLevenshtein extends AbstractEditDistance {

    /**
     * Number of times {@link #compute(String, String)} gave up early. This is
     * a plain static counter shared by all instances and incremented without
     * any synchronization.
     */
    public static int FastFailures = 0;

    /**
     * Similarity threshold under which the distance is not computed anymore.
     * The value <code>-1</code> means that no threshold has been set.
     */
    private double failThreshold = -1;

    /** Locale sensitive string comparator */
    private final Collator strCollator;

    /**
     * Creates a distance whose character comparison follows the collation
     * rules of the specified <code>locale</code>.
     *
     * @param locale
     *            The locale used to obtain the {@link Collator}.
     */
    public DiacriticInsensitiveLevenshtein(Locale locale) {
        super();
        // Might be modified depending on the language
        strCollator = Collator.getInstance(locale);
        strCollator.setStrength(Collator.PRIMARY);
    }

    /**
     * Normalizes the specified <code>distance</code> by
     * <code>max(|str|, |rst|)</code>. For historical reasons this method
     * actually returns 1 - normalized distance, making a similarity.
     *
     * @param distance
     *            The edit distance between <code>str</code> and
     *            <code>rst</code>.
     * @param str
     *            A string
     * @param rst
     *            Another string
     * @return A value determined by
     *         <code>1 - distance/max(|str|, |rst|)</code>, which lies in
     *         [0, 1] as long as <code>distance</code> does not exceed the
     *         longest length. When both strings are empty the result is
     *         <code>1</code>.
     */
    @Override
    public double normalize(int distance, String str, String rst) {
        int longest = Math.max(str.length(), rst.length());
        if (longest == 0) {
            // Both strings are empty: dividing would yield 0/0 = NaN, whereas
            // two empty strings are identical.
            return 1.0;
        }
        return 1.0 - ((double) distance / longest);
    }

    /**
     * Computes the diacritic-insensitive Levenshtein distance between
     * <code>str</code> and <code>rst</code>, giving up early when the distance
     * is certain to exceed the maximum allowed distance.
     * <p>
     * The maximum allowed distance is
     * <code>round((1 - failThreshold) * max(|str|, |rst|))</code> when a
     * threshold has been set, and <code>min(|str|, |rst|)</code> otherwise.
     * The early-exit test is evaluated once per row of the dynamic programming
     * table, i.e. once per character of <code>str</code>.
     *
     * @param str
     *            A string
     * @param rst
     *            Another string
     * @return The edit distance, or <code>max(|str|, |rst|)</code> if the
     *         computation was aborted (in which case {@link #FastFailures} is
     *         incremented).
     */
    @Override
    public int compute(String str, String rst) {
        int l = Math.max(str.length(), rst.length());
        int maxDistance = failThreshold == -1
                ? Math.min(str.length(), rst.length())
                : (int) Math.round((1 - failThreshold) * l);

        int[][] dp = new int[str.length() + 1][rst.length() + 1];
        for (int i = 0; i < dp.length; i++) {
            // Upper bound: larger than any value row i can hold.
            int bestPossibleEditDistance = dp.length;
            for (int j = 0; j < dp[i].length; j++) {
                // First row and first column hold the cost of building a
                // prefix from the empty string.
                dp[i][j] = i == 0 ? j : j == 0 ? i : 0;
                if (i > 0 && j > 0) {
                    if (diacriticInsensitiveEquals(str.charAt(i - 1), rst.charAt(j - 1))) {
                        dp[i][j] = dp[i - 1][j - 1];
                    } else {
                        dp[i][j] = Math.min(
                                dp[i][j - 1] + 1,
                                Math.min(dp[i - 1][j - 1] + 1, dp[i - 1][j] + 1));
                    }
                    bestPossibleEditDistance = Math.min(bestPossibleEditDistance, dp[i][j]);
                }
            }
            // After calculating row i, look at the smallest value found in
            // that row (columns j >= 1). Abort if maxDistance is strictly
            // exceeded.
            if (i > maxDistance && bestPossibleEditDistance > maxDistance) {
                FastFailures++;
                return l;
            }
        }
        return dp[str.length()][rst.length()];
    }

    /**
     * Determines whether <code>char1</code> and <code>char2</code> are equals
     * independent of the presence of diacritic marks. The decision is
     * delegated to the collator configured in the constructor.
     * <p>
     * NOTE(review): at {@link Collator#PRIMARY} strength a collator typically
     * ignores case differences as well as accents - confirm this is acceptable
     * for the targeted locales.
     *
     * @param char1
     *            The first char
     * @param char2
     *            The second char
     * @return <code>true</code> if <code>char1</code> and <code>char2</code>
     *         are equals, or <code>false</code> otherwise.
     */
    public boolean diacriticInsensitiveEquals(char char1, char char2) {
        return strCollator.equals(toComparableStr(char1), toComparableStr(char2));
    }

    /**
     * Wraps a single <code>char</code> into a string so that it can be handed
     * to the collator.
     */
    private String toComparableStr(char char1) {
        return Character.toString(char1);
    }

    /**
     * @return Always <code>true</code>.
     */
    @Override
    public boolean isFailFast() {
        return true;
    }

    /**
     * Sets the similarity threshold used by {@link #compute(String, String)}
     * to derive the maximum allowed distance. Passing <code>-1</code> restores
     * the "no threshold" behaviour.
     *
     * @param threshold
     *            The similarity threshold.
     */
    @Override
    public void setFailThreshold(double threshold) {
        failThreshold = threshold;
    }
}