/*
 * This file is part of CoAnSys project.
 * Copyright (c) 2012-2015 ICM-UW
 *
 * CoAnSys is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * CoAnSys is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with CoAnSys. If not, see <http://www.gnu.org/licenses/>.
 */
package pl.edu.icm.coansys.commons.stringsimilarity;

import java.util.Locale;

/**
 * Case-insensitive Jaro-Winkler style similarity of two strings, in the range
 * {@code [0, 1]}.
 *
 * <p>This code is derived from Fine-Grained Record Integration and Linkage Tool
 * written by Pawel Jurczyk (http://fril.sourceforge.net/)
 *
 * <p>The matching window used here is {@code min(len1, len2) / 2 + 1} and is
 * applied as the half-open index range {@code [i - window, i + window)}.
 *
 * @author Artur Czeczko <a.czeczko@icm.edu.pl>
 */
public class JaroWinklerSimilarity extends SimilarityCalculator {

    /** Maximum number of leading characters that contribute to the prefix bonus. */
    private static final int MAX_PREFIX_LENGTH = 4;

    /** Scaling factor applied per common-prefix character. */
    private static final float PREFIX_WEIGHT = 0.1f;

    /**
     * Computes the similarity of the two strings after lower-casing them.
     *
     * <p>Lower-casing uses {@link Locale#ROOT} so that the result does not depend
     * on the default locale of the JVM.
     *
     * @param s1 first string; must not be {@code null}
     * @param s2 second string; must not be {@code null}
     * @return similarity clamped to {@code [0, 1]}; {@code 0} when the strings
     *         share no characters inside the matching window (this includes the
     *         case where either string is empty)
     */
    @Override
    protected float doCalculate(String s1, String s2) {
        String lower1 = s1.toLowerCase(Locale.ROOT);
        String lower2 = s2.toLowerCase(Locale.ROOT);

        float dist = score(lower1, lower2);
        // Prefix bonus: the longer the shared prefix, the more of the remaining
        // distance to 1.0 is added.
        dist = dist + commonPrefix(lower1, lower2, MAX_PREFIX_LENGTH) * PREFIX_WEIGHT * (1.0f - dist);

        if (dist < 0) {
            dist = 0;
        }
        if (dist > 1.0f) {
            dist = 1.0f;
        }
        return dist;
    }

    /**
     * Computes the base score (without the prefix bonus) of two strings.
     *
     * @param s1 first string
     * @param s2 second string
     * @return the average of the two match ratios and the non-transposed ratio,
     *         or {@code 0} when there are no common characters or the two
     *         directions disagree on how many characters match
     */
    private static float score(String s1, String s2) {
        int limit = Math.min(s1.length(), s2.length()) / 2 + 1;

        String c1 = commonChars(s1, s2, limit);
        String c2 = commonChars(s2, s1, limit);
        if (c1.length() != c2.length() || c1.length() == 0) {
            return 0;
        }

        int transpositions = transpositions(c1, c2);
        return (c1.length() / ((float) s1.length())
                + c2.length() / ((float) s2.length())
                + (c1.length() - transpositions) / ((float) c1.length())) / 3.0f;
    }

    /**
     * Collects, in order, the characters of {@code s1} that have a not yet
     * consumed equal character in {@code s2} within the matching window.
     *
     * <p>Consumed positions of {@code s2} are tracked in a separate flag array
     * rather than by overwriting them with a sentinel character, so every
     * character value (including {@code '*'}) is treated as ordinary input.
     *
     * @param s1    string whose characters are looked up
     * @param s2    string that is searched
     * @param limit window size; position {@code i} of {@code s1} is compared with
     *              positions {@code [i - limit, i + limit)} of {@code s2}
     * @return the matched characters of {@code s1}, in their original order
     */
    private static String commonChars(String s1, String s2, int limit) {
        StringBuilder common = new StringBuilder();
        boolean[] consumed = new boolean[s2.length()];

        for (int i = 0; i < s1.length(); i++) {
            char ch = s1.charAt(i);
            int from = Math.max(0, i - limit);
            int to = Math.min(i + limit, s2.length());
            for (int j = from; j < to; j++) {
                if (!consumed[j] && s2.charAt(j) == ch) {
                    consumed[j] = true;
                    common.append(ch);
                    break; // each character of s1 is matched at most once
                }
            }
        }
        return common.toString();
    }

    /**
     * Counts transpositions between two equally long common-character strings.
     *
     * @param c1 common characters of the first string
     * @param c2 common characters of the second string; must be at least as long
     *           as {@code c1}
     * @return half (integer division) of the number of positions at which the
     *         two strings differ
     */
    private static int transpositions(String c1, String c2) {
        int mismatches = 0;
        for (int i = 0; i < c1.length(); i++) {
            if (c1.charAt(i) != c2.charAt(i)) {
                mismatches++;
            }
        }
        return mismatches / 2;
    }

    /**
     * Returns the length of the common prefix of two strings, capped at
     * {@code maxPref}.
     *
     * @param c1      first string
     * @param c2      second string
     * @param maxPref upper bound on the returned length
     * @return number of leading characters that are equal in both strings, at
     *         most {@code maxPref} and at most the length of the shorter string
     */
    private static int commonPrefix(String c1, String c2, int maxPref) {
        int n = Math.min(maxPref, Math.min(c1.length(), c2.length()));
        for (int i = 0; i < n; i++) {
            if (c1.charAt(i) != c2.charAt(i)) {
                return i;
            }
        }
        return n;
    }
}