/**
 * Copyright (c) 2013 Oculus Info Inc.
 * http://www.oculusinfo.com/
 *
 * Released under the MIT License.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy of
 * this software and associated documentation files (the "Software"), to deal in
 * the Software without restriction, including without limitation the rights to
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is furnished to do
 * so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
package spimedb.cluster.utils;

import java.util.Collections;
import java.util.Iterator;
import java.util.Locale;
import java.util.TreeSet;
import java.util.regex.Pattern;

/**
 * Static helpers for normalizing strings so that similar strings can be clustered together.
 */
public class StringTools {

    // Matches every ASCII punctuation character and every control character except tab (0x09),
    // which is kept so that it can still act as a token separator.
    private static final Pattern punctctrl = Pattern.compile("\\p{Punct}|[\\x00-\\x08\\x0A-\\x1F\\x7F]");

    // Matches a run of one or more whitespace characters; used to tokenize.
    private static final Pattern whitespace = Pattern.compile("\\s+");

    /**
     * Computes a fingerprint for the string that can be used to cluster similar strings.
     * The fingerprint is computed by:
     * <ol>
     *   <li>removing leading and trailing whitespace</li>
     *   <li>changing all chars to lowercase (locale-independent)</li>
     *   <li>removing all punctuation and control characters</li>
     *   <li>normalizing extended western characters to their ASCII representation
     *       (for example "g&ouml;del" &rarr; "godel")</li>
     *   <li>splitting the string into whitespace-separated tokens, ignoring empty tokens</li>
     *   <li>sorting the tokens and removing duplicates</li>
     *   <li>joining the tokens back together with single spaces</li>
     * </ol>
     * ASCII normalization happens before tokens are sorted and de-duplicated, so that
     * tokens differing only by diacritics collapse into one and the token order does not
     * depend on the accented spelling.
     *
     * @param str the string to fingerprint; must not be {@code null}
     * @return the fingerprint string (empty if the input contains no token characters)
     * @throws NullPointerException if {@code str} is {@code null}
     */
    public static String fingerPrint(String str) {
        // Locale.ROOT keeps the fingerprint identical regardless of the JVM's default locale.
        String lowered = str.trim().toLowerCase(Locale.ROOT);
        String normalized = toASCII(stripPunctAndCtrlChars(lowered));

        // A TreeSet both sorts the tokens and drops duplicates.
        TreeSet<String> tokens = new TreeSet<>();
        Collections.addAll(tokens, whitespace.split(normalized));
        // Stripping leading punctuation can leave leading whitespace, which makes split()
        // produce an empty first token; an empty input produces one as well.
        tokens.remove("");

        StringBuilder fingerPrint = new StringBuilder(normalized.length());
        Iterator<String> it = tokens.iterator();
        while (it.hasNext()) {
            fingerPrint.append(it.next());
            if (it.hasNext()) {
                fingerPrint.append(' ');
            }
        }
        return fingerPrint.toString();
    }

    /**
     * Removes all punctuation and control characters (except tabs) from the string.
     *
     * @param s the string to clean; must not be {@code null}
     * @return {@code s} with every punctuation and non-tab control character deleted
     */
    public static String stripPunctAndCtrlChars(String s) {
        return punctctrl.matcher(s).replaceAll("");
    }

    /**
     * Converts a unicode string to its ASCII representation by mapping each character
     * through {@link #toASCII(char)}.
     * NOTE: this function deals only with the latin-1 supplement and latin extended-A
     * code charts; characters outside the mapping are copied unchanged.
     *
     * @param s the string to convert; must not be {@code null}
     * @return ASCII version of string
     */
    public static String toASCII(String s) {
        StringBuilder ascii = new StringBuilder(s.length());
        for (int i = 0; i < s.length(); i++) {
            ascii.append(toASCII(s.charAt(i)));
        }
        return ascii.toString();
    }

    /**
     * Converts a single unicode character to its ASCII base letter.
     *
     * NOTE: this function deals only with the latin-1 supplement and latin extended-A
     * code charts. Every mapped character, upper or lower case, is converted to the
     * <em>lowercase</em> ASCII base letter; any other character is returned unchanged.
     *
     * @param c the character to convert
     * @return the lowercase ASCII base letter for a mapped character, otherwise {@code c}
     */
    public static char toASCII(char c) {
        // Every mapped code point lies in U+00C0..U+017F; skip the switch for everything else.
        if (c < '\u00C0' || c > '\u017F') {
            return c;
        }
        switch (c) {
            case '\u00C0': case '\u00C1': case '\u00C2': case '\u00C3': case '\u00C4': case '\u00C5':
            case '\u00E0': case '\u00E1': case '\u00E2': case '\u00E3': case '\u00E4': case '\u00E5':
            case '\u0100': case '\u0101': case '\u0102': case '\u0103': case '\u0104': case '\u0105':
                return 'a';
            case '\u00C7': case '\u00E7':
            case '\u0106': case '\u0107': case '\u0108': case '\u0109':
            case '\u010A': case '\u010B': case '\u010C': case '\u010D':
                return 'c';
            case '\u00D0': case '\u00F0':
            case '\u010E': case '\u010F': case '\u0110': case '\u0111':
                return 'd';
            case '\u00C8': case '\u00C9': case '\u00CA': case '\u00CB':
            case '\u00E8': case '\u00E9': case '\u00EA': case '\u00EB':
            case '\u0112': case '\u0113': case '\u0114': case '\u0115': case '\u0116':
            case '\u0117': case '\u0118': case '\u0119': case '\u011A': case '\u011B':
                return 'e';
            case '\u011C': case '\u011D': case '\u011E': case '\u011F':
            case '\u0120': case '\u0121': case '\u0122': case '\u0123':
                return 'g';
            case '\u0124': case '\u0125': case '\u0126': case '\u0127':
                return 'h';
            case '\u00CC': case '\u00CD': case '\u00CE': case '\u00CF':
            case '\u00EC': case '\u00ED': case '\u00EE': case '\u00EF':
            case '\u0128': case '\u0129': case '\u012A': case '\u012B': case '\u012C':
            case '\u012D': case '\u012E': case '\u012F': case '\u0130': case '\u0131':
                return 'i';
            case '\u0134': case '\u0135':
                return 'j';
            case '\u0136': case '\u0137': case '\u0138':
                return 'k';
            case '\u0139': case '\u013A': case '\u013B': case '\u013C': case '\u013D':
            case '\u013E': case '\u013F': case '\u0140': case '\u0141': case '\u0142':
                return 'l';
            case '\u00D1': case '\u00F1':
            case '\u0143': case '\u0144': case '\u0145': case '\u0146': case '\u0147':
            case '\u0148': case '\u0149': case '\u014A': case '\u014B':
                return 'n';
            case '\u00D2': case '\u00D3': case '\u00D4': case '\u00D5': case '\u00D6': case '\u00D8':
            case '\u00F2': case '\u00F3': case '\u00F4': case '\u00F5': case '\u00F6': case '\u00F8':
            case '\u014C': case '\u014D': case '\u014E': case '\u014F': case '\u0150': case '\u0151':
                return 'o';
            case '\u0154': case '\u0155': case '\u0156': case '\u0157': case '\u0158': case '\u0159':
                return 'r';
            case '\u015A': case '\u015B': case '\u015C': case '\u015D': case '\u015E':
            case '\u015F': case '\u0160': case '\u0161': case '\u017F':
                return 's';
            case '\u0162': case '\u0163': case '\u0164': case '\u0165': case '\u0166': case '\u0167':
                return 't';
            case '\u00D9': case '\u00DA': case '\u00DB': case '\u00DC':
            case '\u00F9': case '\u00FA': case '\u00FB': case '\u00FC':
            case '\u0168': case '\u0169': case '\u016A': case '\u016B': case '\u016C': case '\u016D':
            case '\u016E': case '\u016F': case '\u0170': case '\u0171': case '\u0172': case '\u0173':
                return 'u';
            case '\u0174': case '\u0175':
                return 'w';
            case '\u00DD': case '\u00FD': case '\u00FF':
            case '\u0176': case '\u0177': case '\u0178':
                return 'y';
            case '\u0179': case '\u017A': case '\u017B': case '\u017C': case '\u017D': case '\u017E':
                return 'z';
            default:
                return c;
        }
    }
}