/* * * Copyright 2010, Google Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the * following conditions are met: * * Redistributions of source code must retain the above copyright notice, this list of conditions and the following * disclaimer. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the * following disclaimer in the documentation and/or other materials provided with the distribution. Neither the name of * Google Inc. nor the names of its contributors may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ package org.talend.windowkey; import java.util.Iterator; import java.util.TreeSet; import java.util.regex.Pattern; import org.apache.commons.lang.StringUtils; /** * code from google refine project FingerprintKeyer class. * * Minor changes done */ public class FingerprintKeyer { private static final Pattern alphanum = Pattern.compile("\\p{Punct}|\\p{Cntrl}"); //$NON-NLS-1$ public String key(String str) { String s = str.trim(); // first off, remove whitespace around the string s = s.toLowerCase(); // then lowercase it s = alphanum.matcher(s).replaceAll(""); // then remove all punctuation and control chars //$NON-NLS-1$ String[] frags = StringUtils.split(s); // split by whitespace TreeSet<String> set = new TreeSet<String>(); for (String ss : frags) { set.add(ss); // order fragments and dedupe } StringBuffer b = new StringBuffer(); Iterator<String> i = set.iterator(); while (i.hasNext()) { // join ordered fragments back together b.append(i.next()); b.append(' '); } return asciify(b.toString().trim()); // find ASCII equivalent to characters } protected String asciify(String s) { char[] c = s.toCharArray(); StringBuffer b = new StringBuffer(); for (char element : c) { b.append(translate(element)); } return b.toString(); } /** * Translate the given unicode char in the closest ASCII representation NOTE: this function deals only with latin-1 * supplement and latin-1 extended code charts */ private char translate(char c) { switch (c) { case '\u00C0': case '\u00C1': case '\u00C2': case '\u00C3': case '\u00C4': case '\u00C5': case '\u00E0': case '\u00E1': case '\u00E2': case '\u00E3': case '\u00E4': case '\u00E5': case '\u0100': case '\u0101': case '\u0102': case '\u0103': case '\u0104': case '\u0105': return 'a'; case '\u00C7': case '\u00E7': case '\u0106': case '\u0107': case '\u0108': case '\u0109': case '\u010A': case '\u010B': case '\u010C': case '\u010D': return 'c'; case '\u00D0': case '\u00F0': case '\u010E': case '\u010F': case '\u0110': case '\u0111': return 'd'; case '\u00C8': case '\u00C9': case '\u00CA': case '\u00CB': case '\u00E8': case '\u00E9': case '\u00EA': case '\u00EB': case '\u0112': case '\u0113': case '\u0114': case '\u0115': case '\u0116': case '\u0117': case '\u0118': case '\u0119': case '\u011A': case '\u011B': return 'e'; case '\u011C': case '\u011D': case '\u011E': case '\u011F': case '\u0120': case '\u0121': case '\u0122': case '\u0123': return 'g'; case '\u0124': case '\u0125': case '\u0126': case '\u0127': return 'h'; case '\u00CC': case '\u00CD': case '\u00CE': case '\u00CF': case '\u00EC': case '\u00ED': case '\u00EE': case '\u00EF': case '\u0128': case '\u0129': case '\u012A': case '\u012B': case '\u012C': case '\u012D': case '\u012E': case '\u012F': case '\u0130': case '\u0131': return 'i'; case '\u0134': case '\u0135': return 'j'; case '\u0136': case '\u0137': case '\u0138': return 'k'; case '\u0139': case '\u013A': case '\u013B': case '\u013C': case '\u013D': case '\u013E': case '\u013F': case '\u0140': case '\u0141': case '\u0142': return 'l'; case '\u00D1': case '\u00F1': case '\u0143': case '\u0144': case '\u0145': case '\u0146': case '\u0147': case '\u0148': case '\u0149': case '\u014A': case '\u014B': return 'n'; case '\u00D2': case '\u00D3': case '\u00D4': case '\u00D5': case '\u00D6': case '\u00D8': case '\u00F2': case '\u00F3': case '\u00F4': case '\u00F5': case '\u00F6': case '\u00F8': case '\u014C': case '\u014D': case '\u014E': case '\u014F': case '\u0150': case '\u0151': return 'o'; case '\u0154': case '\u0155': case '\u0156': case '\u0157': case '\u0158': case '\u0159': return 'r'; case '\u015A': case '\u015B': case '\u015C': case '\u015D': case '\u015E': case '\u015F': case '\u0160': case '\u0161': case '\u017F': return 's'; case '\u0162': case '\u0163': case '\u0164': case '\u0165': case '\u0166': case '\u0167': return 't'; case '\u00D9': case '\u00DA': case '\u00DB': case '\u00DC': case '\u00F9': case '\u00FA': case '\u00FB': case '\u00FC': case '\u0168': case '\u0169': case '\u016A': case '\u016B': case '\u016C': case '\u016D': case '\u016E': case '\u016F': case '\u0170': case '\u0171': case '\u0172': case '\u0173': return 'u'; case '\u0174': case '\u0175': return 'w'; case '\u00DD': case '\u00FD': case '\u00FF': case '\u0176': case '\u0177': case '\u0178': return 'y'; case '\u0179': case '\u017A': case '\u017B': case '\u017C': case '\u017D': case '\u017E': return 'z'; } return c; } }