/*
*
* Copyright 2010, Google Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
* following conditions are met:
*
* Redistributions of source code must retain the above copyright notice, this list of conditions and the following
* disclaimer. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the
* following disclaimer in the documentation and/or other materials provided with the distribution. Neither the name of
* Google Inc. nor the names of its contributors may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package org.talend.windowkey;
import java.util.Iterator;
import java.util.Locale;
import java.util.TreeSet;
import java.util.regex.Pattern;
/**
 * Keyer that derives a fingerprint from the character n-grams of a string.
 * <p>
 * The input is lower-cased, stripped of punctuation, control and whitespace characters, and cut into overlapping
 * n-grams. The distinct n-grams are concatenated in sorted order and the result is handed to {@code asciify}, which is
 * defined outside this class.
 * <p>
 * See http://code.google.com/p/google-refine/wiki/ClusteringInDepth
 */
public class NGramFingerprintKeyer extends FingerprintKeyer {

    /**
     * Matches every character that is dropped before n-gram extraction: punctuation, control characters and
     * whitespace. These POSIX classes are compiled without Unicode flags, so they cover the US-ASCII ranges only.
     */
    private static final Pattern PUNCT_CTRL_SPACE = Pattern.compile("\\p{Punct}|\\p{Cntrl}|\\p{Space}"); //$NON-NLS-1$

    /** N-gram size used by {@link #key(String)} (bigrams). */
    private static final int DEFAULT_NGRAM_SIZE = 2;

    /**
     * Computes the bigram fingerprint of {@code str}.
     *
     * @param str the string to fingerprint; must not be {@code null}
     * @return the result of {@link #key(String, int)} with an n-gram size of 2
     * @see org.talend.windowkey.FingerprintKeyer#key(java.lang.String)
     */
    @Override
    public String key(String str) {
        return this.key(str, DEFAULT_NGRAM_SIZE);
    }

    /**
     * Computes the n-gram fingerprint of {@code str}.
     *
     * @param str the string to fingerprint; must not be {@code null} (a {@code null} value fails with a
     * {@link NullPointerException})
     * @param ngramSize the number of characters in each n-gram
     * @return the distinct n-grams of the cleaned input, concatenated in sorted order and passed through
     * {@code asciify}
     */
    public String key(String str, int ngramSize) {
        // Locale.ROOT keeps the key identical on every machine; with the default locale the result would depend on
        // the JVM settings (for instance "I" lower-cases to a dotless i under a Turkish locale).
        String cleaned = str.toLowerCase(Locale.ROOT);
        // remove all punctuation, control and whitespace characters
        cleaned = PUNCT_CTRL_SPACE.matcher(cleaned).replaceAll(""); //$NON-NLS-1$

        // join the ordered, de-duplicated fragments back together
        StringBuilder joined = new StringBuilder();
        for (String fragment : ngram_split(cleaned, ngramSize)) {
            joined.append(fragment);
        }
        // per the original comment, asciify finds the ASCII equivalent of the characters
        return asciify(joined.toString());
    }

    /**
     * Splits {@code s} into its overlapping substrings of {@code size} characters.
     * <p>
     * Fragments are cut on {@code char} boundaries, one starting at every index that leaves room for a full fragment.
     * A string shorter than {@code size} therefore produces an empty set, and a {@code size} of zero produces a set
     * holding only the empty string.
     *
     * @param s the string to split; must not be {@code null}
     * @param size the number of characters in each fragment
     * @return the distinct fragments, sorted in natural {@link String} order
     * @throws IndexOutOfBoundsException if {@code size} is negative
     */
    protected TreeSet<String> ngram_split(String s, int size) {
        TreeSet<String> fragments = new TreeSet<String>();
        char[] chars = s.toCharArray();
        for (int start = 0; start + size <= chars.length; start++) {
            fragments.add(new String(chars, start, size));
        }
        return fragments;
    }
}