/*
* (C) Copyright 2006-2011 Nuxeo SA (http://nuxeo.com/) and others.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Contributors:
* Florent Guillaume
*/
package org.nuxeo.common.utils;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.regex.Pattern;
/**
* Functions related to simple fulltext parsing. They don't try to be exhaustive but they work for simple cases.
*/
public class FullTextUtils {
public static final Pattern wordPattern = Pattern.compile("[\\s\\p{Punct}]+");
public static final int MIN_SIZE = 3;
public static final String STOP_WORDS = "a an are and as at be by for from how "
+ "i in is it of on or that the this to was what when where who will with "
+ "car donc est il ils je la le les mais ni nous or ou pour tu un une vous " + "www com net org";
public static final Set<String> stopWords = new HashSet<String>(Arrays.asList(StringUtils.split(STOP_WORDS, ' ',
false)));
public static final String UNACCENTED = "aaaaaaaceeeeiiii\u00f0nooooo\u00f7ouuuuy\u00fey";
private FullTextUtils() {
// utility class
}
/**
* Extracts the words from a string for simple fulltext indexing.
* <p>
* Initial order is kept, but duplicate words are removed.
* <p>
* It omits short or stop words, removes accents and does pseudo-stemming.
*
* @param string the string
* @param removeDiacritics if the diacritics must be removed
* @return an ordered set of resulting words
*/
public static Set<String> parseFullText(String string, boolean removeDiacritics) {
if (string == null) {
return Collections.emptySet();
}
Set<String> set = new LinkedHashSet<String>();
for (String word : wordPattern.split(string)) {
String w = parseWord(word, removeDiacritics);
if (w != null) {
set.add(w);
}
}
return set;
}
/**
* Parses a word and returns a simplified lowercase form.
*
* @param string the word
* @param removeDiacritics if the diacritics must be removed
* @return the simplified word, or {@code null} if it was removed as a stop word or a short word
*/
public static String parseWord(String string, boolean removeDiacritics) {
int len = string.length();
if (len < MIN_SIZE) {
return null;
}
StringBuilder buf = new StringBuilder(len);
for (int i = 0; i < len; i++) {
char c = Character.toLowerCase(string.charAt(i));
if (removeDiacritics) {
if (c == '\u00e6') {
buf.append("ae");
} else if (c >= '\u00e0' && c <= '\u00ff') {
buf.append(UNACCENTED.charAt((c) - 0xe0));
} else if (c == '\u0153') {
buf.append("oe");
} else {
buf.append(c);
}
} else {
buf.append(c);
}
}
// simple heuristic to remove plurals
int l = buf.length();
if (l > 3 && buf.charAt(l - 1) == 's') {
buf.setLength(l - 1);
}
String word = buf.toString();
if (stopWords.contains(word)) {
return null;
}
return word;
}
}