//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.resources.utils;
import java.util.Collection;
import java.util.StringJoiner;
import java.util.regex.Pattern;
/**
 * Helper methods for working with stopwords.
 */
public final class StopwordUtils {

    /**
     * Regular expression fragment (an empty negative lookahead) that can never match.
     * Used as the alternation body when there are no terms, because an empty group
     * would otherwise match the empty string at every word boundary.
     */
    private static final String NEVER_MATCH = "(?!)";

    /**
     * Private constructor as this is a helper class that shouldn't be instantiated
     */
    private StopwordUtils() {
        // Do nothing
    }

    /**
     * Build a regular expression that matches any of the supplied stopwords,
     * along with any additional terms provided, as whole words (the alternation is
     * wrapped in {@code \b(...)\b}).
     *
     * Stopwords are quoted so they are matched literally. Any additional terms provided
     * are not escaped, so you can provide your own regular expressions to include in
     * the pattern.
     *
     * Null or empty stopwords and additional terms are skipped. If no terms remain,
     * the returned pattern never matches.
     *
     * @param stopwords the stopwords to match literally; null is treated as an empty collection
     * @param caseSensitive true for case sensitive matching; false or null for case
     *        insensitive matching (Unicode-aware, in line with {@link #isStopWord})
     * @param additionalTerms optional unescaped regular expressions to add to the alternation
     * @return the compiled pattern, with the matched term available as group 1
     * @throws java.util.regex.PatternSyntaxException if an additional term is not a valid
     *         regular expression
     */
    public static Pattern buildStopwordPattern(Collection<String> stopwords, Boolean caseSensitive, String... additionalTerms) {
        StringJoiner alternatives = new StringJoiner("|");
        // Used only when nothing at all is added to the joiner
        alternatives.setEmptyValue(NEVER_MATCH);

        if (stopwords != null) {
            for (String stopword : stopwords) {
                // An empty literal would match at every word boundary, so skip it
                if (stopword != null && !stopword.isEmpty()) {
                    alternatives.add(Pattern.quote(stopword));
                }
            }
        }

        if (additionalTerms != null) {
            for (String term : additionalTerms) {
                // StringJoiner would add a null term as the literal text "null"
                if (term != null && !term.isEmpty()) {
                    alternatives.add(term);
                }
            }
        }

        // CASE_INSENSITIVE alone only folds ASCII; UNICODE_CASE extends it to all letters
        int flags = Boolean.TRUE.equals(caseSensitive)
                ? 0
                : Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;

        return Pattern.compile("\\b(" + alternatives.toString() + ")\\b", flags);
    }

    /**
     * Returns true if word is a stopword
     *
     * @param word the word to test; null is never a stopword
     * @param stopwords the stopwords to test against; null is treated as an empty collection
     * @param caseSensitive true to require an exact match; false or null to ignore case
     * @return true if word is contained in stopwords, false otherwise
     */
    public static boolean isStopWord(String word, Collection<String> stopwords, Boolean caseSensitive) {
        if (word == null || stopwords == null) {
            return false;
        }

        if (Boolean.TRUE.equals(caseSensitive)) {
            return stopwords.contains(word);
        }

        // Called on word (known to be non-null) so that null elements are simply not matched;
        // anyMatch stops at the first hit rather than counting every match
        return stopwords.stream().anyMatch(word::equalsIgnoreCase);
    }
}