package hu.u_szeged.utils;
import java.io.Serializable;
import java.nio.charset.Charset;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.SzTECoreNLP;
/**
 * Stop-word lookup backed by one word list that is shared by every instance.
 *
 * <p>The list is read the first time an instance is constructed while the shared
 * {@link #stopwords} field is still {@code null}; later constructions reuse it.
 * The file that is read is chosen from {@code SzTECoreNLP.lang} at that moment,
 * so changing the language afterwards does not reload the list.
 *
 * <p>Loading is not synchronized: if several threads construct the first instance
 * at the same time, the file may be read more than once.
 */
public class Stopword implements Serializable {

  public static final long serialVersionUID = 1L;

  /**
   * The shared stop-word set, or {@code null} until the first successful load.
   * Lookups lower-case the query before calling {@code contains}, so entries are
   * expected to be lower-case (assumption about the resource file -- TODO confirm).
   */
  public static Set<String> stopwords = null;

  /**
   * Loads the shared stop-word list if it has not been loaded yet.
   *
   * <p>The list is read from
   * {@code <user.dir>/resources/stopwords/stopwords_<lang>.txt} as UTF-8.
   */
  public Stopword() {
    if (stopwords != null) {
      return;
    }
    // Fill a local set first and publish it only after the read call has returned.
    // If the read throws, the shared field stays null and the next construction
    // retries, instead of every later instance silently using an empty set.
    Set<String> loaded = new HashSet<String>();
    NLPUtils.readDocToCollection(System.getProperty("user.dir") + "/resources/stopwords/stopwords_" + SzTECoreNLP.lang + ".txt", loaded,
        Charset.forName("UTF-8"));
    stopwords = loaded;
  }

  /**
   * Returns true if the given string is a stop word.
   *
   * <p>The comparison is case-insensitive. Lower-casing uses {@link Locale#ROOT} so
   * the result does not depend on the JVM default locale.
   *
   * @param str the word to look up; may be {@code null}
   * @return {@code true} if the lower-cased word is in the stop-word set,
   *     {@code false} otherwise or if {@code str} is {@code null}
   */
  public boolean isStopword(String str) {
    return str != null && stopwords.contains(str.toLowerCase(Locale.ROOT));
  }

  /**
   * Returns true if the token's surface form or its lemma is a stop word.
   *
   * <p>When the token carries no lemma annotation, a diagnostic line is written to
   * standard error and only the surface form is checked.
   *
   * @param ew the token to test; may be {@code null}
   * @return {@code true} if either the word or the lemma is a stop word,
   *     {@code false} otherwise or if {@code ew} is {@code null}
   */
  public boolean isStopword(CoreLabel ew) {
    if (ew == null) {
      return false;
    }
    String lemma = ew.get(LemmaAnnotation.class);
    if (lemma == null) {
      System.err.println("No lemma for token " + ew);
    }
    // The String overload lower-cases and tolerates null, so neither the word nor
    // the lemma needs to be pre-processed or null-checked here.
    return isStopword(ew.word()) || isStopword(lemma);
  }
}