package com.knowledgebooks.nlp.util; import java.util.HashSet; import java.util.Set; import com.knowledgebooks.public_domain.Stemmer; /** * Copyright Mark Watson 2008-2010. All Rights Reserved. * License: LGPL version 3 (http://www.gnu.org/licenses/lgpl-3.0.txt) */ public class NoiseWords { private static String[] words = { "the", "a", "an", "it", "or", "and", "he", "she", "with", "often", "to", "do", "that", "this", "is", "are", "one", "two", "since", "just", "start", "beyond", "could", "not", "be", "from", "on", "could", "as", "say", "said", "will", "if", "by", "on", "often", "little", "big", "did", "do", "about", "any", "such", "up", "s", "already", "than", "now", "gave", "less", "more", "another", "for", "other", "goes", "would", "of", "her", "how", "told", "meet", "without", "few", "has", "ask", "run", "across", "rather", "me", "sometme", "want", "d", "look", "perhaps", "come", "o", "us", "m", "seem", "i", "u", "t", "what", "but", "last", "who", "toward", "when", "thing", "got", "can", "with", "at", "off", "in", "much", "under", "why", "also", "take", "am", "great", "in", "top", "see", "leaving" }; private static Set<String> stems = new HashSet<String>(); static { Stemmer stemmer = new Stemmer(); for (String word : words) { stems.add(stemmer.stemOneWord(word)); } } public static boolean checkFor(String stem) { return stems.contains(stem); } }