package ca.pfv.spmf.algorithms.clustering.text_clusterer;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.regex.Pattern;

/* This file is copyright (c) 2008-2012 Philippe Fournier-Viger
 *
 * This file is part of the SPMF DATA MINING SOFTWARE
 * (http://www.philippe-fournier-viger.com/spmf).
 *
 * SPMF is free software: you can redistribute it and/or modify it under the
 * terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with
 * SPMF. If not, see <http://www.gnu.org/licenses/>.
 */

/**
 * Removes stop words from the text of an attribute/record.
 * <p>
 * A stop word is removed only where it occurs as a whole word, i.e. where it is
 * neither preceded nor followed by a letter, a digit or an underscore. Matching
 * is case-sensitive and stop words are treated as literal text, not as regular
 * expressions.
 *
 * @author Sabarish Raghu
 */
public class StopWordAnalyzer {

	/**
	 * Default list of common English words that carry little significance for
	 * clustering.
	 */
	private static final String[] DEFAULT_STOP_WORDS = {
		"a", "able", "about", "above", "according", "accordingly", "across",
		"actually", "after", "afterwards", "again", "against", "ain't", "all",
		"allow", "allows", "almost", "alone", "along", "already", "also",
		"although", "always", "am", "among", "amongst", "an", "and", "another",
		"any", "anybody", "anyhow", "anyone", "anything", "anyway", "anyways",
		"anywhere", "apart", "appear", "appreciate", "appropriate", "are",
		"aren't", "around", "as", "aside", "ask", "asking", "associated", "at",
		"available", "away", "awfully", "be", "became", "because", "become",
		"becomes", "becoming", "been", "before", "beforehand", "behind", "being",
		"believe",
		"below", "beside", "besides", "best", "better", "between", "beyond",
		"both", "brief", "but", "by", "c'mon", "c's", "came", "can", "can't",
		"cannot", "cant", "cause", "causes", "certain", "certainly", "changes",
		"clearly", "co", "com", "come", "comes", "concerning", "consequently",
		"consider", "considering", "contain", "containing", "contains",
		"corresponding", "could", "couldn't", "course", "currently",
		"definitely", "described", "despite", "did", "didn't", "different", "do",
		"does", "doesn't", "doing", "don't", "done", "down", "downwards",
		"during", "each", "edu", "eg", "eight", "either", "else", "elsewhere",
		"enough", "entirely", "especially", "et", "etc", "even", "ever", "every",
		"everybody",
		"everyone", "everything", "everywhere", "ex", "exactly", "example",
		"except", "far", "few", "fifth", "first", "five", "followed",
		"following", "follows", "for", "former", "formerly", "forth", "four",
		"from", "further", "furthermore", "get", "gets", "getting", "given",
		"gives", "go", "goes", "going", "gone", "got", "gotten", "greetings",
		"had", "hadn't", "happens", "hardly", "has", "hasn't", "have", "haven't",
		"having", "he", "he's", "hello", "help", "hence", "her", "here",
		"here's", "hereafter", "hereby", "herein", "hereupon", "hers", "herself",
		"hi", "him", "himself", "his", "hither", "hopefully", "how", "howbeit",
		"however", "i'd", "i'll", "i'm", "i've", "ie", "if",
		"ignored", "immediate", "in", "inasmuch", "inc", "indeed", "indicate",
		"indicated", "indicates", "inner", "insofar", "instead", "into",
		"inward", "is", "isn't", "it", "it'd", "it'll", "it's", "its", "itself",
		"just", "keep", "keeps", "kept", "know", "known", "knows", "last",
		"lately", "later", "latter", "latterly", "least", "less", "lest", "let",
		"let's", "like", "liked", "likely", "little", "look", "looking", "looks",
		"ltd", "mainly", "many", "may", "maybe", "me", "mean", "meanwhile",
		"merely", "might", "more", "moreover", "most", "mostly", "much", "must",
		"my", "myself", "name", "namely", "nd", "near", "nearly", "necessary",
		"need", "needs", "neither",
		"never", "nevertheless", "new", "next", "nine", "no", "nobody", "non",
		"none", "noone", "nor", "normally", "not", "nothing", "novel", "now",
		"nowhere", "obviously", "of", "off", "often", "oh", "ok", "okay", "old",
		"on", "once", "one", "ones", "only", "onto", "or", "other", "others",
		"otherwise", "ought", "our", "ours", "ourselves", "out", "outside",
		"over", "overall", "own", "particular", "particularly", "per", "perhaps",
		"placed", "please", "plus", "possible", "presumably", "probably",
		"provides", "que", "quite", "qv", "rather", "rd", "re", "really",
		"reasonably", "regarding", "regardless", "regards", "relatively",
		"respectively", "right", "said", "same", "saw", "say",
		"saying", "says", "second", "secondly", "see", "seeing", "seem",
		"seemed", "seeming", "seems", "seen", "self", "selves", "sensible",
		"sent", "serious", "seriously", "seven", "several", "shall", "she",
		"should", "shouldn't", "since", "six", "so", "some", "somebody",
		"somehow", "someone", "something", "sometime", "sometimes", "somewhat",
		"somewhere", "soon", "sorry", "specified", "specify", "specifying",
		"still", "sub", "such", "sup", "sure", "t's", "take", "taken", "tell",
		"tends", "th", "than", "thank", "thanks", "thanx", "that", "that's",
		"thats", "the", "their", "theirs", "them", "themselves", "then",
		"thence", "there", "there's", "thereafter", "thereby", "therefore",
		"therein", "theres",
		"thereupon", "these", "they", "they'd", "they'll", "they're", "they've",
		"think", "third", "this", "thorough", "thoroughly", "those", "though",
		"three", "through", "throughout", "thru", "thus", "to", "together",
		"too", "took", "toward", "towards", "tried", "tries", "truly", "try",
		"trying", "twice", "two", "un", "under", "unfortunately", "unless",
		"unlikely", "until", "unto", "up", "upon", "us", "use", "used", "useful",
		"uses", "using", "usually", "value", "various", "very", "via", "viz",
		"vs", "want", "wants", "was", "wasn't", "way", "we", "we'd", "we'll",
		"we're", "we've", "welcome", "well", "went", "were", "weren't", "what",
		"what's", "whatever", "when",
		"whence", "whenever", "where", "where's", "whereafter", "whereas",
		"whereby", "wherein", "whereupon", "wherever", "whether", "which",
		"while", "whither", "who", "who's", "whoever", "whole", "whom", "whose",
		"why", "will", "willing", "wish", "with", "within", "without", "won't",
		"wonder", "would", "wouldn't", "yes", "yet", "you", "you'd", "you'll",
		"you're", "you've", "your", "yours", "yourself", "yourselves", "zero"
	};

	/** Orders longer strings before shorter ones. */
	private static final Comparator<String> LONGEST_FIRST = new Comparator<String>() {
		@Override
		public int compare(String left, String right) {
			int leftLength = left.length();
			int rightLength = right.length();
			if (leftLength == rightLength) {
				return 0;
			}
			return (leftLength > rightLength) ? -1 : 1;
		}
	};

	/**
	 * The stop words to remove. The list is stored by reference, so changes made
	 * to it after construction are picked up by the next call to
	 * {@link #removeStopWords(String)}.
	 */
	ArrayList<String> stopWordList;

	/** Most recently compiled pattern together with the words it was built from. */
	private volatile CompiledStopWords compiled;

	/**
	 * Creates an analyzer that uses the built-in list of common English stop
	 * words.
	 */
	public StopWordAnalyzer() {
		stopWordList = new ArrayList<String>(Arrays.asList(DEFAULT_STOP_WORDS));
	}

	/**
	 * Creates an analyzer that uses a user-defined list of stop words.
	 *
	 * @param stopWordList the stop words to remove; the list is not copied. A
	 *            {@code null} or empty list means that nothing is removed.
	 */
	public StopWordAnalyzer(ArrayList<String> stopWordList) {
		this.stopWordList = stopWordList;
	}

	/**
	 * Removes every whole-word occurrence of a stop word from the given text.
	 * <p>
	 * Each removed word is replaced by the empty string; surrounding whitespace
	 * and punctuation are left untouched, so consecutive separators may remain.
	 *
	 * @param attribute the attribute/record to clean
	 * @return the attribute/record with the stop words removed
	 * @throws NullPointerException if {@code attribute} is {@code null} while
	 *             at least one usable stop word is configured
	 */
	public String removeStopWords(String attribute) {
		Pattern pattern = currentPattern();
		if (pattern == null) {
			// No usable stop word: there is nothing to remove.
			return attribute;
		}
		return pattern.matcher(attribute).replaceAll("");
	}

	/** Returns the pattern for the current stop word list, recompiling it only when the list changed. */
	private Pattern currentPattern() {
		List<String> words = stopWordList;
		if (words == null || words.isEmpty()) {
			return null;
		}
		CompiledStopWords cached = compiled;
		if (cached == null || !cached.source.equals(words)) {
			// Snapshot the list so that later mutations of it are detected.
			List<String> snapshot = new ArrayList<String>(words);
			cached = new CompiledStopWords(snapshot, buildPattern(snapshot));
			compiled = cached;
		}
		return cached.pattern;
	}

	/** Builds one whole-word pattern matching any of the given words, or {@code null} if none is usable. */
	private static Pattern buildPattern(List<String> words) {
		List<String> ordered = new ArrayList<String>(words.size());
		for (String word : words) {
			if (word != null && word.length() > 0) {
				ordered.add(word);
			}
		}
		if (ordered.isEmpty()) {
			return null;
		}
		// Longest first, so that "he's" is tried before its prefix "he".
		Collections.sort(ordered, LONGEST_FIRST);

		// Lookarounds rather than \b, so that stop words which start or end with
		// a non-word character still match, and so that non-ASCII letters count
		// as part of a word.
		StringBuilder regex = new StringBuilder("(?<![\\p{L}\\p{N}_])(?:");
		boolean first = true;
		for (String word : ordered) {
			if (!first) {
				regex.append('|');
			}
			// Stop words are literal text, never regular expressions.
			regex.append(Pattern.quote(word));
			first = false;
		}
		regex.append(")(?![\\p{L}\\p{N}_])");
		return Pattern.compile(regex.toString());
	}

	/** Immutable pairing of a compiled pattern with the stop words it was compiled from. */
	private static final class CompiledStopWords {
		final List<String> source;
		final Pattern pattern;

		CompiledStopWords(List<String> source, Pattern pattern) {
			this.source = source;
			this.pattern = pattern;
		}
	}
}