/* * StopwordRemover.java * Copyright (C) 2007 David Milne, d.n.milne@gmail.com * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ package org.wikipedia.miner.util.text; import java.io.*; import java.util.*; /** * This class provides moderate morphology. This involves cleaning the text using a TextCleaner then * removing all stopwords. */ public class StopwordRemover extends TextProcessor { HashSet<String> stopwords ; Cleaner cleaner = new Cleaner() ; /** * Initializes a newly created StopwordRemover with a list of stopwords contained within the given file. * The file must be in a format where each word is found on its own line. * * @param stopwordFile the file of stopwords * @throws IOException if there is a problem reading from the file of stopwords */ public StopwordRemover(File stopwordFile) throws IOException { stopwords = new HashSet<String>() ; BufferedReader input = new BufferedReader(new FileReader(stopwordFile)) ; String line ; while ((line = input.readLine()) != null) { String word = line.trim().toLowerCase() ; stopwords.add(word) ; } } /** * Initializes a newly created StopwordRemover with a list of stopwords contained within the HashSet. * * @param stopwords a HashSet of stopwords */ public StopwordRemover(HashSet<String> stopwords){ this.stopwords = stopwords ; } /** * Returns the processed version of the argument string. This involves * removing all stopwords, then cleaning each remaining term. * * @param text the string to be processed * @return the processed string */ @Override public String processText(String text) { String t = text ; String t2 = "" ; String[] terms = t.split(" ") ; for (String term : terms) { if (!stopwords.contains(term)) { t2 = t2 + cleaner.processText(term) + " "; } } return t2.trim() ; } }