package edu.umd.rhsmith.diads.tools.tfidf; import java.io.File; import java.io.IOException; import java.util.Collection; import java.util.HashSet; import java.util.Scanner; import java.util.Set; /** * An implementation of {@link TermCleaner} which discards terms that appear in * a given collection of stop-words. Also provides convenience methods for * loading such a list from an input file. * * @author rmachedo * */ public class StopWordsCleaner implements TermCleaner { private final HashSet<String> stopWords; /** * Creates a new <code>StopWordsCleaner</code> using the given * {@link Collection} of term strings as its filter set. Note that the * filter set is copied from the given collection, so altering the * collection after an instance is constructed with this method will not * affect the instance's filtering. * * @param stopWords * the {@link Collection} of term strings as the filter set of * this <code>StopWordsCleaner</code> */ public StopWordsCleaner(Collection<String> stopWords) { this.stopWords = new HashSet<String>(stopWords); } @Override public String clean(String analysisText) { if (stopWords.contains(analysisText)) { return null; } else { return analysisText; } } /** * * @param filename * the name of the file to load stop-words from * @return the resulting {@link Set} of stop-words * @throws IOException * if an i/o exception occurs while loading the file */ public static Set<String> stopWordsFromFile(String filename) throws IOException { return stopWordsFromFile(new File(filename)); } /** * * @param filename * the file to load stop-words from * @return the resulting {@link Set} of stop-words * @throws IOException * if an i/o exception occurs while loading the file */ public static Set<String> stopWordsFromFile(File file) throws IOException { Set<String> words = new HashSet<String>(); Scanner s = new Scanner(file); while (s.hasNextLine()) { String line = s.nextLine(); if (line.startsWith("#")) { continue; } line = line.trim(); words.add(line); } s.close(); return words; } }