/* * File: DefaultStopList.java * Authors: Justin Basilico * Company: Sandia National Laboratories * Project: Cognitive Foundry * * Copyright March 09, 2009, Sandia Corporation. * Under the terms of Contract DE-AC04-94AL85000, there is a non-exclusive * license for use of this work by or on behalf of the U.S. Government. Export * of this program may require a license from the United States Government. * See CopyrightHistory.txt for complete details. * */ package gov.sandia.cognition.text.term.filter; import gov.sandia.cognition.text.term.Term; import gov.sandia.cognition.text.term.Termable; import gov.sandia.cognition.util.AbstractCloneableSerializable; import java.io.BufferedReader; import java.io.File; import java.io.IOException; import java.io.InputStreamReader; import java.io.PrintStream; import java.net.URI; import java.net.URL; import java.net.URLConnection; import java.util.Collections; import java.util.LinkedHashSet; import java.util.Set; /** * A default, case-insensitive stop-list. * * @author Justin Basilico * @since 3.0 */ public class DefaultStopList extends AbstractCloneableSerializable implements StopList { /** The set of words in the stop list, all in lower-case. */ protected Set<String> words; /** * Creates a new, empty {@code DefaultStopList}. */ public DefaultStopList() { super(); this.setWords(new LinkedHashSet<String>()); } /** * Creates a new {@code DefaultStopList} with the given set of words. * * @param words * The words to add to the stop list. */ public DefaultStopList( final Iterable<String> words) { this(); this.addAll(words); } @Override public DefaultStopList clone() { final DefaultStopList clone = (DefaultStopList) super.clone(); clone.words = new LinkedHashSet<String>(this.words); return clone; } /** * Adds a word to the stop list. * * @param word * The word to add to the stop list. */ public void add( final String word) { this.words.add(word.toLowerCase()); } /** * Adds all of the given words to the stop list. * * @param words * The words to add. */ public void addAll( final Iterable<String> words) { for (String word : words) { this.add(word); } } public boolean contains( final Termable term) { return this.contains(term.asTerm()); } /** * Returns true if the given term is in the stop list. * * @param term * A term. * @return * True if the term is contained in the stop list. Otherwise, false. */ public boolean contains( final Term term) { final String text = term.getName(); return this.contains(text); } /** * Returns true if the given word is in the stop list. * * @param word * A word. * @return * True if the word is contained in the stop list. Otherwise, false. */ public boolean contains( final String word) { return this.words.contains(word.toLowerCase()); } /** * Gets the set of words in the stop list. * * @return * The set of words in the stop list. */ public Set<String> getWords() { return Collections.unmodifiableSet(this.words); } /** * Sets the set of words in the stop list. * * @param words * The set of words in the stop list. */ protected void setWords( final Set<String> words) { this.words = words; } /** * Saves the stop list to the given file. Each word is written on a * separate line. * * @param file * The file to save the stop list to. * @throws java.io.IOException * If there is an IO error. */ public void saveAsText( final File file) throws IOException { final PrintStream out = new PrintStream(file); try { this.saveAsText(out); } finally { out.close(); } } /** * Saves the stop list to the given stream. Each word is written on a * separate line. The stream is not closed at the end. * * @param out * The stream to write the stop words to. * @throws java.io.IOException * If there is an IO error. */ public void saveAsText( final PrintStream out) throws IOException { for (String word : this.getWords()) { out.println(word); } } /** * Loads a stop list by reading in a given file and treating each line as * a word. * * @param file * The file to read in. * @return * A new stop list containing a stop word for each line in the file. * @throws java.io.IOException * If there is an IO error. */ public static DefaultStopList loadFromText( final File file) throws IOException { return loadFromText(file.toURI()); } /** * Loads a stop list by reading in a given file and treating each line as * a word. * * @param uri * The file to read in. * @return * A new stop list containing a stop word for each line in the file. * @throws java.io.IOException * If there is an IO error. */ public static DefaultStopList loadFromText( final URI uri) throws IOException { return loadFromText(uri.toURL()); } /** * Loads a stop list by reading in a given file and treating each line as * a word. * * @param url * The file to read in. * @return * A new stop list containing a stop word for each line in the file. * @throws java.io.IOException * If there is an IO error. */ public static DefaultStopList loadFromText( final URL url) throws IOException { return loadFromText(url.openConnection()); } /** * Loads a stop list by reading in a given file and treating each line as * a word. * * @param connection * The connection to the file to read in. * @return * A new stop list containing a stop word for each line in the file. * @throws java.io.IOException * If there is an IO error. */ public static DefaultStopList loadFromText( final URLConnection connection) throws IOException { // Open a reader on the input stream. final BufferedReader reader = new BufferedReader(new InputStreamReader( connection.getInputStream())); try { // Attempt to load the stop list from the reader. return loadFromText(reader); } finally { reader.close(); } } /** * Loads a stop list by reading in from the given reader and treating each * line as a word. * * @param reader * The reader to read the stop words from. Does not close the reader. * @return * A new stop list containing a stop word for each line in the reader. * @throws java.io.IOException * If there is an IO error. */ public static DefaultStopList loadFromText( final BufferedReader reader) throws IOException { // Create the stop list to hold the result. final DefaultStopList result = new DefaultStopList(); // Read through each line. String line = null; while ((line = reader.readLine()) != null) { line = line.trim(); if (line.length() > 0) { result.add(line); } } // We've read through the whole reader so return the result. return result; } }