//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.resources;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceSpecifier;
import uk.gov.dstl.baleen.uima.BaleenResource;
/**
 * Shared resource for providing access to lists of common stop words.
 *
 * <p>The SMART list is read from the classpath when the resource is initialised and is served
 * as the default list. The other bundled lists, and custom list files, are read each time they
 * are requested.
 *
 * <p>In every list, lines are trimmed, blank lines and lines starting with {@code #} are
 * ignored, and the remaining entries are lower cased.
 */
public class SharedStopwordResource extends BaleenResource {
  /** Classpath location (relative to this class) of the bundled stop word lists. */
  private static final String STOPLIST_DIRECTORY = "stoplists/";

  /** Lines starting with this prefix are treated as comments and skipped. */
  private static final String COMMENT_PREFIX = "#";

  /** The default stop word list, populated by {@link #doInitialize}. */
  private Set<String> stopwords = new HashSet<>();

  /**
   * The available stopword lists.
   *
   * <ul>
   * <li>DEFAULT and SMART - SMART (Salton, 1971). Available at ftp://ftp.cs.cornell.edu/pub/smart/english.stop</li>
   * <li>FOX - "A stop list for general text" Fox 1989</li>
   * <li>RANKS_NL - A list of stopwords, taken from http://www.ranks.nl/stopwords</li>
   * <li>LONG - A long list of stopwords, taken from http://www.ranks.nl/stopwords</li>
   * <li>MYSQL - The list of stop words used by the full text search in MySQL</li>
   * </ul>
   */
  public enum StopwordList {
    DEFAULT,
    SMART,
    FOX,
    RANKS_NL,
    LONG,
    MYSQL
  }

  /**
   * Loads the default (SMART) stop word list.
   *
   * @return {@code true} once the default list has been loaded
   * @throws ResourceInitializationException if the default list is missing or cannot be read
   */
  @Override
  protected boolean doInitialize(ResourceSpecifier specifier, Map<String, Object> additionalParams)
      throws ResourceInitializationException {
    try {
      stopwords = loadStoplist("SmartStoplist.txt");
    } catch (IOException ioe) {
      getMonitor().error("Unable to read default stop words from SmartStoplist.txt", ioe);
      throw new ResourceInitializationException(ioe);
    }
    return true;
  }

  /** Releases the default stop word list. */
  @Override
  protected void doDestroy() {
    stopwords = null;
  }

  /**
   * Reads one of the bundled stop word lists from the classpath.
   *
   * @param name file name of the list inside the {@code stoplists/} directory
   * @return the lower cased stop words in the list
   * @throws IOException if the list does not exist on the classpath or cannot be read
   */
  private Set<String> loadStoplist(String name) throws IOException {
    String path = STOPLIST_DIRECTORY + name;
    try (InputStream is = getClass().getResourceAsStream(path)) {
      // getResourceAsStream signals a missing resource with null rather than an exception;
      // report it as an IOException so callers' existing error handling applies.
      if (is == null) {
        throw new IOException("Stop word list not found on classpath: " + path);
      }
      return readStopwords(is);
    }
  }

  /**
   * Parses stop words from a UTF-8 encoded stream, one entry per line.
   *
   * <p>The stream is not closed here; that is left to the caller that opened it.
   *
   * @param in the stream to read
   * @return the trimmed, lower cased entries, excluding blank lines and {@code #} comments
   * @throws IOException if the stream cannot be read
   */
  private static Set<String> readStopwords(InputStream in) throws IOException {
    Set<String> words = new HashSet<>();
    BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8));

    // readLine() is used instead of lines() so that read failures surface as IOException
    // rather than UncheckedIOException.
    String line;
    while ((line = reader.readLine()) != null) {
      String word = line.trim();
      if (!word.isEmpty() && !word.startsWith(COMMENT_PREFIX)) {
        // Locale.ROOT keeps the result independent of the JVM's default locale.
        words.add(word.toLowerCase(Locale.ROOT));
      }
    }
    return words;
  }

  /**
   * Returns a lower case list of stopwords loaded by this resource,
   * using the default stopword list.
   *
   * <p>The returned collection is the set held by this resource, not a copy, so callers should
   * treat it as read-only. It is {@code null} after the resource has been destroyed.
   */
  public Collection<String> getStopwords() {
    return stopwords;
  }

  /**
   * Returns a lower case list of stopwords loaded by this resource,
   * using the specified stopword list.
   *
   * <p>{@code DEFAULT}, {@code SMART} and {@code null} return the list loaded at initialisation;
   * every other value is read from the classpath on each call.
   *
   * @param list the list to return
   * @throws IOException if the requested list is missing or cannot be read
   */
  public Collection<String> getStopwords(StopwordList list) throws IOException {
    // Switching on a null enum would throw, so preserve the fall-through to the default list.
    if (list == null) {
      return stopwords;
    }

    switch (list) {
      case FOX:
        return loadStoplist("FoxStoplist.txt");
      case RANKS_NL:
        return loadStoplist("RanksNlStoplist.txt");
      case LONG:
        return loadStoplist("LongStoplist.txt");
      case MYSQL:
        return loadStoplist("MySqlStoplist.txt");
      default:
        return stopwords;
    }
  }

  /**
   * Returns a lower case list of stopwords loaded by this resource,
   * using a custom stopword list.
   *
   * @param list a UTF-8 encoded file containing one stop word per line
   * @throws IOException if the file cannot be opened or read
   */
  public Collection<String> getStopwords(File list) throws IOException {
    try (InputStream is = new FileInputStream(list)) {
      return readStopwords(is);
    }
  }
}