/*
* Copyright 2010
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.stopwordremover;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.uima.resource.DataResource;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.SharedResourceObject;
/**
 * A set of stop words backed by a {@link HashSet}. Can be used as a shared resource object in
 * UIMA. Terms read from stop word sources are converted to lower case before they are stored.
 * <p>
 * Stop word sources are plain text with one entry per line: only the first word of each line is
 * used, and everything after a "|" is treated as a comment.
 */
public class StopWordSet
    implements SharedResourceObject
{
    /**
     * Separates the leading stop word from the rest of the line: a single whitespace character or
     * the comment marker "|". Compiled once instead of once per input line.
     */
    private static final Pattern TOKEN_SEPARATOR = Pattern.compile("\\s|\\|");

    /** The stop words loaded so far, stored in lower case. */
    private final Set<String> data = new HashSet<>();

    /**
     * Creates an empty stop word set.
     */
    public StopWordSet()
    {
        // Nothing to do: the set starts out empty.
    }

    /**
     * Creates a stop word set containing the stop words from all given files.
     *
     * @param fileNames
     *            the stop word files to read (UTF-8 encoded).
     * @throws IOException
     *             if any of the files could not be read.
     */
    public StopWordSet(String[] fileNames)
        throws IOException
    {
        for (String fileName : fileNames) {
            addStopWordListFile(fileName);
        }
    }

    /**
     * Loads a text file (UTF-8 encoding!) containing stop words. Only first word in each line will
     * be taken into account. Everything after "|" will be treated as comment.
     *
     * @param fileName
     *            the file to read.
     * @throws IOException
     *             if the file could not be read.
     */
    public void addStopWordListFile(String fileName)
        throws IOException
    {
        // Open the file exactly once and close it when done. Going through load(InputStream)
        // decodes the content as UTF-8 (as documented) instead of the platform default charset.
        try (InputStream is = new FileInputStream(fileName)) {
            load(is);
        }
    }

    /**
     * Loads stop words from the given UIMA data resource. The resource content is read as UTF-8
     * and its stream is closed afterwards.
     *
     * @param dataRes
     *            the resource providing the stop word list.
     * @throws ResourceInitializationException
     *             if the resource could not be read; the causing {@link IOException} is attached.
     */
    @Override
    public void load(DataResource dataRes)
        throws ResourceInitializationException
    {
        try (InputStream is = dataRes.getInputStream()) {
            load(is);
        }
        catch (IOException e) {
            throw new ResourceInitializationException(e);
        }
    }

    /**
     * Loads stop words from the given stream, decoding it as UTF-8. The stream is not closed by
     * this method.
     *
     * @param aIs
     *            the stream to read from.
     * @throws IOException
     *             if the data could not be read.
     */
    public void load(InputStream aIs)
        throws IOException
    {
        load(aIs, "UTF-8");
    }

    /**
     * Loads stop words from the given stream using the given character encoding. The stream is
     * not closed by this method.
     *
     * @param aIs
     *            the stream to read from.
     * @param aEncoding
     *            the name of the character encoding of the stream.
     * @throws IOException
     *             if the encoding is not supported or the data could not be read.
     */
    public void load(InputStream aIs, String aEncoding)
        throws IOException
    {
        load(new InputStreamReader(aIs, aEncoding));
    }

    /**
     * Load the stopwords from the given reader. If the set already contains data, new stopwords are
     * added. The reader is not closed by this method.
     *
     * @param aReader
     *            a reader.
     * @throws IOException
     *             if the data could not be read.
     */
    public void load(Reader aReader)
        throws IOException
    {
        BufferedReader br = new BufferedReader(aReader);
        String line;
        while ((line = br.readLine()) != null) {
            // The first field is the stop word; it is empty for blank lines and for lines that
            // start with the comment marker, and such lines are skipped.
            String[] words = TOKEN_SEPARATOR.split(line.trim());
            if (words.length > 0 && words[0].trim().length() > 0) {
                // NOTE: lower-casing uses the JVM default locale.
                data.add(words[0].toLowerCase());
            }
        }
    }

    /**
     * Checks whether the given word is in the set. The lookup is an exact, case-sensitive match;
     * since loaded entries are stored in lower case, a word containing upper case characters will
     * not match an entry loaded from a stop word source.
     *
     * @param aWord
     *            the word to look up.
     * @return {@code true} if the set contains the word, {@code false} otherwise.
     */
    public boolean contains(String aWord)
    {
        return data.contains(aWord);
    }
}