/*
* #!
* Ontopia Classify
* #-
* Copyright (C) 2001 - 2013 The Ontopia Project
* #-
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* !#
*/
package net.ontopia.topicmaps.classify;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.util.Collection;
import java.util.HashSet;
import java.util.Locale;
import net.ontopia.utils.OntopiaRuntimeException;
/**
 * INTERNAL: A set of words considered "stop words" in a particular
 * language. As a term analyzer it multiplies the score of every term
 * whose stem is in the set by the configured stop factor.
 */
public class StopList implements TermAnalyzerIF {
  /** The stop words; every entry is trimmed and lowercased when loaded. */
  protected Collection<String> stopList;
  /** Factor that the score of a stop-word term is multiplied by. */
  protected double stopFactor = 0.0001d;

  /**
   * INTERNAL: Loads the stop list as a resource. The format of the
   * stop list is a plain text file with one word per line. The
   * resource is looked up through the class loader of this class and
   * decoded using the platform default charset.
   *
   * @param filename the resource name of the stop list
   * @throws OntopiaRuntimeException if there is no class loader, the
   *         resource cannot be found, or reading it fails
   */
  public StopList(String filename) {
    ClassLoader cloader = StopList.class.getClassLoader();
    if (cloader == null) {
      throw new OntopiaRuntimeException("Cannot find class loader.");
    }
    InputStream istream = cloader.getResourceAsStream(filename);
    if (istream == null) {
      throw new OntopiaRuntimeException("Cannot find resource: " + filename);
    }
    this.stopList = readAndClose(istream);
  }

  /**
   * INTERNAL: Loads the stop list from a file. The format of the stop
   * list is a plain text file with one word per line. The file is
   * decoded using the platform default charset.
   *
   * @param file the file containing the stop list
   * @throws OntopiaRuntimeException if the file cannot be opened or
   *         reading it fails
   */
  public StopList(File file) {
    InputStream istream;
    try {
      istream = new FileInputStream(file);
    } catch (IOException e) {
      throw new OntopiaRuntimeException(e);
    }
    this.stopList = readAndClose(istream);
  }

  /**
   * Reads all stop words from the given stream and closes it, wrapping
   * any I/O failure in an OntopiaRuntimeException.
   */
  private static Collection<String> readAndClose(InputStream istream) {
    try {
      // NOTE: no charset is given, so the platform default is used.
      BufferedReader reader = new BufferedReader(new InputStreamReader(istream));
      try {
        return load(reader);
      } finally {
        // closing the reader also closes the underlying stream
        reader.close();
      }
    } catch (IOException e) {
      throw new OntopiaRuntimeException(e);
    }
  }

  /**
   * Reads one stop word per line from the reader. Each line is trimmed
   * and lowercased; lines that are empty after trimming are skipped.
   */
  private static Collection<String> load(BufferedReader reader) throws IOException {
    Collection<String> words = new HashSet<String>();
    String line;
    while ((line = reader.readLine()) != null) {
      String word = line.trim();
      if (word.length() == 0) {
        // a blank line is not a word; do not register "" as a stop word
        continue;
      }
      // Lowercase independently of the JVM default locale, so that e.g.
      // a Turkish default locale does not map "I" to a dotless "i".
      words.add(word.toLowerCase(Locale.ROOT));
    }
    return words;
  }

  /**
   * INTERNAL: Sets the factor that the score of a term is multiplied
   * by when its stem is a stop word.
   *
   * @param stopFactor the new multiplication factor
   */
  public void setStopFactor(double stopFactor) {
    this.stopFactor = stopFactor;
  }

  /**
   * INTERNAL: Tests whether the given word is in the stop list. The
   * lookup is an exact match against the loaded entries, which are all
   * lowercase, so the word is not case-folded here.
   *
   * @param word the word to look up
   * @return true if the word is contained in the stop list
   */
  public boolean isStopWord(String word) {
    return stopList.contains(word);
  }

  /**
   * INTERNAL: Multiplies the score of the term by the stop factor if
   * the stem of the term is a stop word; otherwise leaves it untouched.
   *
   * @param term the term to analyze
   */
  public void analyzeTerm(Term term) {
    if (isStopWord(term.getStem())) {
      term.multiplyScore(stopFactor, "stoplist adjustment");
    }
  }

  /**
   * INTERNAL: Does nothing.
   */
  public void startAnalysis(TermDatabase tdb) {
  }

  /**
   * INTERNAL: Does nothing.
   */
  public void endAnalysis() {
  }
}