package com.yahoo.glimmer.indexing;
/*
* Copyright (c) 2012 Yahoo! Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software distributed under the License is
* distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and limitations under the License.
* See accompanying LICENSE file.
*/
import it.unimi.di.big.mg4j.index.TermProcessor;
import it.unimi.dsi.lang.MutableString;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.InvalidObjectException;
import java.io.ObjectStreamException;
import java.io.Reader;
import java.util.HashSet;
import java.util.Set;
/**
 * A term processor that excludes words on a stop list from being indexed.
 *
 * <p>The stop list is read one entry per line, first from the class path
 * (as a resource of this class's class loader) and, when no such resource
 * exists, from the file system. Entries are trimmed but otherwise stored
 * verbatim, while terms are lower-cased before the lookup, so only
 * lower-case entries can ever match.
 *
 * <p>Only the file name is serialized; the word set is transient and is
 * rebuilt from the file name when an instance is deserialized.
 */
public class StopwordTermProcessor implements TermProcessor {
    private static final long serialVersionUID = 1L;

    /** Name of the stop list used by {@link #getInstance()}. */
    private static final String DEFAULT_BLACKLIST_FILENAME = "blacklist.txt";

    /** Resource or file name the stop list was loaded from. */
    private final String fileName;

    /** Blacklisted words. Transient: reloaded from {@link #fileName} in {@link #readResolve()}. */
    private final transient Set<String> blacklist;

    /**
     * Holds the default instance so that it is created on the first call to
     * {@link #getInstance()} rather than when the outer class is loaded.
     * This keeps the class usable with an explicit file name even when the
     * default stop list is not available.
     */
    private static final class DefaultInstanceHolder {
        static final StopwordTermProcessor INSTANCE = createDefault();

        private static StopwordTermProcessor createDefault() {
            try {
                return new StopwordTermProcessor(DEFAULT_BLACKLIST_FILENAME);
            } catch (IOException e) {
                throw new RuntimeException(e);
            } catch (ClassNotFoundException e) {
                throw new RuntimeException(e);
            }
        }
    }

    /**
     * Creates a processor backed by the given stop list.
     *
     * @param fileName name of a class path resource or, if no such resource
     *            exists, path of a file containing one stop word per line.
     * @throws IOException if the stop list cannot be opened or read.
     * @throws ClassNotFoundException never thrown by this implementation;
     *             kept in the signature for compatibility with existing callers.
     * @throws NullPointerException if {@code fileName} is {@code null}.
     */
    public StopwordTermProcessor(final String fileName) throws IOException, ClassNotFoundException {
        if (fileName == null) {
            throw new NullPointerException("fileName");
        }
        this.fileName = fileName;
        this.blacklist = loadBlacklist(fileName);
    }

    /**
     * Reads the stop list, one trimmed entry per line.
     *
     * <p>The class path is tried first; the file system is used only when the
     * resource does not exist. A read error on either source is reported to
     * the caller instead of silently switching source, so a partially read
     * list is never used. Text is decoded with the platform default charset.
     *
     * @param fileName resource name or file path of the stop list.
     * @return the set of entries read.
     * @throws IOException if the source cannot be opened or read.
     */
    private static Set<String> loadBlacklist(final String fileName) throws IOException {
        // getResourceAsStream signals "not found" by returning null, not by throwing.
        final InputStream resource = StopwordTermProcessor.class.getClassLoader().getResourceAsStream(fileName);
        final Reader source = resource != null ? new InputStreamReader(resource) : new FileReader(fileName);
        final BufferedReader reader = new BufferedReader(source);
        try {
            final Set<String> words = new HashSet<String>();
            String line;
            while ((line = reader.readLine()) != null) {
                words.add(line.trim());
            }
            return words;
        } finally {
            // Closing the outer reader also closes the underlying stream or file.
            reader.close();
        }
    }

    /**
     * Returns the shared processor backed by the default stop list
     * ({@value #DEFAULT_BLACKLIST_FILENAME}), loading it on first use.
     *
     * @return the default instance.
     */
    public final static TermProcessor getInstance() {
        return DefaultInstanceHolder.INSTANCE;
    }

    /**
     * Decides whether a term should be indexed.
     *
     * @param term the term to check; may be {@code null}.
     * @return {@code false} if {@code term} is {@code null} or its lower-cased
     *         form is on the stop list, {@code true} otherwise.
     */
    public boolean processTerm(final MutableString term) {
        if (term == null) {
            return false;
        }
        // The set holds String keys, so probe it with a String: a lookup keyed by
        // a MutableString only works if its hashCode/equals agree with String's.
        // toLowerCase() is invoked on the term exactly as before.
        // NOTE(review): it presumably lower-cases the term in place - confirm.
        return !blacklist.contains(term.toLowerCase().toString());
    }

    /**
     * Applies the same rule as {@link #processTerm(MutableString)} to a prefix.
     *
     * @param prefix the prefix to check; may be {@code null}.
     * @return the result of {@code processTerm(prefix)}.
     */
    public boolean processPrefix(final MutableString prefix) {
        return processTerm(prefix);
    }

    /**
     * Replaces a freshly deserialized instance, whose transient word set is
     * {@code null}, with one that has reloaded the stop list from
     * {@link #fileName}.
     *
     * @return a fully initialised processor for the same file name.
     * @throws ObjectStreamException if the stop list cannot be reloaded.
     */
    private Object readResolve() throws ObjectStreamException {
        try {
            return new StopwordTermProcessor(fileName);
        } catch (IOException e) {
            throw reloadFailure(e);
        } catch (ClassNotFoundException e) {
            throw reloadFailure(e);
        }
    }

    /** Wraps a reload failure in the exception type the serialization machinery expects. */
    private InvalidObjectException reloadFailure(final Exception cause) {
        final InvalidObjectException failure = new InvalidObjectException("Cannot reload stop list " + fileName + ": " + cause);
        failure.initCause(cause);
        return failure;
    }

    /**
     * Returns the class name, followed by the file name in parentheses when
     * one is set.
     */
    @Override
    public String toString() {
        if (fileName == null) {
            return this.getClass().getName();
        } else {
            return this.getClass().getName() + "(" + fileName + ")";
        }
    }

    /**
     * Returns the same text as {@link #toString()}.
     *
     * @return the specification string of this processor.
     */
    public String toSpec() {
        return toString();
    }

    /**
     * Returns this instance; the file name and word set are never modified
     * after construction, so no separate copy is made.
     *
     * @return {@code this}.
     */
    public StopwordTermProcessor copy() {
        return this;
    }
}