package org.apache.lucene.analysis.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
/**
 * Loader for text files that represent a list of stopwords or a stem
 * dictionary.
 * <p>
 * Every loader that opens a {@link File} or a class resource decodes it as
 * UTF-8. Loaders that accept a {@link Reader} use whatever decoding the caller
 * configured on that reader, and close the reader before returning.
 */
public class WordlistLoader {

  /** Name of the charset used to decode files and class resources. */
  private static final String ENCODING = "UTF-8";

  /**
   * Loads a text file associated with a given class (See
   * {@link Class#getResourceAsStream(String)}) and adds every line as an entry
   * to a {@link Set} (omitting leading and trailing whitespace). Every line of
   * the file should contain only one word. The words need to be in lower-case if
   * you make use of an Analyzer which uses LowerCaseFilter (like
   * StandardAnalyzer).
   *
   * @param aClass
   *          a class that is associated with the given stopwordResource
   * @param stopwordResource
   *          name of the resource file associated with the given class
   * @return a {@link Set} with the file's words
   * @throws IOException
   *           if the resource cannot be found or cannot be read
   */
  public static Set<String> getWordSet(Class<?> aClass, String stopwordResource)
      throws IOException {
    // The Reader overload closes the reader, also when reading fails.
    return getWordSet(openResource(aClass, stopwordResource));
  }

  /**
   * Loads a text file associated with a given class (See
   * {@link Class#getResourceAsStream(String)}) and adds every non-comment line
   * as an entry to a {@link Set} (omitting leading and trailing whitespace).
   * Every line of the file should contain only one word. The words need to be
   * in lower-case if you make use of an Analyzer which uses LowerCaseFilter
   * (like StandardAnalyzer).
   *
   * @param aClass
   *          a class that is associated with the given stopwordResource
   * @param stopwordResource
   *          name of the resource file associated with the given class
   * @param comment
   *          the comment string to ignore; see
   *          {@link #getWordSet(Reader, String)} for how it is matched
   * @return a {@link Set} with the file's words
   * @throws IOException
   *           if the resource cannot be found or cannot be read
   */
  public static Set<String> getWordSet(Class<?> aClass,
      String stopwordResource, String comment) throws IOException {
    return getWordSet(openResource(aClass, stopwordResource), comment);
  }

  /**
   * Loads a UTF-8 encoded text file and adds every line as an entry to a
   * HashSet (omitting leading and trailing whitespace). Every line of the file
   * should contain only one word. The words need to be in lowercase if you make
   * use of an Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
   *
   * @param wordfile File containing the wordlist
   * @return A HashSet with the file's words
   * @throws IOException if the file cannot be opened or read
   */
  public static HashSet<String> getWordSet(File wordfile) throws IOException {
    return getWordSet(openFile(wordfile));
  }

  /**
   * Loads a UTF-8 encoded text file and adds every non-comment line as an entry
   * to a HashSet (omitting leading and trailing whitespace). Every line of the
   * file should contain only one word. The words need to be in lowercase if you
   * make use of an Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
   *
   * @param wordfile File containing the wordlist
   * @param comment The comment string to ignore; see
   *        {@link #getWordSet(Reader, String)} for how it is matched
   * @return A HashSet with the file's words
   * @throws IOException if the file cannot be opened or read
   */
  public static HashSet<String> getWordSet(File wordfile, String comment) throws IOException {
    return getWordSet(openFile(wordfile), comment);
  }

  /**
   * Reads lines from a Reader and adds every line as an entry to a HashSet
   * (omitting leading and trailing whitespace). Every line of the Reader should
   * contain only one word; a blank line contributes the empty string. The words
   * need to be in lowercase if you make use of an Analyzer which uses
   * LowerCaseFilter (like StandardAnalyzer).
   * <p>
   * The reader is closed before this method returns.
   *
   * @param reader Reader containing the wordlist
   * @return A HashSet with the reader's words
   * @throws IOException if reading fails
   */
  public static HashSet<String> getWordSet(Reader reader) throws IOException {
    return readWords(reader, null);
  }

  /**
   * Reads lines from a Reader and adds every non-comment line as an entry to a
   * HashSet (omitting leading and trailing whitespace). Every line of the Reader
   * should contain only one word. The words need to be in lowercase if you make
   * use of an Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
   * <p>
   * A line is a comment when it starts with <code>comment</code>. The check is
   * made on the raw line, before whitespace is trimmed, so a comment marker
   * preceded by whitespace is not recognized. The reader is closed before this
   * method returns.
   *
   * @param reader Reader containing the wordlist
   * @param comment The string representing a comment; <code>null</code>
   *        disables comment filtering
   * @return A HashSet with the reader's words
   * @throws IOException if reading fails
   */
  public static HashSet<String> getWordSet(Reader reader, String comment) throws IOException {
    return readWords(reader, comment);
  }

  /**
   * Loads a text file in Snowball format associated with a given class (See
   * {@link Class#getResourceAsStream(String)}) and adds all words as entries to
   * a {@link Set}. The words need to be in lower-case if you make use of an
   * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
   *
   * @param aClass a class that is associated with the given stopwordResource
   * @param stopwordResource name of the resource file associated with the given
   *        class
   * @return a {@link Set} with the file's words
   * @throws IOException if the resource cannot be found or cannot be read
   * @see #getSnowballWordSet(Reader)
   */
  public static Set<String> getSnowballWordSet(Class<?> aClass,
      String stopwordResource) throws IOException {
    return getSnowballWordSet(openResource(aClass, stopwordResource));
  }

  /**
   * Reads stopwords from a stopword list in Snowball format.
   * <p>
   * The snowball format is the following:
   * </p>
   * <ul>
   * <li>Lines may contain multiple words separated by whitespace.</li>
   * <li>The comment character is the vertical line (|).</li>
   * <li>Lines may contain trailing comments.</li>
   * </ul>
   * <p>
   * The reader is closed before this method returns.
   * </p>
   *
   * @param reader Reader containing a Snowball stopword list
   * @return A Set with the reader's words
   * @throws IOException if reading fails
   */
  public static Set<String> getSnowballWordSet(Reader reader)
      throws IOException {
    final BufferedReader br = buffer(reader);
    try {
      final Set<String> result = new HashSet<String>();
      String line;
      while ((line = br.readLine()) != null) {
        // Everything from the first '|' to the end of the line is a comment.
        final int commentStart = line.indexOf('|');
        final String content = commentStart >= 0 ? line.substring(0, commentStart) : line;
        final String[] words = content.split("\\s+");
        for (int i = 0; i < words.length; i++) {
          // split() yields an empty first token for lines with leading whitespace.
          if (words[i].length() > 0) {
            result.add(words[i]);
          }
        }
      }
      return result;
    } finally {
      br.close();
    }
  }

  /**
   * Reads a stem dictionary from a UTF-8 encoded file. Each line contains
   * <pre>word<b>\t</b>stem</pre>
   * (i.e. two tab separated words). Only the first tab on a line separates word
   * from stem. Blank lines are skipped.
   *
   * @param wordstemfile File containing the stem dictionary, must not be
   *        <code>null</code>
   * @return stem dictionary that overrules the stemming algorithm
   * @throws IOException if the file cannot be opened or read, or if a
   *         non-blank line contains no tab
   * @throws NullPointerException if <code>wordstemfile</code> is
   *         <code>null</code>
   */
  public static HashMap<String, String> getStemDict(File wordstemfile) throws IOException {
    if (wordstemfile == null) {
      throw new NullPointerException("wordstemfile may not be null");
    }
    final HashMap<String, String> result = new HashMap<String, String>();
    final BufferedReader br = openFile(wordstemfile);
    try {
      String line;
      while ((line = br.readLine()) != null) {
        final int tab = line.indexOf('\t');
        if (tab < 0) {
          if (line.trim().length() == 0) {
            continue; // tolerate blank lines, e.g. a trailing empty line
          }
          throw new IOException("Malformed stem dictionary line in " + wordstemfile
              + " (expected word<TAB>stem): " + line);
        }
        result.put(line.substring(0, tab), line.substring(tab + 1));
      }
    } finally {
      br.close();
    }
    return result;
  }

  /**
   * Opens a class resource as a buffered UTF-8 reader, failing with an
   * IOException instead of a NullPointerException when the resource is missing.
   */
  private static BufferedReader openResource(Class<?> aClass, String resource)
      throws IOException {
    final InputStream stream = aClass.getResourceAsStream(resource);
    if (stream == null) {
      throw new IOException("Resource not found: " + resource
          + " (resolved against class " + aClass.getName() + ")");
    }
    return new BufferedReader(new InputStreamReader(stream, ENCODING));
  }

  /** Opens a file as a buffered UTF-8 reader. */
  private static BufferedReader openFile(File file) throws IOException {
    return new BufferedReader(new InputStreamReader(new FileInputStream(file), ENCODING));
  }

  /** Returns the reader itself if it is already buffered, otherwise a buffering wrapper. */
  private static BufferedReader buffer(Reader reader) {
    return reader instanceof BufferedReader
        ? (BufferedReader) reader
        : new BufferedReader(reader);
  }

  /**
   * Collects the trimmed lines of the reader, skipping lines that start with
   * the comment string when one is given, and closes the reader.
   */
  private static HashSet<String> readWords(Reader reader, String comment) throws IOException {
    final BufferedReader br = buffer(reader);
    try {
      final HashSet<String> result = new HashSet<String>();
      String line;
      while ((line = br.readLine()) != null) {
        if (comment == null || !line.startsWith(comment)) {
          result.add(line.trim());
        }
      }
      return result;
    } finally {
      br.close();
    }
  }
}