/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import com.google.common.base.Charsets;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterators;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.ar.ArabicAnalyzer;
import org.apache.lucene.analysis.bg.BulgarianAnalyzer;
import org.apache.lucene.analysis.br.BrazilianAnalyzer;
import org.apache.lucene.analysis.ca.CatalanAnalyzer;
import org.apache.lucene.analysis.cz.CzechAnalyzer;
import org.apache.lucene.analysis.da.DanishAnalyzer;
import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.analysis.el.GreekAnalyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.es.SpanishAnalyzer;
import org.apache.lucene.analysis.eu.BasqueAnalyzer;
import org.apache.lucene.analysis.fa.PersianAnalyzer;
import org.apache.lucene.analysis.fi.FinnishAnalyzer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.gl.GalicianAnalyzer;
import org.apache.lucene.analysis.hi.HindiAnalyzer;
import org.apache.lucene.analysis.hu.HungarianAnalyzer;
import org.apache.lucene.analysis.hy.ArmenianAnalyzer;
import org.apache.lucene.analysis.id.IndonesianAnalyzer;
import org.apache.lucene.analysis.it.ItalianAnalyzer;
import org.apache.lucene.analysis.nl.DutchAnalyzer;
import org.apache.lucene.analysis.no.NorwegianAnalyzer;
import org.apache.lucene.analysis.pt.PortugueseAnalyzer;
import org.apache.lucene.analysis.ro.RomanianAnalyzer;
import org.apache.lucene.analysis.ru.RussianAnalyzer;
import org.apache.lucene.analysis.sv.SwedishAnalyzer;
import org.apache.lucene.analysis.tr.TurkishAnalyzer;
import org.apache.lucene.util.Version;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.collect.MapBuilder;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.ClientEnvironment;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
/**
*
*/
public class Analysis {
public static boolean isNoStopwords(Settings settings) {
String value = settings.get("stopwords");
return value != null && "_none_".equals(value);
}
public static Set<?> parseStemExclusion(Settings settings, Set<?> defaultStemExclusion) {
String value = settings.get("stem_exclusion");
if (value != null) {
if ("_none_".equals(value)) {
return ImmutableSet.of();
} else {
return ImmutableSet.copyOf(Strings.commaDelimitedListToSet(value));
}
}
String[] stopWords = settings.getAsArray("stem_exclusion", null);
if (stopWords != null) {
return ImmutableSet.copyOf(Iterators.forArray(stopWords));
} else {
return defaultStemExclusion;
}
}
public static final ImmutableMap<String, Set<?>> namedStopWords = MapBuilder.<String, Set<?>>newMapBuilder()
.put("_arabic_", ArabicAnalyzer.getDefaultStopSet())
.put("_armenian_", ArmenianAnalyzer.getDefaultStopSet())
.put("_basque_", BasqueAnalyzer.getDefaultStopSet())
.put("_brazilian_", BrazilianAnalyzer.getDefaultStopSet())
.put("_bulgarian_", BulgarianAnalyzer.getDefaultStopSet())
.put("_catalan_", CatalanAnalyzer.getDefaultStopSet())
.put("_czech_", CzechAnalyzer.getDefaultStopSet())
.put("_danish_", DanishAnalyzer.getDefaultStopSet())
.put("_dutch_", DutchAnalyzer.getDefaultStopSet())
.put("_english_", EnglishAnalyzer.getDefaultStopSet())
.put("_finnish_", FinnishAnalyzer.getDefaultStopSet())
.put("_french_", FrenchAnalyzer.getDefaultStopSet())
.put("_galician_", GalicianAnalyzer.getDefaultStopSet())
.put("_german_", GermanAnalyzer.getDefaultStopSet())
.put("_greek_", GreekAnalyzer.getDefaultStopSet())
.put("_hindi_", HindiAnalyzer.getDefaultStopSet())
.put("_hungarian_", HungarianAnalyzer.getDefaultStopSet())
.put("_indonesian_", IndonesianAnalyzer.getDefaultStopSet())
.put("_italian_", ItalianAnalyzer.getDefaultStopSet())
.put("_norwegian_", NorwegianAnalyzer.getDefaultStopSet())
.put("_persian_", PersianAnalyzer.getDefaultStopSet())
.put("_portuguese_", PortugueseAnalyzer.getDefaultStopSet())
.put("_romanian_", RomanianAnalyzer.getDefaultStopSet())
.put("_russian_", RussianAnalyzer.getDefaultStopSet())
.put("_spanish_", SpanishAnalyzer.getDefaultStopSet())
.put("_swedish_", SwedishAnalyzer.getDefaultStopSet())
.put("_turkish_", TurkishAnalyzer.getDefaultStopSet())
.immutableMap();
public static Set<?> parseArticles(ClientEnvironment env, Settings settings, Version version) {
String value = settings.get("articles");
if (value != null) {
if ("_none_".equals(value)) {
return CharArraySet.EMPTY_SET;
} else {
return new CharArraySet(version, Strings.commaDelimitedListToSet(value), settings.getAsBoolean("articles_case", false));
}
}
String[] articles = settings.getAsArray("articles", null);
if (articles != null) {
return new CharArraySet(version, Arrays.asList(articles), settings.getAsBoolean("articles_case", false));
}
CharArraySet pathLoadedArticles = getWordSet(env, settings, "articles", version);
if (pathLoadedArticles != null) {
return pathLoadedArticles;
}
return null;
}
public static Set<?> parseStopWords(ClientEnvironment env, Settings settings, Set<?> defaultStopWords, Version version) {
String value = settings.get("stopwords");
if (value != null) {
if ("_none_".equals(value)) {
return CharArraySet.EMPTY_SET;
} else {
return new CharArraySet(version, Strings.commaDelimitedListToSet(value), settings.getAsBoolean("stopwords_case", false));
}
}
String[] stopWords = settings.getAsArray("stopwords", null);
if (stopWords != null) {
CharArraySet setStopWords = new CharArraySet(version, stopWords.length, settings.getAsBoolean("stopwords_case", false));
for (String stopWord : stopWords) {
if (namedStopWords.containsKey(stopWord)) {
setStopWords.addAll(namedStopWords.get(stopWord));
} else {
setStopWords.add(stopWord);
}
}
return setStopWords;
}
List<String> pathLoadedStopWords = getWordList(env, settings, "stopwords");
if (pathLoadedStopWords != null) {
CharArraySet setStopWords = new CharArraySet(version, pathLoadedStopWords.size(), settings.getAsBoolean("stopwords_case", false));
for (String stopWord : pathLoadedStopWords) {
if (namedStopWords.containsKey(stopWord)) {
setStopWords.addAll(namedStopWords.get(stopWord));
} else {
setStopWords.add(stopWord);
}
}
return setStopWords;
}
return defaultStopWords;
}
public static CharArraySet getWordSet(ClientEnvironment env, Settings settings, String settingsPrefix, Version version) {
List<String> wordList = getWordList(env, settings, settingsPrefix);
if (wordList == null) {
return null;
}
return new CharArraySet(version, wordList, settings.getAsBoolean(settingsPrefix + "_case", false));
}
/**
* Fetches a list of words from the specified settings file. The list should either be available at the key
* specified by settingsPrefix or in a file specified by settingsPrefix + _path.
*
* @throws ElasticSearchIllegalArgumentException
* If the word list cannot be found at either key.
*/
public static List<String> getWordList(ClientEnvironment env, Settings settings, String settingPrefix) {
String wordListPath = settings.get(settingPrefix + "_path", null);
if (wordListPath == null) {
String[] explicitWordList = settings.getAsArray(settingPrefix, null);
if (explicitWordList == null) {
return null;
} else {
return Arrays.asList(explicitWordList);
}
}
URL wordListFile = env.resolveConfig(wordListPath);
try {
return loadWordList(new InputStreamReader(wordListFile.openStream(), Charsets.UTF_8), "#");
} catch (IOException ioe) {
String message = String.format("IOException while reading %s_path: %s", settingPrefix, ioe.getMessage());
throw new ElasticSearchIllegalArgumentException(message);
}
}
public static List<String> loadWordList(Reader reader, String comment) throws IOException {
final List<String> result = new ArrayList<String>();
BufferedReader br = null;
try {
if (reader instanceof BufferedReader) {
br = (BufferedReader) reader;
} else {
br = new BufferedReader(reader);
}
String word = null;
while ((word = br.readLine()) != null) {
if (!Strings.hasText(word)) {
continue;
}
if (!word.startsWith(comment)) {
result.add(word.trim());
}
}
} finally {
if (br != null)
br.close();
}
return result;
}
/**
* @return null If no settings set for "settingsPrefix" then return <code>null</code>.
* @throws ElasticSearchIllegalArgumentException
* If the Reader can not be instantiated.
*/
public static Reader getReaderFromFile(ClientEnvironment env, Settings settings, String settingPrefix) {
String filePath = settings.get(settingPrefix, null);
if (filePath == null) {
return null;
}
URL fileUrl = env.resolveConfig(filePath);
Reader reader = null;
try {
reader = new InputStreamReader(fileUrl.openStream(), Charsets.UTF_8);
} catch (IOException ioe) {
String message = String.format("IOException while reading %s_path: %s", settingPrefix, ioe.getMessage());
throw new ElasticSearchIllegalArgumentException(message);
}
return reader;
}
}