/*
 * Licensed to ElasticSearch and Shay Banon under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. ElasticSearch licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis;

import com.google.common.base.Charsets;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterators;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.ar.ArabicAnalyzer;
import org.apache.lucene.analysis.bg.BulgarianAnalyzer;
import org.apache.lucene.analysis.br.BrazilianAnalyzer;
import org.apache.lucene.analysis.ca.CatalanAnalyzer;
import org.apache.lucene.analysis.cz.CzechAnalyzer;
import org.apache.lucene.analysis.da.DanishAnalyzer;
import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.analysis.el.GreekAnalyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.es.SpanishAnalyzer;
import org.apache.lucene.analysis.eu.BasqueAnalyzer;
import org.apache.lucene.analysis.fa.PersianAnalyzer;
import org.apache.lucene.analysis.fi.FinnishAnalyzer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.gl.GalicianAnalyzer;
import org.apache.lucene.analysis.hi.HindiAnalyzer;
import org.apache.lucene.analysis.hu.HungarianAnalyzer;
import org.apache.lucene.analysis.hy.ArmenianAnalyzer;
import org.apache.lucene.analysis.id.IndonesianAnalyzer;
import org.apache.lucene.analysis.it.ItalianAnalyzer;
import org.apache.lucene.analysis.nl.DutchAnalyzer;
import org.apache.lucene.analysis.no.NorwegianAnalyzer;
import org.apache.lucene.analysis.pt.PortugueseAnalyzer;
import org.apache.lucene.analysis.ro.RomanianAnalyzer;
import org.apache.lucene.analysis.ru.RussianAnalyzer;
import org.apache.lucene.analysis.sv.SwedishAnalyzer;
import org.apache.lucene.analysis.tr.TurkishAnalyzer;
import org.apache.lucene.util.Version;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.collect.MapBuilder;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.ClientEnvironment;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Set;

/**
 * Static helpers for parsing analysis-related {@link Settings}: stop words, stem
 * exclusions, elision articles, and word lists loaded inline or from config files.
 */
public class Analysis {

    /**
     * Returns {@code true} if the {@code stopwords} setting is explicitly set to
     * the sentinel value {@code _none_} (i.e. the user asked for no stop words).
     */
    public static boolean isNoStopwords(Settings settings) {
        // String.equals(null) is false, so the explicit null check is implied.
        return "_none_".equals(settings.get("stopwords"));
    }

    /**
     * Parses the {@code stem_exclusion} setting.
     *
     * @return an empty set for the {@code _none_} sentinel, the configured values
     *         (comma-delimited string or indexed array form), or
     *         {@code defaultStemExclusion} when the setting is absent.
     */
    public static Set<?> parseStemExclusion(Settings settings, Set<?> defaultStemExclusion) {
        String value = settings.get("stem_exclusion");
        if (value != null) {
            if ("_none_".equals(value)) {
                return ImmutableSet.of();
            }
            return ImmutableSet.copyOf(Strings.commaDelimitedListToSet(value));
        }
        // getAsArray also picks up the indexed form (stem_exclusion.0, .1, ...)
        // which a plain get() of the key would miss.
        String[] stemExclusion = settings.getAsArray("stem_exclusion", null);
        if (stemExclusion != null) {
            return ImmutableSet.copyOf(Iterators.forArray(stemExclusion));
        }
        return defaultStemExclusion;
    }

    /**
     * Maps {@code _lang_} sentinel names (e.g. {@code _english_}) to the
     * corresponding Lucene analyzer's default stop-word set.
     */
    public static final ImmutableMap<String, Set<?>> namedStopWords = MapBuilder.<String, Set<?>>newMapBuilder()
            .put("_arabic_", ArabicAnalyzer.getDefaultStopSet())
            .put("_armenian_", ArmenianAnalyzer.getDefaultStopSet())
            .put("_basque_", BasqueAnalyzer.getDefaultStopSet())
            .put("_brazilian_", BrazilianAnalyzer.getDefaultStopSet())
            .put("_bulgarian_", BulgarianAnalyzer.getDefaultStopSet())
            .put("_catalan_", CatalanAnalyzer.getDefaultStopSet())
            .put("_czech_", CzechAnalyzer.getDefaultStopSet())
            .put("_danish_", DanishAnalyzer.getDefaultStopSet())
            .put("_dutch_", DutchAnalyzer.getDefaultStopSet())
            .put("_english_", EnglishAnalyzer.getDefaultStopSet())
            .put("_finnish_", FinnishAnalyzer.getDefaultStopSet())
            .put("_french_", FrenchAnalyzer.getDefaultStopSet())
            .put("_galician_", GalicianAnalyzer.getDefaultStopSet())
            .put("_german_", GermanAnalyzer.getDefaultStopSet())
            .put("_greek_", GreekAnalyzer.getDefaultStopSet())
            .put("_hindi_", HindiAnalyzer.getDefaultStopSet())
            .put("_hungarian_", HungarianAnalyzer.getDefaultStopSet())
            .put("_indonesian_", IndonesianAnalyzer.getDefaultStopSet())
            .put("_italian_", ItalianAnalyzer.getDefaultStopSet())
            .put("_norwegian_", NorwegianAnalyzer.getDefaultStopSet())
            .put("_persian_", PersianAnalyzer.getDefaultStopSet())
            .put("_portuguese_", PortugueseAnalyzer.getDefaultStopSet())
            .put("_romanian_", RomanianAnalyzer.getDefaultStopSet())
            .put("_russian_", RussianAnalyzer.getDefaultStopSet())
            .put("_spanish_", SpanishAnalyzer.getDefaultStopSet())
            .put("_swedish_", SwedishAnalyzer.getDefaultStopSet())
            .put("_turkish_", TurkishAnalyzer.getDefaultStopSet())
            .immutableMap();

    /**
     * Parses the {@code articles} setting (elision articles).
     *
     * @return an empty set for {@code _none_}, the configured values (inline,
     *         array form, or loaded from {@code articles_path}), or {@code null}
     *         when nothing is configured. Case sensitivity is controlled by
     *         {@code articles_case} (default {@code false}).
     */
    public static Set<?> parseArticles(ClientEnvironment env, Settings settings, Version version) {
        String value = settings.get("articles");
        if (value != null) {
            if ("_none_".equals(value)) {
                return CharArraySet.EMPTY_SET;
            }
            return new CharArraySet(version, Strings.commaDelimitedListToSet(value),
                    settings.getAsBoolean("articles_case", false));
        }
        String[] articles = settings.getAsArray("articles", null);
        if (articles != null) {
            return new CharArraySet(version, Arrays.asList(articles),
                    settings.getAsBoolean("articles_case", false));
        }
        // getWordSet consults "articles_path"; returns null when that is absent too.
        return getWordSet(env, settings, "articles", version);
    }

    /**
     * Parses the {@code stopwords} setting. Entries matching a {@code _lang_}
     * sentinel (see {@link #namedStopWords}) expand to the full language set;
     * everything else is taken literally.
     *
     * @return an empty set for {@code _none_}, the resolved configured words
     *         (inline, array form, or loaded from {@code stopwords_path}), or
     *         {@code defaultStopWords} when nothing is configured.
     */
    public static Set<?> parseStopWords(ClientEnvironment env, Settings settings,
                                        Set<?> defaultStopWords, Version version) {
        String value = settings.get("stopwords");
        if (value != null) {
            if ("_none_".equals(value)) {
                return CharArraySet.EMPTY_SET;
            }
            return new CharArraySet(version, Strings.commaDelimitedListToSet(value),
                    settings.getAsBoolean("stopwords_case", false));
        }
        boolean ignoreCase = settings.getAsBoolean("stopwords_case", false);
        String[] stopWords = settings.getAsArray("stopwords", null);
        if (stopWords != null) {
            return resolveNamedStopWords(Arrays.asList(stopWords), version, ignoreCase);
        }
        List<String> pathLoadedStopWords = getWordList(env, settings, "stopwords");
        if (pathLoadedStopWords != null) {
            return resolveNamedStopWords(pathLoadedStopWords, version, ignoreCase);
        }
        return defaultStopWords;
    }

    /**
     * Builds a {@link CharArraySet} from {@code words}, expanding any entry that
     * names a predefined set in {@link #namedStopWords}.
     */
    private static CharArraySet resolveNamedStopWords(Collection<String> words, Version version, boolean ignoreCase) {
        CharArraySet setStopWords = new CharArraySet(version, words.size(), ignoreCase);
        for (String stopWord : words) {
            if (namedStopWords.containsKey(stopWord)) {
                setStopWords.addAll(namedStopWords.get(stopWord));
            } else {
                setStopWords.add(stopWord);
            }
        }
        return setStopWords;
    }

    /**
     * Loads the word list for {@code settingsPrefix} (see
     * {@link #getWordList(ClientEnvironment, Settings, String)}) as a
     * {@link CharArraySet}, honoring the {@code <prefix>_case} setting
     * (default {@code false}).
     *
     * @return the set, or {@code null} when the setting is absent
     */
    public static CharArraySet getWordSet(ClientEnvironment env, Settings settings,
                                          String settingsPrefix, Version version) {
        List<String> wordList = getWordList(env, settings, settingsPrefix);
        if (wordList == null) {
            return null;
        }
        return new CharArraySet(version, wordList, settings.getAsBoolean(settingsPrefix + "_case", false));
    }

    /**
     * Fetches a list of words from the specified settings file. The list should either be available at the key
     * specified by settingsPrefix or in a file specified by settingsPrefix + _path.
     *
     * @return the configured words, or {@code null} when neither key is set
     * @throws ElasticSearchIllegalArgumentException if the file at {@code <prefix>_path} cannot be read
     */
    public static List<String> getWordList(ClientEnvironment env, Settings settings, String settingPrefix) {
        String wordListPath = settings.get(settingPrefix + "_path", null);
        if (wordListPath == null) {
            String[] explicitWordList = settings.getAsArray(settingPrefix, null);
            if (explicitWordList == null) {
                return null;
            }
            return Arrays.asList(explicitWordList);
        }
        URL wordListFile = env.resolveConfig(wordListPath);
        try {
            return loadWordList(new InputStreamReader(wordListFile.openStream(), Charsets.UTF_8), "#");
        } catch (IOException ioe) {
            String message = String.format("IOException while reading %s_path: %s", settingPrefix, ioe.getMessage());
            // keep the cause so the original stack trace is not lost
            throw new ElasticSearchIllegalArgumentException(message, ioe);
        }
    }

    /**
     * Reads one word per line from {@code reader}, skipping blank lines and lines
     * starting with {@code comment}. Each accepted line is trimmed. The reader is
     * always closed, even on failure.
     */
    public static List<String> loadWordList(Reader reader, String comment) throws IOException {
        final List<String> result = new ArrayList<String>();
        BufferedReader br = null;
        try {
            if (reader instanceof BufferedReader) {
                br = (BufferedReader) reader;
            } else {
                br = new BufferedReader(reader);
            }
            String word;
            while ((word = br.readLine()) != null) {
                if (!Strings.hasText(word)) {
                    continue;
                }
                // NOTE: the comment prefix is matched before trimming, so an
                // indented comment marker is NOT treated as a comment.
                if (!word.startsWith(comment)) {
                    result.add(word.trim());
                }
            }
        } finally {
            if (br != null) {
                br.close();
            }
        }
        return result;
    }

    /**
     * Opens a UTF-8 reader over the config file named by the {@code settingPrefix}
     * setting.
     *
     * @return {@code null} if no settings are set for {@code settingPrefix}
     * @throws ElasticSearchIllegalArgumentException if the Reader cannot be instantiated
     */
    public static Reader getReaderFromFile(ClientEnvironment env, Settings settings, String settingPrefix) {
        String filePath = settings.get(settingPrefix, null);
        if (filePath == null) {
            return null;
        }
        URL fileUrl = env.resolveConfig(filePath);
        try {
            return new InputStreamReader(fileUrl.openStream(), Charsets.UTF_8);
        } catch (IOException ioe) {
            // message previously said "%s_path", but the setting read here is the
            // prefix itself, not "<prefix>_path"
            String message = String.format("IOException while reading %s: %s", settingPrefix, ioe.getMessage());
            throw new ElasticSearchIllegalArgumentException(message, ioe);
        }
    }
}