/*
 * Licensed to CRATE Technology GmbH ("Crate") under one or more contributor
 * license agreements. See the NOTICE file distributed with this work for
 * additional information regarding copyright ownership. Crate licenses
 * this file to you under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * However, if you have executed another commercial license agreement
 * with Crate these terms will supersede the license and you may use the
 * software solely pursuant to the terms of the relevant commercial agreement.
 */

package io.crate.metadata;

import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableSet;
import io.crate.exceptions.AnalyzerInvalidException;
import io.crate.exceptions.AnalyzerUnknownException;
import io.crate.metadata.settings.AnalyzerSettings;
import org.apache.logging.log4j.Logger;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.settings.loader.JsonSettingsLoader;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.index.analysis.AnalysisRegistry;

import java.io.IOException;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

/**
 * Service to get builtin and custom analyzers, tokenizers, token_filters, char_filters
 */
public class FulltextAnalyzerResolver {

    private final ClusterService clusterService;
    private final AnalysisRegistry analysisRegistry;

    // redefined list of extended analyzers not available outside of
    // a concrete index (see AnalyzerModule.ExtendedProcessor)
    // stripped Prebuilt<Thingy> (e.g. PreBuiltTokenFilters)
    private static final ImmutableSet<String> EXTENDED_BUILTIN_TOKEN_FILTERS = ImmutableSet.of(
        "limit", "delimited_payload_filter", "synonym", "keep", "pattern_capture",
        "pattern_replace", "dictionary_decompounder", "hyphenation_decompounder",
        "keyword_marker", "stemmer_override", "hunspell", "cjk_bigram", "cjk_width");

    private static final ImmutableSet<String> EXTENDED_BUILTIN_CHAR_FILTERS =
        ImmutableSet.of("mapping", "pattern_replace");

    // used for saving the creation statement
    public static final String SQL_STATEMENT_KEY = "_sql_stmt";

    private static final Logger logger = Loggers.getLogger(FulltextAnalyzerResolver.class);
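
    // How custom definitions are persisted (summary, for orientation): CREATE ANALYZER
    // stores each custom analyzer, tokenizer, token-filter and char-filter as a
    // JSON-encoded settings string in the cluster's persistent settings, under a key of
    // the form <AnalyzerSettings.CUSTOM_ANALYSIS_SETTINGS_PREFIX><type>.<name>
    // (see getCustomThingy() and hasCustomThingy() below). A sibling entry suffixed with
    // "." + SQL_STATEMENT_KEY keeps the original SQL statement. Concrete names used in
    // the comments of this class (e.g. "myanalyzer") are illustrative only.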

    public enum CustomType {
        ANALYZER("analyzer"),
        TOKENIZER("tokenizer"),
        TOKEN_FILTER("filter"),
        CHAR_FILTER("char_filter");

        private String name;

        CustomType(String name) {
            this.name = name;
        }

        public String getName() {
            return this.name;
        }
    }

    @Inject
    public FulltextAnalyzerResolver(ClusterService clusterService,
                                    AnalysisRegistry analysisRegistry) {
        this.clusterService = clusterService;
        this.analysisRegistry = analysisRegistry;
    }

    public boolean hasAnalyzer(String name) {
        return hasBuiltInAnalyzer(name) || hasCustomAnalyzer(name);
    }

    public boolean hasBuiltInAnalyzer(String name) {
        try {
            return analysisRegistry.getAnalyzer(name) != null;
        } catch (IOException e) {
            return false;
        }
    }

    /**
     * get all the builtin Analyzers defined in Crate
     *
     * @return a Set of the names of all builtin analyzers
     */
    public Set<String> getBuiltInAnalyzers() {
        return new ImmutableSet.Builder<String>()
            .addAll(analysisRegistry.getAnalyzers().keySet()).build();
    }

    /**
     * get the custom analyzer created by the CREATE ANALYZER command.
     * This does not include definitions for custom tokenizers, token-filters or char-filters
     *
     * @param name the name of the analyzer
     * @return Settings defining a custom Analyzer
     */
    public Settings getCustomAnalyzer(String name) {
        return getCustomThingy(name, CustomType.ANALYZER);
    }
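
    /**
     * get the settings of all custom analyzers created by the CREATE ANALYZER command.
     * Entries that merely store the original SQL statement
     * (keys ending in "." + {@link #SQL_STATEMENT_KEY}) are skipped.
     *
     * @return a map of custom analyzer names to their decoded settings
     * @throws IOException if stored settings cannot be decoded
     */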
    public Map<String, Settings> getCustomAnalyzers() throws IOException {
        Map<String, Settings> result = new HashMap<>();
        for (Map.Entry<String, String> entry :
            getCustomThingies(CustomType.ANALYZER).getAsMap().entrySet()) {
            if (!entry.getKey().endsWith("." + SQL_STATEMENT_KEY)) {
                result.put(entry.getKey(), decodeSettings(entry.getValue()));
            }
        }
        return result;
    }

    public boolean hasCustomAnalyzer(String name) {
        return hasCustomThingy(name, CustomType.ANALYZER);
    }

    public boolean hasBuiltInTokenizer(String name) {
        return analysisRegistry.getTokenizerProvider(name) != null;
    }

    public Set<String> getBuiltInTokenizers() {
        return new ImmutableSet.Builder<String>()
            .addAll(analysisRegistry.getTokenizers().keySet())
            .build();
    }

    public Map<String, Settings> getCustomTokenizers() throws IOException {
        Map<String, Settings> result = new HashMap<>();
        for (Map.Entry<String, String> entry :
            getCustomThingies(CustomType.TOKENIZER).getAsMap().entrySet()) {
            result.put(entry.getKey(), decodeSettings(entry.getValue()));
        }
        return result;
    }

    public boolean hasBuiltInCharFilter(String name) {
        return EXTENDED_BUILTIN_CHAR_FILTERS.contains(name) ||
               analysisRegistry.getCharFilterProvider(name) != null;
    }

    public Set<String> getBuiltInCharFilters() {
        return new ImmutableSet.Builder<String>()
            .addAll(EXTENDED_BUILTIN_CHAR_FILTERS)
            .addAll(analysisRegistry.getCharFilters().keySet())
            .build();
    }

    public Map<String, Settings> getCustomCharFilters() throws IOException {
        Map<String, Settings> result = new HashMap<>();
        for (Map.Entry<String, String> entry :
            getCustomThingies(CustomType.CHAR_FILTER).getAsMap().entrySet()) {
            result.put(entry.getKey(), decodeSettings(entry.getValue()));
        }
        return result;
    }

    public boolean hasBuiltInTokenFilter(String name) {
        return EXTENDED_BUILTIN_TOKEN_FILTERS.contains(name) ||
               analysisRegistry.getTokenFilterProvider(name) != null;
    }

    public Set<String> getBuiltInTokenFilters() {
        return new ImmutableSet.Builder<String>()
            .addAll(EXTENDED_BUILTIN_TOKEN_FILTERS)
            .addAll(analysisRegistry.getTokenFilters().keySet())
            .build();
    }

    public Map<String, Settings> getCustomTokenFilters() throws IOException {
        Map<String, Settings> result = new HashMap<>();
        for (Map.Entry<String, String> entry :
            getCustomThingies(CustomType.TOKEN_FILTER).getAsMap().entrySet()) {
            result.put(entry.getKey(), decodeSettings(entry.getValue()));
        }
        return result;
    }
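
    /**
     * serialize the given settings into a flat JSON object so they can be stored as a
     * single value inside the cluster's persistent settings; the inverse operation is
     * {@link #decodeSettings(String)}.
     *
     * A minimal sketch of the round trip (illustrative only; the key and the name
     * "myanalyzer" are made up, but follow the key pattern used by
     * {@link #resolveFullCustomAnalyzerSettings(String)}):
     * <pre>{@code
     * Settings original = Settings.builder()
     *     .put("index.analysis.analyzer.myanalyzer.tokenizer", "standard")
     *     .build();
     * BytesReference encoded = FulltextAnalyzerResolver.encodeSettings(original);
     * // the encoded bytes are a UTF-8 JSON object, so they can be read back as a string
     * Settings decoded = FulltextAnalyzerResolver.decodeSettings(encoded.utf8ToString());
     * // decoded now contains the same flat key/value pairs as original
     * }</pre>
     */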
    public static BytesReference encodeSettings(Settings settings) {
        try {
            BytesStreamOutput bso = new BytesStreamOutput();
            XContentBuilder builder = XContentFactory.jsonBuilder(bso);
            builder.startObject();
            for (Map.Entry<String, String> entry : settings.getAsMap().entrySet()) {
                builder.field(entry.getKey(), entry.getValue());
            }
            builder.endObject();
            builder.flush();
            return bso.bytes();
        } catch (IOException e) {
            // this is a memory stream so no real I/O happens and an IOException can't really happen at runtime
            throw Throwables.propagate(e);
        }
    }

    public static Settings decodeSettings(String encodedSettings) throws IOException {
        Map<String, String> loaded = new JsonSettingsLoader(false).load(encodedSettings);
        return Settings.builder().put(loaded).build();
    }

    /**
     * used to get custom analyzers, tokenizers, token-filters or char-filters with name ``name``
     * from crate-cluster-settings
     *
     * @param name the name of the custom analyzer, tokenizer, token-filter or char-filter
     * @param type the type of the custom element
     * @return a full settings instance for the thingy with given name and type or null if it does not exist
     */
    private Settings getCustomThingy(String name, CustomType type) {
        if (name == null) {
            return null;
        }
        String encodedSettings = clusterService.state().metaData().persistentSettings().get(
            String.format(Locale.ENGLISH, "%s%s.%s",
                AnalyzerSettings.CUSTOM_ANALYSIS_SETTINGS_PREFIX, type.getName(), name)
        );
        Settings decoded = null;
        if (encodedSettings != null) {
            try {
                decoded = decodeSettings(encodedSettings);
            } catch (IOException e) {
                // pass the exception last so log4j2 treats it as the cause instead of a placeholder value
                logger.warn("Could not decode settings for {} '{}'.", type.getName(), name, e);
            }
        }
        return decoded;
    }

    private Settings getCustomThingies(CustomType type) {
        Map<String, Settings> settingsMap = clusterService.state().metaData().persistentSettings()
            .getGroups(AnalyzerSettings.CUSTOM_ANALYSIS_SETTINGS_PREFIX);
        Settings result = settingsMap.get(type.getName());
        return result != null ? result : Settings.EMPTY;
    }

    /**
     * used to check if custom analyzer, tokenizer, token-filter or char-filter with name ``name`` exists
     *
     * @param name the name of the custom analyzer, tokenizer, token-filter or char-filter
     * @param type the type of the custom element
     * @return true if exists, false otherwise
     */
    private boolean hasCustomThingy(String name, CustomType type) {
        return clusterService.state().metaData().persistentSettings().getAsMap().containsKey(
            String.format(Locale.ROOT, "%s%s.%s",
                AnalyzerSettings.CUSTOM_ANALYSIS_SETTINGS_PREFIX, type.getName(), name));
    }
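
    // Rough illustration (the names "myanalyzer" and "mytok" are made up): resolving the
    // custom analyzer "myanalyzer", which uses the custom tokenizer "mytok", merges the
    // stored analyzer settings (e.g. "index.analysis.analyzer.myanalyzer.tokenizer" = "mytok",
    // plus any "index.analysis.analyzer.myanalyzer.filter" / "...char_filter" entries) with
    // the stored settings of each referenced custom tokenizer, token-filter and char-filter,
    // so that the returned Settings can be applied to an index as-is.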
    /**
     * resolve the full settings necessary for the custom analyzer with name ``name``
     * to be included in index-settings to get applied on an index.
     * <p>
     * Resolves all custom tokenizer, token-filter and char-filter settings and includes them
     *
     * @param name the name of the analyzer to resolve
     * @return Settings ready for inclusion into a CreateIndexRequest
     * @throws AnalyzerUnknownException if no custom analyzer with name ``name`` could be found
     * @throws AnalyzerInvalidException if a referenced tokenizer, token-filter or char-filter cannot be resolved
     */
    public Settings resolveFullCustomAnalyzerSettings(String name) throws AnalyzerInvalidException {
        Settings.Builder builder = Settings.builder();
        Settings analyzerSettings = getCustomAnalyzer(name);
        if (analyzerSettings != null) {
            builder.put(analyzerSettings);

            String tokenizerName = analyzerSettings.get(
                String.format(Locale.ENGLISH, "index.analysis.analyzer.%s.tokenizer", name));
            if (tokenizerName != null) {
                Settings customTokenizerSettings = getCustomTokenizer(tokenizerName);
                if (customTokenizerSettings != null) {
                    builder.put(customTokenizerSettings);
                } else if (!hasBuiltInTokenizer(tokenizerName)) {
                    throw new AnalyzerInvalidException(String.format(Locale.ENGLISH,
                        "Invalid Analyzer: could not resolve tokenizer '%s'", tokenizerName));
                }
            }

            String[] tokenFilterNames = analyzerSettings.getAsArray(
                String.format(Locale.ENGLISH, "index.analysis.analyzer.%s.filter", name));
            for (int i = 0; i < tokenFilterNames.length; i++) {
                Settings customTokenFilterSettings = getCustomTokenFilter(tokenFilterNames[i]);
                if (customTokenFilterSettings != null) {
                    builder.put(customTokenFilterSettings);
                } else if (!hasBuiltInTokenFilter(tokenFilterNames[i])) {
                    throw new AnalyzerInvalidException(String.format(Locale.ENGLISH,
                        "Invalid Analyzer: could not resolve token-filter '%s'", tokenFilterNames[i]));
                }
            }

            String[] charFilterNames = analyzerSettings.getAsArray(
                String.format(Locale.ENGLISH, "index.analysis.analyzer.%s.char_filter", name));
            for (int i = 0; i < charFilterNames.length; i++) {
                Settings customCharFilterSettings = getCustomCharFilter(charFilterNames[i]);
                if (customCharFilterSettings != null) {
                    builder.put(customCharFilterSettings);
                } else if (!hasBuiltInCharFilter(charFilterNames[i])) {
                    throw new AnalyzerInvalidException(String.format(Locale.ENGLISH,
                        "Invalid Analyzer: could not resolve char-filter '%s'", charFilterNames[i]));
                }
            }
        } else {
            throw new AnalyzerUnknownException(name);
        }
        return builder.build();
    }

    public Settings getCustomTokenizer(String name) {
        return getCustomThingy(name, CustomType.TOKENIZER);
    }

    public Settings getCustomTokenFilter(String name) {
        return getCustomThingy(name, CustomType.TOKEN_FILTER);
    }

    public Settings getCustomCharFilter(String name) {
        return getCustomThingy(name, CustomType.CHAR_FILTER);
    }
}