/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.action.admin.indices.analyze;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeReflector;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.Version;
import org.elasticsearch.action.support.ActionFilters;
import org.elasticsearch.action.support.single.shard.TransportSingleShardAction;
import org.elasticsearch.cluster.ClusterService;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.block.ClusterBlockException;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.cluster.metadata.IndexNameExpressionResolver;
import org.elasticsearch.cluster.routing.ShardsIterator;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.io.FastStringReader;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.IndexService;
import org.elasticsearch.index.analysis.*;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.internal.AllFieldMapper;
import org.elasticsearch.index.shard.ShardId;
import org.elasticsearch.indices.IndicesService;
import org.elasticsearch.indices.analysis.IndicesAnalysisService;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.transport.TransportService;

import java.io.IOException;
import java.io.Reader;
import java.util.*;

/**
 * Transport action used to execute analyze requests
 */
public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRequest, AnalyzeResponse> {

    private final IndicesService indicesService;
    private final IndicesAnalysisService indicesAnalysisService;

    private static final Settings DEFAULT_SETTINGS = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build();

    @Inject
    public TransportAnalyzeAction(Settings settings, ThreadPool threadPool, ClusterService clusterService, TransportService transportService,
                                  IndicesService indicesService, IndicesAnalysisService indicesAnalysisService, ActionFilters actionFilters,
                                  IndexNameExpressionResolver indexNameExpressionResolver) {
        super(settings, AnalyzeAction.NAME, threadPool, clusterService, transportService, actionFilters, indexNameExpressionResolver,
                AnalyzeRequest.class, ThreadPool.Names.INDEX);
        this.indicesService = indicesService;
        this.indicesAnalysisService = indicesAnalysisService;
    }

    @Override
    protected AnalyzeResponse newResponse() {
        return new AnalyzeResponse();
    }

    @Override
    protected boolean resolveIndex(AnalyzeRequest request) {
        return request.index() != null;
    }

    @Override
    protected ClusterBlockException checkRequestBlock(ClusterState state, InternalRequest request) {
        if (request.concreteIndex() != null) {
            return super.checkRequestBlock(state, request);
        }
        return null;
    }

    @Override
    protected ShardsIterator shards(ClusterState state, InternalRequest request) {
        if (request.concreteIndex() == null) {
            // just execute locally....
            return null;
        }
        return state.routingTable().index(request.concreteIndex()).randomAllActiveShardsIt();
    }

    @Override
    protected AnalyzeResponse shardOperation(AnalyzeRequest request, ShardId shardId) {
        IndexService indexService = null;
        if (shardId != null) {
            indexService = indicesService.indexServiceSafe(shardId.getIndex());
        }
        Analyzer analyzer = null;
        boolean closeAnalyzer = false;
        String field = null;
        if (request.field() != null) {
            if (indexService == null) {
                throw new IllegalArgumentException("No index provided, and trying to analyze based on a specific field which requires the index parameter");
            }
            MappedFieldType fieldType = indexService.mapperService().smartNameFieldType(request.field());
            if (fieldType != null) {
                if (fieldType.isNumeric()) {
                    throw new IllegalArgumentException("Can't process field [" + request.field() + "], Analysis requests are not supported on numeric fields");
                }
                analyzer = fieldType.indexAnalyzer();
                field = fieldType.names().indexName();
            }
        }
        if (field == null) {
            if (indexService != null) {
                field = indexService.queryParserService().defaultField();
            } else {
                field = AllFieldMapper.NAME;
            }
        }
        if (analyzer == null && request.analyzer() != null) {
            if (indexService == null) {
                analyzer = indicesAnalysisService.analyzer(request.analyzer());
            } else {
                analyzer = indexService.analysisService().analyzer(request.analyzer());
            }
            if (analyzer == null) {
                throw new IllegalArgumentException("failed to find analyzer [" + request.analyzer() + "]");
            }
        } else if (request.tokenizer() != null) {
            TokenizerFactory tokenizerFactory;
            if (indexService == null) {
                TokenizerFactoryFactory tokenizerFactoryFactory = indicesAnalysisService.tokenizerFactoryFactory(request.tokenizer());
                if (tokenizerFactoryFactory == null) {
                    throw new IllegalArgumentException("failed to find global tokenizer under [" + request.tokenizer() + "]");
                }
                tokenizerFactory = tokenizerFactoryFactory.create(request.tokenizer(), DEFAULT_SETTINGS);
            } else {
                tokenizerFactory = indexService.analysisService().tokenizer(request.tokenizer());
                if (tokenizerFactory == null) {
                    throw new IllegalArgumentException("failed to find tokenizer under [" + request.tokenizer() + "]");
                }
            }

            TokenFilterFactory[] tokenFilterFactories = new TokenFilterFactory[0];
            if (request.tokenFilters() != null && request.tokenFilters().length > 0) {
                tokenFilterFactories = new TokenFilterFactory[request.tokenFilters().length];
                for (int i = 0; i < request.tokenFilters().length; i++) {
                    String tokenFilterName = request.tokenFilters()[i];
                    if (indexService == null) {
                        TokenFilterFactoryFactory tokenFilterFactoryFactory = indicesAnalysisService.tokenFilterFactoryFactory(tokenFilterName);
                        if (tokenFilterFactoryFactory == null) {
                            throw new IllegalArgumentException("failed to find global token filter under [" + tokenFilterName + "]");
                        }
                        tokenFilterFactories[i] =
                                tokenFilterFactoryFactory.create(tokenFilterName, DEFAULT_SETTINGS);
                    } else {
                        tokenFilterFactories[i] = indexService.analysisService().tokenFilter(tokenFilterName);
                        if (tokenFilterFactories[i] == null) {
                            throw new IllegalArgumentException("failed to find token filter under [" + tokenFilterName + "]");
                        }
                    }
                    if (tokenFilterFactories[i] == null) {
                        throw new IllegalArgumentException("failed to find token filter under [" + tokenFilterName + "]");
                    }
                }
            }

            CharFilterFactory[] charFilterFactories = new CharFilterFactory[0];
            if (request.charFilters() != null && request.charFilters().length > 0) {
                charFilterFactories = new CharFilterFactory[request.charFilters().length];
                for (int i = 0; i < request.charFilters().length; i++) {
                    String charFilterName = request.charFilters()[i];
                    if (indexService == null) {
                        CharFilterFactoryFactory charFilterFactoryFactory = indicesAnalysisService.charFilterFactoryFactory(charFilterName);
                        if (charFilterFactoryFactory == null) {
                            throw new IllegalArgumentException("failed to find global char filter under [" + charFilterName + "]");
                        }
                        charFilterFactories[i] = charFilterFactoryFactory.create(charFilterName, DEFAULT_SETTINGS);
                    } else {
                        charFilterFactories[i] = indexService.analysisService().charFilter(charFilterName);
                        if (charFilterFactories[i] == null) {
                            throw new IllegalArgumentException("failed to find char filter under [" + charFilterName + "]");
                        }
                    }
                    if (charFilterFactories[i] == null) {
                        throw new IllegalArgumentException("failed to find char filter under [" + charFilterName + "]");
                    }
                }
            }

            analyzer = new CustomAnalyzer(tokenizerFactory, charFilterFactories, tokenFilterFactories);
            closeAnalyzer = true;
        } else if (analyzer == null) {
            if (indexService == null) {
                analyzer = indicesAnalysisService.analyzer("standard");
            } else {
                analyzer = indexService.analysisService().defaultIndexAnalyzer();
            }
        }
        if (analyzer == null) {
            throw new IllegalArgumentException("failed to find analyzer");
        }

        List<AnalyzeResponse.AnalyzeToken> tokens = null;
        DetailAnalyzeResponse detail = null;

        if (request.explain()) {
            detail = detailAnalyze(request, analyzer, field);
        } else {
            tokens = simpleAnalyze(request, analyzer, field);
        }

        if (closeAnalyzer) {
            analyzer.close();
        }

        return new AnalyzeResponse(tokens, detail);
    }

    private static List<AnalyzeResponse.AnalyzeToken> simpleAnalyze(AnalyzeRequest request, Analyzer analyzer, String field) {
        List<AnalyzeResponse.AnalyzeToken> tokens = new ArrayList<>();
        int lastPosition = -1;
        int lastOffset = 0;
        for (String text : request.text()) {
            try (TokenStream stream = analyzer.tokenStream(field, text)) {
                stream.reset();
                CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
                PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
                OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
                TypeAttribute type = stream.addAttribute(TypeAttribute.class);

                while (stream.incrementToken()) {
                    int increment = posIncr.getPositionIncrement();
                    if (increment > 0) {
                        lastPosition = lastPosition + increment;
                    }
                    tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), lastPosition, lastOffset + offset.startOffset(),
                            lastOffset + offset.endOffset(), type.type(), null));
                }
                stream.end();
                lastOffset += offset.endOffset();
                lastPosition += posIncr.getPositionIncrement();

                lastPosition += analyzer.getPositionIncrementGap(field);
                lastOffset += analyzer.getOffsetGap(field);
            } catch (IOException e) {
                throw new ElasticsearchException("failed to analyze", e);
            }
        }
        return tokens;
    }

    private static DetailAnalyzeResponse detailAnalyze(AnalyzeRequest
            request, Analyzer analyzer, String field) {
        DetailAnalyzeResponse detailResponse;
        final Set<String> includeAttributes = new HashSet<>();
        if (request.attributes() != null) {
            for (String attribute : request.attributes()) {
                includeAttributes.add(attribute.toLowerCase(Locale.ROOT));
            }
        }

        CustomAnalyzer customAnalyzer = null;
        if (analyzer instanceof CustomAnalyzer) {
            customAnalyzer = (CustomAnalyzer) analyzer;
        } else if (analyzer instanceof NamedAnalyzer && ((NamedAnalyzer) analyzer).analyzer() instanceof CustomAnalyzer) {
            customAnalyzer = (CustomAnalyzer) ((NamedAnalyzer) analyzer).analyzer();
        }

        if (customAnalyzer != null) {
            // for a custom analyzer, break it down into its char filters, tokenizer and token filters
            CharFilterFactory[] charFilterFactories = customAnalyzer.charFilters();
            TokenizerFactory tokenizerFactory = customAnalyzer.tokenizerFactory();
            TokenFilterFactory[] tokenFilterFactories = customAnalyzer.tokenFilters();

            String[][] charFiltersTexts = new String[charFilterFactories != null ? charFilterFactories.length : 0][request.text().length];
            TokenListCreator[] tokenFiltersTokenListCreator = new TokenListCreator[tokenFilterFactories != null ? tokenFilterFactories.length : 0];

            TokenListCreator tokenizerTokenListCreator = new TokenListCreator();

            for (int textIndex = 0; textIndex < request.text().length; textIndex++) {
                String charFilteredSource = request.text()[textIndex];

                Reader reader = new FastStringReader(charFilteredSource);
                if (charFilterFactories != null) {
                    for (int charFilterIndex = 0; charFilterIndex < charFilterFactories.length; charFilterIndex++) {
                        reader = charFilterFactories[charFilterIndex].create(reader);
                        Reader readerForWriteOut = new FastStringReader(charFilteredSource);
                        readerForWriteOut = charFilterFactories[charFilterIndex].create(readerForWriteOut);
                        charFilteredSource = writeCharStream(readerForWriteOut);
                        charFiltersTexts[charFilterIndex][textIndex] = charFilteredSource;
                    }
                }

                // analyze with the tokenizer only
                Tokenizer tokenizer = tokenizerFactory.create();
                tokenizer.setReader(reader);
                tokenizerTokenListCreator.analyze(tokenizer, customAnalyzer, field, includeAttributes);

                // analyze with each token filter in the chain
                if (tokenFilterFactories != null) {
                    for (int tokenFilterIndex = 0; tokenFilterIndex < tokenFilterFactories.length; tokenFilterIndex++) {
                        if (tokenFiltersTokenListCreator[tokenFilterIndex] == null) {
                            tokenFiltersTokenListCreator[tokenFilterIndex] = new TokenListCreator();
                        }
                        TokenStream stream = createStackedTokenStream(request.text()[textIndex], charFilterFactories, tokenizerFactory,
                                tokenFilterFactories, tokenFilterIndex + 1);
                        tokenFiltersTokenListCreator[tokenFilterIndex].analyze(stream, customAnalyzer, field, includeAttributes);
                    }
                }
            }

            DetailAnalyzeResponse.CharFilteredText[] charFilteredLists = new DetailAnalyzeResponse.CharFilteredText[charFiltersTexts.length];
            if (charFilterFactories != null) {
                for (int charFilterIndex = 0; charFilterIndex < charFiltersTexts.length; charFilterIndex++) {
                    charFilteredLists[charFilterIndex] = new DetailAnalyzeResponse.CharFilteredText(
                            charFilterFactories[charFilterIndex].name(), charFiltersTexts[charFilterIndex]);
                }
            }

            DetailAnalyzeResponse.AnalyzeTokenList[] tokenFilterLists = new DetailAnalyzeResponse.AnalyzeTokenList[tokenFiltersTokenListCreator.length];
            if (tokenFilterFactories != null) {
                for (int tokenFilterIndex = 0; tokenFilterIndex < tokenFiltersTokenListCreator.length; tokenFilterIndex++) {
                    tokenFilterLists[tokenFilterIndex] = new DetailAnalyzeResponse.AnalyzeTokenList(
                            tokenFilterFactories[tokenFilterIndex].name(),
                            tokenFiltersTokenListCreator[tokenFilterIndex].getArrayTokens());
                }
            }

            detailResponse = new DetailAnalyzeResponse(charFilteredLists,
                    new DetailAnalyzeResponse.AnalyzeTokenList(tokenizerFactory.name(), tokenizerTokenListCreator.getArrayTokens()),
                    tokenFilterLists);
        } else {
            String name;
            if (analyzer instanceof NamedAnalyzer) {
                name = ((NamedAnalyzer) analyzer).name();
            } else {
                name = analyzer.getClass().getName();
            }

            TokenListCreator tokenListCreator = new TokenListCreator();
            for (String text : request.text()) {
                tokenListCreator.analyze(analyzer.tokenStream(field, text), analyzer, field, includeAttributes);
            }
            detailResponse = new DetailAnalyzeResponse(new DetailAnalyzeResponse.AnalyzeTokenList(name, tokenListCreator.getArrayTokens()));
        }
        return detailResponse;
    }

    private static TokenStream createStackedTokenStream(String source, CharFilterFactory[] charFilterFactories, TokenizerFactory tokenizerFactory,
                                                        TokenFilterFactory[] tokenFilterFactories, int current) {
        Reader reader = new FastStringReader(source);
        for (CharFilterFactory charFilterFactory : charFilterFactories) {
            reader = charFilterFactory.create(reader);
        }
        Tokenizer tokenizer = tokenizerFactory.create();
        tokenizer.setReader(reader);
        TokenStream tokenStream = tokenizer;
        for (int i = 0; i < current; i++) {
            tokenStream = tokenFilterFactories[i].create(tokenStream);
        }
        return tokenStream;
    }

    private static String writeCharStream(Reader input) {
        final int BUFFER_SIZE = 1024;
        char[] buf = new char[BUFFER_SIZE];
        int len;
        StringBuilder sb = new StringBuilder();
        do {
            try {
                len = input.read(buf, 0, BUFFER_SIZE);
            } catch (IOException e) {
                throw new ElasticsearchException("failed to analyze (charFiltering)", e);
            }
            if (len > 0) {
                sb.append(buf, 0, len);
            }
        } while (len == BUFFER_SIZE);
        return sb.toString();
    }

    private static class TokenListCreator {
        int lastPosition = -1;
        int lastOffset = 0;
        List<AnalyzeResponse.AnalyzeToken> tokens;

        TokenListCreator() {
            tokens = new ArrayList<>();
        }

        private void analyze(TokenStream stream, Analyzer analyzer, String field, Set<String> includeAttributes) {
            try {
                stream.reset();
                CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
                PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
                OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
                TypeAttribute type = stream.addAttribute(TypeAttribute.class);

                while (stream.incrementToken()) {
                    int increment = posIncr.getPositionIncrement();
                    if (increment > 0) {
                        lastPosition = lastPosition + increment;
                    }
                    tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), lastPosition, lastOffset + offset.startOffset(),
                            lastOffset + offset.endOffset(), type.type(), extractExtendedAttributes(stream, includeAttributes)));
                }
                stream.end();
                lastOffset += offset.endOffset();
                lastPosition += posIncr.getPositionIncrement();

                lastPosition += analyzer.getPositionIncrementGap(field);
                lastOffset += analyzer.getOffsetGap(field);

            } catch (IOException e) {
                throw new ElasticsearchException("failed to analyze", e);
            } finally {
                IOUtils.closeWhileHandlingException(stream);
            }
        }

        private AnalyzeResponse.AnalyzeToken[] getArrayTokens() {
            return tokens.toArray(new AnalyzeResponse.AnalyzeToken[tokens.size()]);
        }

    }

    /**
     * Extracts the remaining (non-core) token attributes from the stream,
     * grouped by attribute class name.
     *
     * @param stream            the current TokenStream
     * @param includeAttributes the attribute names to include (lower-cased); if null or empty, all non-core attributes are returned
     * @return a map of attribute name to value
     */
    private static Map<String, Object> extractExtendedAttributes(TokenStream stream, final Set<String> includeAttributes) {
        final Map<String, Object> extendedAttributes = new TreeMap<>();

        stream.reflectWith(new AttributeReflector() {
            @Override
            public void reflect(Class<? extends Attribute> attClass, String key, Object value) {
                if (CharTermAttribute.class.isAssignableFrom(attClass)) return;
                if (PositionIncrementAttribute.class.isAssignableFrom(attClass)) return;
                if (OffsetAttribute.class.isAssignableFrom(attClass)) return;
                if (TypeAttribute.class.isAssignableFrom(attClass)) return;
                if (includeAttributes == null || includeAttributes.isEmpty() || includeAttributes.contains(key.toLowerCase(Locale.ROOT))) {
                    if (value instanceof BytesRef) {
                        final BytesRef p = (BytesRef) value;
                        value = p.toString();
                    }
                    extendedAttributes.put(key, value);
                }
            }
        });

        return extendedAttributes;
    }
}