/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.handler;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.*;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.analysis.CharFilterFactory;
import org.apache.solr.analysis.TokenFilterFactory;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.analysis.TokenizerFactory;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.FieldType;

import java.io.IOException;
import java.io.StringReader;
import java.util.*;

/**
 * A base class for all analysis request handlers.
 *
 * @version $Id: AnalysisRequestHandlerBase.java 950008 2010-06-01 10:35:13Z rmuir $
 * @since solr 1.4
 */
public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase {

  public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception {
    rsp.add("analysis", doAnalysis(req));
  }

  /**
   * Performs the analysis based on the given solr request and returns the analysis result as a named list.
   *
   * @param req The solr request.
   *
   * @return The analysis result as a named list.
   *
   * @throws Exception When analysis fails.
   */
  protected abstract NamedList doAnalysis(SolrQueryRequest req) throws Exception;
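  // A minimal sketch of how a concrete subclass might implement doAnalysis().
  // The "analysis.value" parameter name and the "text" field type lookup below
  // are illustrative assumptions, not part of this base class:
  //
  //   protected NamedList doAnalysis(SolrQueryRequest req) throws Exception {
  //     String value = req.getParams().get("analysis.value");               // hypothetical parameter
  //     FieldType fieldType = req.getSchema().getFieldTypeByName("text");   // hypothetical lookup
  //     AnalysisContext context =
  //         new AnalysisContext(fieldType, fieldType.getAnalyzer(), Collections.<String>emptySet());
  //     NamedList<Object> result = new SimpleOrderedMap<Object>();
  //     result.add("value", analyzeValue(value, context));
  //     return result;
  //   }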
  /**
   * Analyzes the given value using the given Analyzer.
   *
   * @param value   Value to analyze
   * @param context The {@link AnalysisContext analysis context}.
   *
   * @return NamedList containing the tokens produced by analyzing the given value
   */
  protected NamedList<List<NamedList>> analyzeValue(String value, AnalysisContext context) {

    Analyzer analyzer = context.getAnalyzer();

    if (!TokenizerChain.class.isInstance(analyzer)) {
      // Not a Solr TokenizerChain: analyze the value in a single pass and
      // report the analyzer's output as one stage.
      TokenStream tokenStream = null;
      try {
        tokenStream = analyzer.reusableTokenStream(context.getFieldName(), new StringReader(value));
        tokenStream.reset();
      } catch (IOException e) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
      }
      NamedList<List<NamedList>> namedList = new NamedList<List<NamedList>>();
      namedList.add(tokenStream.getClass().getName(),
          convertTokensToNamedLists(analyzeTokenStream(tokenStream), context));
      return namedList;
    }

    // A TokenizerChain is decomposed into its char filter, tokenizer and token
    // filter stages so that the output of every stage can be reported separately.
    TokenizerChain tokenizerChain = (TokenizerChain) analyzer;
    CharFilterFactory[] cfiltfacs = tokenizerChain.getCharFilterFactories();
    TokenizerFactory tfac = tokenizerChain.getTokenizerFactory();
    TokenFilterFactory[] filtfacs = tokenizerChain.getTokenFilterFactories();

    NamedList<List<NamedList>> namedList = new NamedList<List<NamedList>>();

    if (cfiltfacs != null) {
      String source = value;
      for (CharFilterFactory cfiltfac : cfiltfacs) {
        CharStream reader = CharReader.get(new StringReader(source));
        reader = cfiltfac.create(reader);
        source = writeCharStream(namedList, reader);
      }
    }

    TokenStream tokenStream = tfac.create(tokenizerChain.charStream(new StringReader(value)));
    List<Token> tokens = analyzeTokenStream(tokenStream);

    namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(tokens, context));

    ListBasedTokenStream listBasedTokenStream = new ListBasedTokenStream(tokens);

    for (TokenFilterFactory tokenFilterFactory : filtfacs) {
      // Replay the previous stage's tokens through the next filter and record its output.
      tokenStream = tokenFilterFactory.create(listBasedTokenStream);
      List<Token> tokenList = analyzeTokenStream(tokenStream);
      namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(tokenList, context));
      listBasedTokenStream = new ListBasedTokenStream(tokenList);
    }

    return namedList;
  }

  /**
   * Analyzes the given text using the given analyzer and returns the produced tokens.
   *
   * @param value    The value to analyze.
   * @param analyzer The analyzer to use.
   *
   * @return The produced token list.
   */
  protected List<Token> analyzeValue(String value, Analyzer analyzer) {
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(value));
    return analyzeTokenStream(tokenStream);
  }
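  // Example (illustrative): a one-off analysis outside of any schema field,
  // assuming Lucene's WhitespaceAnalyzer is on the classpath:
  //
  //   List<Token> tokens = analyzeValue("The Quick Brown Fox",
  //       new org.apache.lucene.analysis.WhitespaceAnalyzer(org.apache.lucene.util.Version.LUCENE_31));
  //   for (Token token : tokens) {
  //     System.out.println(token.toString() + " [" + token.startOffset() + "," + token.endOffset() + "]");
  //   }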
  /**
   * Analyzes the given TokenStream, collecting the Tokens it produces.
   *
   * @param tokenStream TokenStream to analyze
   *
   * @return List of tokens produced from the TokenStream
   */
  private List<Token> analyzeTokenStream(TokenStream tokenStream) {
    List<Token> tokens = new ArrayList<Token>();

    // TODO change this API to support custom attributes
    CharTermAttribute termAtt = null;
    TermToBytesRefAttribute bytesAtt = null;
    if (tokenStream.hasAttribute(CharTermAttribute.class)) {
      termAtt = tokenStream.getAttribute(CharTermAttribute.class);
    } else if (tokenStream.hasAttribute(TermToBytesRefAttribute.class)) {
      bytesAtt = tokenStream.getAttribute(TermToBytesRefAttribute.class);
    }
    final OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    final TypeAttribute typeAtt = tokenStream.addAttribute(TypeAttribute.class);
    final PositionIncrementAttribute posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
    final FlagsAttribute flagsAtt = tokenStream.addAttribute(FlagsAttribute.class);
    final PayloadAttribute payloadAtt = tokenStream.addAttribute(PayloadAttribute.class);
    final BytesRef bytes = new BytesRef();

    try {
      while (tokenStream.incrementToken()) {
        Token token = new Token();
        if (termAtt != null) {
          token.setEmpty().append(termAtt);
        }
        if (bytesAtt != null) {
          bytesAtt.toBytesRef(bytes);
          // TODO: This is incorrect when numeric fields change in later lucene versions.
          // It should use BytesRef directly!
          token.setEmpty().append(bytes.utf8ToString());
        }
        token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
        token.setType(typeAtt.type());
        token.setFlags(flagsAtt.getFlags());
        token.setPayload(payloadAtt.getPayload());
        token.setPositionIncrement(posIncAtt.getPositionIncrement());
        tokens.add((Token) token.clone());
      }
    } catch (IOException ioe) {
      throw new RuntimeException("Error occurred while iterating over token stream", ioe);
    }

    return tokens;
  }
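  // The loop above follows Lucene's attribute-based consumer pattern. For
  // reference, a minimal stand-alone consumer (sketch, with a hypothetical
  // already-created "stream" variable) looks like:
  //
  //   CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
  //   stream.reset();
  //   while (stream.incrementToken()) {
  //     System.out.println(term.toString());
  //   }
  //   stream.end();
  //   stream.close();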
  /**
   * Converts the list of Tokens to a list of NamedLists representing the tokens.
   *
   * @param tokens  Tokens to convert
   * @param context The analysis context
   *
   * @return List of NamedLists containing the relevant information taken from the tokens
   */
  private List<NamedList> convertTokensToNamedLists(List<Token> tokens, AnalysisContext context) {
    List<NamedList> tokensNamedLists = new ArrayList<NamedList>();

    // Report tokens in the order in which they appear in the text.
    Collections.sort(tokens, new Comparator<Token>() {
      public int compare(Token o1, Token o2) {
        return o1.endOffset() - o2.endOffset();
      }
    });

    int position = 0;
    FieldType fieldType = context.getFieldType();

    for (Token token : tokens) {
      NamedList<Object> tokenNamedList = new SimpleOrderedMap<Object>();

      String text = fieldType.indexedToReadable(token.toString());
      tokenNamedList.add("text", text);
      if (!text.equals(token.toString())) {
        tokenNamedList.add("raw_text", token.toString());
      }
      tokenNamedList.add("type", token.type());
      tokenNamedList.add("start", token.startOffset());
      tokenNamedList.add("end", token.endOffset());

      position += token.getPositionIncrement();
      tokenNamedList.add("position", position);

      if (context.getTermsToMatch().contains(token.toString())) {
        tokenNamedList.add("match", true);
      }

      if (token.getPayload() != null) {
        tokenNamedList.add("payload", token.getPayload());
      }

      tokensNamedLists.add(tokenNamedList);
    }

    return tokensNamedLists;
  }

  /**
   * Reads the whole content of the given CharStream, adds it to the given
   * NamedList (keyed by the concrete class name of the stream) and returns it.
   */
  private String writeCharStream(NamedList out, CharStream input) {
    final int BUFFER_SIZE = 1024;
    char[] buf = new char[BUFFER_SIZE];
    int len;
    StringBuilder sb = new StringBuilder();
    // Read until EOF; Reader.read may legally return fewer than BUFFER_SIZE
    // characters before the end of the stream, so loop until it returns -1.
    try {
      while ((len = input.read(buf, 0, BUFFER_SIZE)) != -1) {
        sb.append(buf, 0, len);
      }
    } catch (IOException e) {
      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
    }

    out.add(input.getClass().getName(), sb.toString());
    return sb.toString();
  }
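  // For illustration, one entry produced by convertTokensToNamedLists() for a
  // token "fox" at offsets 16-19 would serialize roughly as follows (exact form
  // depends on the configured response writer; shown here as XML):
  //
  //   <lst>
  //     <str name="text">fox</str>
  //     <str name="type">word</str>
  //     <int name="start">16</int>
  //     <int name="end">19</int>
  //     <int name="position">4</int>
  //   </lst>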
  // ================================================= Inner classes =================================================

  /**
   * TokenStream that iterates over a list of pre-existing Tokens
   */
  // TODO refactor to support custom attributes
  protected final static class ListBasedTokenStream extends TokenStream {
    private final List<Token> tokens;
    private Iterator<Token> tokenIterator;

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
    private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
    private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
    private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);

    /**
     * Creates a new ListBasedTokenStream which uses the given tokens as its token source.
     *
     * @param tokens Source of tokens to be used
     */
    ListBasedTokenStream(List<Token> tokens) {
      this.tokens = tokens;
      tokenIterator = tokens.iterator();
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public boolean incrementToken() throws IOException {
      if (tokenIterator.hasNext()) {
        Token next = tokenIterator.next();
        termAtt.copyBuffer(next.buffer(), 0, next.length());
        typeAtt.setType(next.type());
        offsetAtt.setOffset(next.startOffset(), next.endOffset());
        flagsAtt.setFlags(next.getFlags());
        payloadAtt.setPayload(next.getPayload());
        posIncAtt.setPositionIncrement(next.getPositionIncrement());
        return true;
      } else {
        return false;
      }
    }

    @Override
    public void reset() throws IOException {
      super.reset();
      tokenIterator = tokens.iterator();
    }
  }

  /**
   * Serves as the context of an analysis process. This context holds the field
   * name, the field type, the analyzer and an optional set of terms to match.
   */
  protected static class AnalysisContext {

    private final String fieldName;
    private final FieldType fieldType;
    private final Analyzer analyzer;
    private final Set<String> termsToMatch;

    /**
     * Constructs a new AnalysisContext with a given field type, analyzer and
     * termsToMatch. By default the field name in this context will be
     * {@code null}. During the analysis process, the produced tokens will
     * be compared to the terms in the {@code termsToMatch} set. When found,
     * these tokens will be marked as a match.
     *
     * @param fieldType    The type of the field the analysis is performed on.
     * @param analyzer     The analyzer to be used.
     * @param termsToMatch Holds all the terms that should match during the
     *                     analysis process.
     */
    public AnalysisContext(FieldType fieldType, Analyzer analyzer, Set<String> termsToMatch) {
      this(null, fieldType, analyzer, termsToMatch);
    }

    /**
     * Constructs an AnalysisContext with a given field name, field type
     * and analyzer. By default this context will hold no terms to match.
     *
     * @param fieldName The name of the field the analysis is performed on
     *                  (may be {@code null}).
     * @param fieldType The type of the field the analysis is performed on.
     * @param analyzer  The analyzer to be used during the analysis process.
     */
    public AnalysisContext(String fieldName, FieldType fieldType, Analyzer analyzer) {
      this(fieldName, fieldType, analyzer, Collections.<String>emptySet());
    }

    /**
     * Constructs a new AnalysisContext with a given field name, field type,
     * analyzer and termsToMatch. During the analysis process, the produced
     * tokens will be compared to the terms in the {@code termsToMatch} set.
     * When found, these tokens will be marked as a match.
     *
     * @param fieldName    The name of the field the analysis is performed on
     *                     (may be {@code null}).
     * @param fieldType    The type of the field the analysis is performed on.
     * @param analyzer     The analyzer to be used.
     * @param termsToMatch Holds all the terms that should match during the
     *                     analysis process.
     */
    public AnalysisContext(String fieldName, FieldType fieldType, Analyzer analyzer, Set<String> termsToMatch) {
      this.fieldName = fieldName;
      this.fieldType = fieldType;
      this.analyzer = analyzer;
      this.termsToMatch = termsToMatch;
    }

    public String getFieldName() {
      return fieldName;
    }

    public FieldType getFieldType() {
      return fieldType;
    }

    public Analyzer getAnalyzer() {
      return analyzer;
    }

    public Set<String> getTermsToMatch() {
      return termsToMatch;
    }
  }
}