/* * Licensed to Elasticsearch under one or more contributor * license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright * ownership. Elasticsearch licenses this file to you under * the Apache License, Version 2.0 (the "License"); you may * not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.elasticsearch.search.suggest.phrase; import org.elasticsearch.common.xcontent.ToXContent; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.script.Template; import org.elasticsearch.search.suggest.SuggestBuilder.SuggestionBuilder; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; /** * Defines the actual suggest command for phrase suggestions ( <tt>phrase</tt>). */ public final class PhraseSuggestionBuilder extends SuggestionBuilder<PhraseSuggestionBuilder> { private Float maxErrors; private String separator; private Float realWordErrorLikelihood; private Float confidence; private final Map<String, List<CandidateGenerator>> generators = new HashMap<>(); private Integer gramSize; private SmoothingModel model; private Boolean forceUnigrams; private Integer tokenLimit; private String preTag; private String postTag; private Template collateQuery; private Map<String, Object> collateParams; private Boolean collatePrune; public PhraseSuggestionBuilder(String name) { super(name, "phrase"); } /** * Sets the gram size for the n-gram model used for this suggester. The * default value is <tt>1</tt> corresponding to <tt>unigrams</tt>. Use * <tt>2</tt> for <tt>bigrams</tt> and <tt>3</tt> for <tt>trigrams</tt>. */ public PhraseSuggestionBuilder gramSize(int gramSize) { if (gramSize < 1) { throw new IllegalArgumentException("gramSize must be >= 1"); } this.gramSize = gramSize; return this; } /** * Sets the maximum percentage of the terms that at most considered to be * misspellings in order to form a correction. This method accepts a float * value in the range [0..1) as a fraction of the actual query terms a * number <tt>>=1</tt> as an absolut number of query terms. * * The default is set to <tt>1.0</tt> which corresponds to that only * corrections with at most 1 missspelled term are returned. */ public PhraseSuggestionBuilder maxErrors(Float maxErrors) { this.maxErrors = maxErrors; return this; } /** * Sets the separator that is used to separate terms in the bigram field. If * not set the whitespace character is used as a separator. */ public PhraseSuggestionBuilder separator(String separator) { this.separator = separator; return this; } /** * Sets the likelihood of a term being a misspelled even if the term exists * in the dictionary. The default it <tt>0.95</tt> corresponding to 5% or * the real words are misspelled. */ public PhraseSuggestionBuilder realWordErrorLikelihood(Float realWordErrorLikelihood) { this.realWordErrorLikelihood = realWordErrorLikelihood; return this; } /** * Sets the confidence level for this suggester. The confidence level * defines a factor applied to the input phrases score which is used as a * threshold for other suggest candidates. Only candidates that score higher * than the threshold will be included in the result. For instance a * confidence level of <tt>1.0</tt> will only return suggestions that score * higher than the input phrase. If set to <tt>0.0</tt> the top N candidates * are returned. The default is <tt>1.0</tt> */ public PhraseSuggestionBuilder confidence(Float confidence) { this.confidence = confidence; return this; } /** * Adds a {@link CandidateGenerator} to this suggester. The * {@link CandidateGenerator} is used to draw candidates for each individual * phrase term before the candidates are scored. */ public PhraseSuggestionBuilder addCandidateGenerator(CandidateGenerator generator) { List<CandidateGenerator> list = this.generators.get(generator.getType()); if (list == null) { list = new ArrayList<>(); this.generators.put(generator.getType(), list); } list.add(generator); return this; } /** * Clear the candidate generators. */ public PhraseSuggestionBuilder clearCandidateGenerators() { this.generators.clear(); return this; } /** * If set to <code>true</code> the phrase suggester will fail if the analyzer only * produces ngrams. the default it <code>true</code>. */ public PhraseSuggestionBuilder forceUnigrams(boolean forceUnigrams) { this.forceUnigrams = forceUnigrams; return this; } /** * Sets an explicit smoothing model used for this suggester. The default is * {@link PhraseSuggestionBuilder.StupidBackoff}. */ public PhraseSuggestionBuilder smoothingModel(SmoothingModel model) { this.model = model; return this; } public PhraseSuggestionBuilder tokenLimit(int tokenLimit) { this.tokenLimit = tokenLimit; return this; } /** * Setup highlighting for suggestions. If this is called a highlight field * is returned with suggestions wrapping changed tokens with preTag and postTag. */ public PhraseSuggestionBuilder highlight(String preTag, String postTag) { if ((preTag == null) != (postTag == null)) { throw new IllegalArgumentException("Pre and post tag must both be null or both not be null."); } this.preTag = preTag; this.postTag = postTag; return this; } /** * Sets a query used for filtering out suggested phrases (collation). */ public PhraseSuggestionBuilder collateQuery(String collateQuery) { this.collateQuery = new Template(collateQuery); return this; } /** * Sets a query used for filtering out suggested phrases (collation). */ public PhraseSuggestionBuilder collateQuery(Template collateQueryTemplate) { this.collateQuery = collateQueryTemplate; return this; } /** * Sets additional params for collate script */ public PhraseSuggestionBuilder collateParams(Map<String, Object> collateParams) { this.collateParams = collateParams; return this; } /** * Sets whether to prune suggestions after collation */ public PhraseSuggestionBuilder collatePrune(boolean collatePrune) { this.collatePrune = collatePrune; return this; } @Override public XContentBuilder innerToXContent(XContentBuilder builder, Params params) throws IOException { if (realWordErrorLikelihood != null) { builder.field("real_word_error_likelihood", realWordErrorLikelihood); } if (confidence != null) { builder.field("confidence", confidence); } if (separator != null) { builder.field("separator", separator); } if (maxErrors != null) { builder.field("max_errors", maxErrors); } if (gramSize != null) { builder.field("gram_size", gramSize); } if (forceUnigrams != null) { builder.field("force_unigrams", forceUnigrams); } if (tokenLimit != null) { builder.field("token_limit", tokenLimit); } if (!generators.isEmpty()) { Set<Entry<String, List<CandidateGenerator>>> entrySet = generators.entrySet(); for (Entry<String, List<CandidateGenerator>> entry : entrySet) { builder.startArray(entry.getKey()); for (CandidateGenerator generator : entry.getValue()) { generator.toXContent(builder, params); } builder.endArray(); } } if (model != null) { builder.startObject("smoothing"); model.toXContent(builder, params); builder.endObject(); } if (preTag != null) { builder.startObject("highlight"); builder.field("pre_tag", preTag); builder.field("post_tag", postTag); builder.endObject(); } if (collateQuery != null) { builder.startObject("collate"); builder.field("query", collateQuery); if (collateParams != null) { builder.field("params", collateParams); } if (collatePrune != null) { builder.field("prune", collatePrune.booleanValue()); } builder.endObject(); } return builder; } /** * Creates a new {@link DirectCandidateGenerator} * * @param field * the field this candidate generator operates on. */ public static DirectCandidateGenerator candidateGenerator(String field) { return new DirectCandidateGenerator(field); } /** * A "stupid-backoff" smoothing model simialr to <a * href="http://en.wikipedia.org/wiki/Katz's_back-off_model"> Katz's * Backoff</a>. This model is used as the default if no model is configured. * <p> * See <a * href="http://en.wikipedia.org/wiki/N-gram#Smoothing_techniques">N-Gram * Smoothing</a> for details. * </p> */ public static final class StupidBackoff extends SmoothingModel { private final double discount; /** * Creates a Stupid-Backoff smoothing model. * * @param discount * the discount given to lower order ngrams if the higher order ngram doesn't exits */ public StupidBackoff(double discount) { super("stupid_backoff"); this.discount = discount; } @Override protected XContentBuilder innerToXContent(XContentBuilder builder, Params params) throws IOException { builder.field("discount", discount); return builder; } } /** * An <a href="http://en.wikipedia.org/wiki/Additive_smoothing">additive * smoothing</a> model. * <p> * See <a * href="http://en.wikipedia.org/wiki/N-gram#Smoothing_techniques">N-Gram * Smoothing</a> for details. * </p> */ public static final class Laplace extends SmoothingModel { private final double alpha; /** * Creates a Laplace smoothing model. * */ public Laplace(double alpha) { super("laplace"); this.alpha = alpha; } @Override protected XContentBuilder innerToXContent(XContentBuilder builder, Params params) throws IOException { builder.field("alpha", alpha); return builder; } } public static abstract class SmoothingModel implements ToXContent { private final String type; protected SmoothingModel(String type) { this.type = type; } @Override public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { builder.startObject(type); innerToXContent(builder,params); builder.endObject(); return builder; } protected abstract XContentBuilder innerToXContent(XContentBuilder builder, Params params) throws IOException; } /** * Linear interpolation smoothing model. * <p> * See <a * href="http://en.wikipedia.org/wiki/N-gram#Smoothing_techniques">N-Gram * Smoothing</a> for details. * </p> */ public static final class LinearInterpolation extends SmoothingModel { private final double trigramLambda; private final double bigramLambda; private final double unigramLambda; /** * Creates a linear interpolation smoothing model. * * Note: the lambdas must sum up to one. * * @param trigramLambda * the trigram lambda * @param bigramLambda * the bigram lambda * @param unigramLambda * the unigram lambda */ public LinearInterpolation(double trigramLambda, double bigramLambda, double unigramLambda) { super("linear"); this.trigramLambda = trigramLambda; this.bigramLambda = bigramLambda; this.unigramLambda = unigramLambda; } @Override protected XContentBuilder innerToXContent(XContentBuilder builder, Params params) throws IOException { builder.field("trigram_lambda", trigramLambda); builder.field("bigram_lambda", bigramLambda); builder.field("unigram_lambda", unigramLambda); return builder; } } /** * {@link CandidateGenerator} base class. */ public static abstract class CandidateGenerator implements ToXContent { private final String type; public CandidateGenerator(String type) { this.type = type; } public String getType() { return type; } } /** * * */ public static final class DirectCandidateGenerator extends CandidateGenerator { private final String field; private String preFilter; private String postFilter; private String suggestMode; private Float accuracy; private Integer size; private String sort; private String stringDistance; private Integer maxEdits; private Integer maxInspections; private Float maxTermFreq; private Integer prefixLength; private Integer minWordLength; private Float minDocFreq; /** * @param field Sets from what field to fetch the candidate suggestions from. */ public DirectCandidateGenerator(String field) { super("direct_generator"); this.field = field; } /** * The global suggest mode controls what suggested terms are included or * controls for what suggest text tokens, terms should be suggested for. * Three possible values can be specified: * <ol> * <li><code>missing</code> - Only suggest terms in the suggest text * that aren't in the index. This is the default. * <li><code>popular</code> - Only suggest terms that occur in more docs * then the original suggest text term. * <li><code>always</code> - Suggest any matching suggest terms based on * tokens in the suggest text. * </ol> */ public DirectCandidateGenerator suggestMode(String suggestMode) { this.suggestMode = suggestMode; return this; } /** * Sets how similar the suggested terms at least need to be compared to * the original suggest text tokens. A value between 0 and 1 can be * specified. This value will be compared to the string distance result * of each candidate spelling correction. * <p> * Default is <tt>0.5</tt> */ public DirectCandidateGenerator accuracy(float accuracy) { this.accuracy = accuracy; return this; } /** * Sets the maximum suggestions to be returned per suggest text term. */ public DirectCandidateGenerator size(int size) { if (size <= 0) { throw new IllegalArgumentException("Size must be positive"); } this.size = size; return this; } /** * Sets how to sort the suggest terms per suggest text token. Two * possible values: * <ol> * <li><code>score</code> - Sort should first be based on score, then * document frequency and then the term itself. * <li><code>frequency</code> - Sort should first be based on document * frequency, then scotr and then the term itself. * </ol> * <p> * What the score is depends on the suggester being used. */ public DirectCandidateGenerator sort(String sort) { this.sort = sort; return this; } /** * Sets what string distance implementation to use for comparing how * similar suggested terms are. Four possible values can be specified: * <ol> * <li><code>internal</code> - This is the default and is based on * <code>damerau_levenshtein</code>, but highly optimized for comparing * string distance for terms inside the index. * <li><code>damerau_levenshtein</code> - String distance algorithm * based on Damerau-Levenshtein algorithm. * <li><code>levenstein</code> - String distance algorithm based on * Levenstein edit distance algorithm. * <li><code>jarowinkler</code> - String distance algorithm based on * Jaro-Winkler algorithm. * <li><code>ngram</code> - String distance algorithm based on character * n-grams. * </ol> */ public DirectCandidateGenerator stringDistance(String stringDistance) { this.stringDistance = stringDistance; return this; } /** * Sets the maximum edit distance candidate suggestions can have in * order to be considered as a suggestion. Can only be a value between 1 * and 2. Any other value result in an bad request error being thrown. * Defaults to <tt>2</tt>. */ public DirectCandidateGenerator maxEdits(Integer maxEdits) { this.maxEdits = maxEdits; return this; } /** * A factor that is used to multiply with the size in order to inspect * more candidate suggestions. Can improve accuracy at the cost of * performance. Defaults to <tt>5</tt>. */ public DirectCandidateGenerator maxInspections(Integer maxInspections) { this.maxInspections = maxInspections; return this; } /** * Sets a maximum threshold in number of documents a suggest text token * can exist in order to be corrected. Can be a relative percentage * number (e.g 0.4) or an absolute number to represent document * frequencies. If an value higher than 1 is specified then fractional * can not be specified. Defaults to <tt>0.01</tt>. * <p> * This can be used to exclude high frequency terms from being * suggested. High frequency terms are usually spelled correctly on top * of this this also improves the suggest performance. */ public DirectCandidateGenerator maxTermFreq(float maxTermFreq) { this.maxTermFreq = maxTermFreq; return this; } /** * Sets the number of minimal prefix characters that must match in order * be a candidate suggestion. Defaults to 1. Increasing this number * improves suggest performance. Usually misspellings don't occur in the * beginning of terms. */ public DirectCandidateGenerator prefixLength(int prefixLength) { this.prefixLength = prefixLength; return this; } /** * The minimum length a suggest text term must have in order to be * corrected. Defaults to <tt>4</tt>. */ public DirectCandidateGenerator minWordLength(int minWordLength) { this.minWordLength = minWordLength; return this; } /** * Sets a minimal threshold in number of documents a suggested term * should appear in. This can be specified as an absolute number or as a * relative percentage of number of documents. This can improve quality * by only suggesting high frequency terms. Defaults to 0f and is not * enabled. If a value higher than 1 is specified then the number cannot * be fractional. */ public DirectCandidateGenerator minDocFreq(float minDocFreq) { this.minDocFreq = minDocFreq; return this; } /** * Sets a filter (analyzer) that is applied to each of the tokens passed to this candidate generator. * This filter is applied to the original token before candidates are generated. */ public DirectCandidateGenerator preFilter(String preFilter) { this.preFilter = preFilter; return this; } /** * Sets a filter (analyzer) that is applied to each of the generated tokens * before they are passed to the actual phrase scorer. */ public DirectCandidateGenerator postFilter(String postFilter) { this.postFilter = postFilter; return this; } @Override public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { builder.startObject(); if (field != null) { builder.field("field", field); } if (suggestMode != null) { builder.field("suggest_mode", suggestMode); } if (accuracy != null) { builder.field("accuracy", accuracy); } if (size != null) { builder.field("size", size); } if (sort != null) { builder.field("sort", sort); } if (stringDistance != null) { builder.field("string_distance", stringDistance); } if (maxEdits != null) { builder.field("max_edits", maxEdits); } if (maxInspections != null) { builder.field("max_inspections", maxInspections); } if (maxTermFreq != null) { builder.field("max_term_freq", maxTermFreq); } if (prefixLength != null) { builder.field("prefix_length", prefixLength); } if (minWordLength != null) { builder.field("min_word_length", minWordLength); } if (minDocFreq != null) { builder.field("min_doc_freq", minDocFreq); } if (preFilter != null) { builder.field("pre_filter", preFilter); } if (postFilter != null) { builder.field("post_filter", postFilter); } builder.endObject(); return builder; } } }