/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.query;

import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.action.termvectors.TermVectorsRequest;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.ParseFieldMatcher;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.lucene.uid.Versions;
import org.elasticsearch.common.xcontent.*;
import org.elasticsearch.index.VersionType;

import java.io.IOException;
import java.util.*;

import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;

/**
 * A more like this query that finds documents that are "like" the provided set of document(s).
 *
 * The documents are provided as a set of strings and/or a list of {@link Item}.
 */
public class MoreLikeThisQueryBuilder extends QueryBuilder implements BoostableQueryBuilder<MoreLikeThisQueryBuilder> {

    /**
     * A single item to be used for a {@link MoreLikeThisQueryBuilder}.
     */
    public static final class Item implements ToXContent {
        public static final Item[] EMPTY_ARRAY = new Item[0];

        /** Field names used when rendering/parsing an item as XContent. */
        public interface Field {
            ParseField INDEX = new ParseField("_index");
            ParseField TYPE = new ParseField("_type");
            ParseField ID = new ParseField("_id");
            ParseField DOC = new ParseField("doc");
            ParseField FIELDS = new ParseField("fields");
            ParseField PER_FIELD_ANALYZER = new ParseField("per_field_analyzer");
            ParseField ROUTING = new ParseField("_routing");
            ParseField VERSION = new ParseField("_version");
            ParseField VERSION_TYPE = new ParseField("_version_type");
        }

        private String index;
        private String type;
        private String id;
        // Artificial document source; mutually exclusive with id (enforced in parse()).
        private BytesReference doc;
        private String[] fields;
        private Map<String, String> perFieldAnalyzer;
        private String routing;
        private long version = Versions.MATCH_ANY;
        private VersionType versionType = VersionType.INTERNAL;

        public Item() {
        }

        /**
         * Constructor for a given item / document request
         *
         * @param index the index where the document is located
         * @param type the type of the document
         * @param id and its id
         */
        public Item(String index, @Nullable String type, String id) {
            this.index = index;
            this.type = type;
            this.id = id;
        }

        /**
         * Constructor for an artificial document request, that is not present in the index.
         *
         * @param index the index to be used for parsing the doc
         * @param type the type to be used for parsing the doc
         * @param doc the document specification
         */
        public Item(String index, String type, XContentBuilder doc) {
            this.index = index;
            this.type = type;
            this.doc(doc);
        }

        public String index() {
            return index;
        }

        public Item index(String index) {
            this.index = index;
            return this;
        }

        public String type() {
            return type;
        }

        public Item type(String type) {
            this.type = type;
            return this;
        }

        public String id() {
            return id;
        }

        public Item id(String id) {
            this.id = id;
            return this;
        }

        public BytesReference doc() {
            return doc;
        }

        /**
         * Sets to a given artificial document, that is a document that is not present in the index.
         */
        public Item doc(BytesReference doc) {
            this.doc = doc;
            return this;
        }

        /**
         * Sets to a given artificial document, that is a document that is not present in the index.
         */
        public Item doc(XContentBuilder doc) {
            return this.doc(doc.bytes());
        }

        public String[] fields() {
            return fields;
        }

        public Item fields(String... fields) {
            this.fields = fields;
            return this;
        }

        public Map<String, String> perFieldAnalyzer() {
            return perFieldAnalyzer;
        }

        /**
         * Sets the analyzer(s) to use at any given field.
         */
        public Item perFieldAnalyzer(Map<String, String> perFieldAnalyzer) {
            this.perFieldAnalyzer = perFieldAnalyzer;
            return this;
        }

        public String routing() {
            return routing;
        }

        public Item routing(String routing) {
            this.routing = routing;
            return this;
        }

        public long version() {
            return version;
        }

        public Item version(long version) {
            this.version = version;
            return this;
        }

        public VersionType versionType() {
            return versionType;
        }

        public Item versionType(VersionType versionType) {
            this.versionType = versionType;
            return this;
        }

        /**
         * Convert this to a {@link TermVectorsRequest} for fetching the terms of the document.
         */
        public TermVectorsRequest toTermVectorsRequest() {
            TermVectorsRequest termVectorsRequest = new TermVectorsRequest(index, type, id)
                    .selectedFields(fields)
                    .routing(routing)
                    .version(version)
                    .versionType(versionType)
                    .perFieldAnalyzer(perFieldAnalyzer)
                    .positions(false)  // ensures these following parameters are never set
                    .offsets(false)
                    .payloads(false)
                    .fieldStatistics(false)
                    .termStatistics(false)
                    .dfs(false);
            // for artificial docs to make sure that the id has changed in the item too
            if (doc != null) {
                termVectorsRequest.doc(doc, true);
                this.id(termVectorsRequest.id());
            }
            return termVectorsRequest;
        }

        /**
         * Parses and returns the given item.
         *
         * @param parser the parser positioned at the start of the item object
         * @param parseFieldMatcher matcher honouring deprecated field names
         * @param item the item instance to populate and return
         * @throws ElasticsearchParseException on an unknown field, a non-array "fields"
         *         value, or when both [id] and [doc] are specified
         */
        public static Item parse(XContentParser parser, ParseFieldMatcher parseFieldMatcher, Item item) throws IOException {
            XContentParser.Token token;
            String currentFieldName = null;
            while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
                if (token == XContentParser.Token.FIELD_NAME) {
                    currentFieldName = parser.currentName();
                } else if (currentFieldName != null) {
                    if (parseFieldMatcher.match(currentFieldName, Field.INDEX)) {
                        item.index = parser.text();
                    } else if (parseFieldMatcher.match(currentFieldName, Field.TYPE)) {
                        item.type = parser.text();
                    } else if (parseFieldMatcher.match(currentFieldName, Field.ID)) {
                        item.id = parser.text();
                    } else if (parseFieldMatcher.match(currentFieldName, Field.DOC)) {
                        item.doc(jsonBuilder().copyCurrentStructure(parser));
                    } else if (parseFieldMatcher.match(currentFieldName, Field.FIELDS)) {
                        if (token == XContentParser.Token.START_ARRAY) {
                            List<String> fields = new ArrayList<>();
                            while (parser.nextToken() != XContentParser.Token.END_ARRAY) {
                                fields.add(parser.text());
                            }
                            item.fields(fields.toArray(new String[fields.size()]));
                        } else {
                            throw new ElasticsearchParseException(
                                    "failed to parse More Like This item. field [fields] must be an array");
                        }
                    } else if (parseFieldMatcher.match(currentFieldName, Field.PER_FIELD_ANALYZER)) {
                        item.perFieldAnalyzer(TermVectorsRequest.readPerFieldAnalyzer(parser.map()));
                    // routing/version/version_type accept both underscored and plain
                    // variants, hence the raw string matching instead of ParseField
                    } else if ("_routing".equals(currentFieldName) || "routing".equals(currentFieldName)) {
                        item.routing = parser.text();
                    } else if ("_version".equals(currentFieldName) || "version".equals(currentFieldName)) {
                        item.version = parser.longValue();
                    } else if ("_version_type".equals(currentFieldName) || "_versionType".equals(currentFieldName)
                            || "version_type".equals(currentFieldName) || "versionType".equals(currentFieldName)) {
                        item.versionType = VersionType.fromString(parser.text());
                    } else {
                        throw new ElasticsearchParseException(
                                "failed to parse More Like This item. unknown field [{}]", currentFieldName);
                    }
                }
            }
            // an item is either a reference to an indexed doc or an artificial doc, never both
            if (item.id != null && item.doc != null) {
                throw new ElasticsearchParseException(
                        "failed to parse More Like This item. either [id] or [doc] can be specified, but not both!");
            }
            return item;
        }

        @Override
        public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
            builder.startObject();
            if (this.index != null) {
                builder.field(Field.INDEX.getPreferredName(), this.index);
            }
            if (this.type != null) {
                builder.field(Field.TYPE.getPreferredName(), this.type);
            }
            // the id is derived from the doc for artificial documents, so only emit it when no doc is set
            if (this.id != null && this.doc == null) {
                builder.field(Field.ID.getPreferredName(), this.id);
            }
            if (this.doc != null) {
                XContentType contentType = XContentFactory.xContentType(this.doc);
                if (contentType == builder.contentType()) {
                    // same content type: embed the raw bytes directly
                    builder.rawField(Field.DOC.getPreferredName(), this.doc);
                } else {
                    // different content type: re-parse and copy, closing the temporary parser
                    XContentParser parser = XContentFactory.xContent(contentType).createParser(this.doc);
                    try {
                        parser.nextToken();
                        builder.field(Field.DOC.getPreferredName());
                        builder.copyCurrentStructure(parser);
                    } finally {
                        parser.close();
                    }
                }
            }
            if (this.fields != null) {
                builder.array(Field.FIELDS.getPreferredName(), this.fields);
            }
            if (this.perFieldAnalyzer != null) {
                builder.field(Field.PER_FIELD_ANALYZER.getPreferredName(), this.perFieldAnalyzer);
            }
            if (this.routing != null) {
                builder.field(Field.ROUTING.getPreferredName(), this.routing);
            }
            if (this.version != Versions.MATCH_ANY) {
                builder.field(Field.VERSION.getPreferredName(), this.version);
            }
            if (this.versionType != VersionType.INTERNAL) {
                builder.field(Field.VERSION_TYPE.getPreferredName(), this.versionType.toString().toLowerCase(Locale.ROOT));
            }
            return builder.endObject();
        }

        @Override
        public final String toString() {
            try {
                XContentBuilder builder = XContentFactory.jsonBuilder();
                builder.prettyPrint();
                toXContent(builder, EMPTY_PARAMS);
                return builder.string();
            } catch (Exception e) {
                return "{ \"error\" : \"" + ExceptionsHelper.detailedMessage(e) + "\"}";
            }
        }

        @Override
        public int hashCode() {
            return Objects.hash(index, type, id, doc,
                    Arrays.hashCode(fields), perFieldAnalyzer, routing, version, versionType);
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) return true;
            if (!(o instanceof Item)) return false;
            Item other = (Item) o;
            return Objects.equals(index, other.index)
                    && Objects.equals(type, other.type)
                    && Objects.equals(id, other.id)
                    && Objects.equals(doc, other.doc)
                    && Arrays.equals(fields, other.fields)  // otherwise we are comparing pointers
                    && Objects.equals(perFieldAnalyzer, other.perFieldAnalyzer)
                    && Objects.equals(routing, other.routing)
                    && version == other.version  // primitive comparison, avoids boxing
                    && Objects.equals(versionType, other.versionType);
        }
    }

    // document inputs
    private List<String> likeTexts = new ArrayList<>();
    private List<String> unlikeTexts = new ArrayList<>();
    private List<Item> likeItems = new ArrayList<>();
    private List<Item> unlikeItems = new ArrayList<>();
    private final String[] fields;

    // term selection parameters (-1 / null means "not set, use server default")
    private int maxQueryTerms = -1;
    private int minTermFreq = -1;
    private int minDocFreq = -1;
    private int maxDocFreq = -1;
    private int minWordLength = -1;
    private int maxWordLength = -1;
    private String[] stopWords = null;
    private String analyzer;

    // query formation parameters
    private String minimumShouldMatch = null;
    private float boostTerms = -1;
    private Boolean include = null;

    // other parameters
    private Boolean failOnUnsupportedField;
    private float boost = -1;
    private String queryName;

    /**
     * Constructs a new more like this query which uses the "_all" field.
     */
    public MoreLikeThisQueryBuilder() {
        this.fields = null;
    }

    /**
     * Sets the field names that will be used when generating the 'More Like This' query.
     *
     * @param fields the field names that will be used when generating the 'More Like This' query.
     */
    public MoreLikeThisQueryBuilder(String... fields) {
        this.fields = fields;
    }

    /**
     * Sets the text to use in order to find documents that are "like" this.
     *
     * @param likeTexts the text to use when generating the 'More Like This' query.
     */
    public MoreLikeThisQueryBuilder like(String... likeTexts) {
        this.likeTexts = new ArrayList<>();  // replaces any previously set texts
        return addLikeText(likeTexts);
    }

    /**
     * Sets the documents to use in order to find documents that are "like" this.
     *
     * @param likeItems the documents to use when generating the 'More Like This' query.
     */
    public MoreLikeThisQueryBuilder like(Item... likeItems) {
        this.likeItems = new ArrayList<>();  // replaces any previously set items
        return addLikeItem(likeItems);
    }

    /**
     * Adds some text to use in order to find documents that are "like" this.
     */
    public MoreLikeThisQueryBuilder addLikeText(String... likeTexts) {
        Collections.addAll(this.likeTexts, likeTexts);
        return this;
    }

    /**
     * Adds a document to use in order to find documents that are "like" this.
     */
    public MoreLikeThisQueryBuilder addLikeItem(Item... likeItems) {
        Collections.addAll(this.likeItems, likeItems);
        return this;
    }

    /**
     * Sets the text from which the terms should not be selected from.
     */
    public MoreLikeThisQueryBuilder unlike(String... unlikeTexts) {
        this.unlikeTexts = new ArrayList<>();
        return addUnlikeText(unlikeTexts);
    }

    /**
     * Sets the documents from which the terms should not be selected from.
     */
    public MoreLikeThisQueryBuilder unlike(Item... unlikeItems) {
        this.unlikeItems = new ArrayList<>();
        return addUnlikeItem(unlikeItems);
    }

    /**
     * Adds some text to use in order to find documents that are "unlike" this.
     */
    public MoreLikeThisQueryBuilder addUnlikeText(String... unlikeTexts) {
        Collections.addAll(this.unlikeTexts, unlikeTexts);
        return this;
    }

    /**
     * Adds a document to use in order to find documents that are "unlike" this.
     */
    public MoreLikeThisQueryBuilder addUnlikeItem(Item... unlikeItems) {
        Collections.addAll(this.unlikeItems, unlikeItems);
        return this;
    }

    /**
     * Sets the maximum number of query terms that will be included in any generated query.
     * Defaults to <tt>25</tt>.
     */
    public MoreLikeThisQueryBuilder maxQueryTerms(int maxQueryTerms) {
        this.maxQueryTerms = maxQueryTerms;
        return this;
    }

    /**
     * The frequency below which terms will be ignored in the source doc. The default
     * frequency is <tt>2</tt>.
     */
    public MoreLikeThisQueryBuilder minTermFreq(int minTermFreq) {
        this.minTermFreq = minTermFreq;
        return this;
    }

    /**
     * Sets the frequency at which words will be ignored which do not occur in at least this
     * many docs. Defaults to <tt>5</tt>.
     */
    public MoreLikeThisQueryBuilder minDocFreq(int minDocFreq) {
        this.minDocFreq = minDocFreq;
        return this;
    }

    /**
     * Set the maximum frequency in which words may still appear. Words that appear
     * in more than this many docs will be ignored. Defaults to unbounded.
     */
    public MoreLikeThisQueryBuilder maxDocFreq(int maxDocFreq) {
        this.maxDocFreq = maxDocFreq;
        return this;
    }

    /**
     * Sets the minimum word length below which words will be ignored. Defaults
     * to <tt>0</tt>.
     */
    public MoreLikeThisQueryBuilder minWordLength(int minWordLength) {
        this.minWordLength = minWordLength;
        return this;
    }

    /**
     * Sets the maximum word length above which words will be ignored. Defaults to
     * unbounded (<tt>0</tt>).
     */
    public MoreLikeThisQueryBuilder maxWordLength(int maxWordLength) {
        this.maxWordLength = maxWordLength;
        return this;
    }

    /**
     * Set the set of stopwords.
     * <p>
     * Any word in this set is considered "uninteresting" and ignored. Even if your Analyzer allows stopwords, you
     * might want to tell the MoreLikeThis code to ignore them, as for the purposes of document similarity it seems
     * reasonable to assume that "a stop word is never interesting".
     */
    public MoreLikeThisQueryBuilder stopWords(String... stopWords) {
        this.stopWords = stopWords;
        return this;
    }

    /**
     * The analyzer that will be used to analyze the text. Defaults to the analyzer associated with the field.
     */
    public MoreLikeThisQueryBuilder analyzer(String analyzer) {
        this.analyzer = analyzer;
        return this;
    }

    /**
     * Number of terms that must match the generated query expressed in the
     * common syntax for minimum should match. Defaults to <tt>30%</tt>.
     *
     * @see org.elasticsearch.common.lucene.search.Queries#calculateMinShouldMatch(int, String)
     */
    public MoreLikeThisQueryBuilder minimumShouldMatch(String minimumShouldMatch) {
        this.minimumShouldMatch = minimumShouldMatch;
        return this;
    }

    /**
     * Sets the boost factor to use when boosting terms. Defaults to <tt>1</tt>.
     */
    public MoreLikeThisQueryBuilder boostTerms(float boostTerms) {
        this.boostTerms = boostTerms;
        return this;
    }

    /**
     * Whether to include the input documents. Defaults to <tt>false</tt>
     */
    public MoreLikeThisQueryBuilder include(boolean include) {
        this.include = include;
        return this;
    }

    /**
     * Whether to fail or return no result when this query is run against a field which is not supported such as binary/numeric fields.
     */
    public MoreLikeThisQueryBuilder failOnUnsupportedField(boolean fail) {
        failOnUnsupportedField = fail;
        return this;
    }

    @Override
    public MoreLikeThisQueryBuilder boost(float boost) {
        this.boost = boost;
        return this;
    }

    /**
     * Sets the query name for the filter that can be used when searching for matched_queries per hit.
     */
    public MoreLikeThisQueryBuilder queryName(String queryName) {
        this.queryName = queryName;
        return this;
    }

    /**
     * The text to use in order to find documents that are "like" this.
     *
     * @deprecated Use {@link #like(String...)} instead.
     */
    @Deprecated
    public MoreLikeThisQueryBuilder likeText(String likeText) {
        return like(likeText);
    }

    /**
     * @deprecated Use {@link #like(Item...)} with items carrying ids instead.
     */
    @Deprecated
    public MoreLikeThisQueryBuilder ids(String... ids) {
        Item[] items = new Item[ids.length];
        for (int i = 0; i < items.length; i++) {
            items[i] = new Item(null, null, ids[i]);
        }
        return like(items);
    }

    /**
     * @deprecated Use {@link #like(Item...)} instead.
     */
    @Deprecated
    public MoreLikeThisQueryBuilder docs(Item... docs) {
        return like(docs);
    }

    /**
     * Sets the documents from which the terms should not be selected from.
     *
     * @deprecated Use {@link #unlike(Item...)} instead
     */
    @Deprecated
    public MoreLikeThisQueryBuilder ignoreLike(Item... docs) {
        return unlike(docs);
    }

    /**
     * Sets the text from which the terms should not be selected from.
     *
     * @deprecated Use {@link #unlike(String...)} instead.
     */
    @Deprecated
    public MoreLikeThisQueryBuilder ignoreLike(String... likeText) {
        return unlike(likeText);
    }

    /**
     * Adds a document to use in order to find documents that are "like" this.
     *
     * @deprecated Use {@link #addLikeItem(Item...)} instead.
     */
    @Deprecated
    public MoreLikeThisQueryBuilder addItem(Item... likeItems) {
        return addLikeItem(likeItems);
    }

    @Override
    protected void doXContent(XContentBuilder builder, Params params) throws IOException {
        builder.startObject(MoreLikeThisQueryParser.NAME);
        if (fields != null) {
            builder.field(MoreLikeThisQueryParser.Field.FIELDS.getPreferredName(), fields);
        }
        // at least one "like" text or item is mandatory
        if (this.likeTexts.isEmpty() && this.likeItems.isEmpty()) {
            throw new IllegalArgumentException("more_like_this requires '"
                    + MoreLikeThisQueryParser.Field.LIKE.getPreferredName() + "' to be provided");
        } else {
            buildLikeField(builder, MoreLikeThisQueryParser.Field.LIKE.getPreferredName(), likeTexts, likeItems);
        }
        if (!unlikeTexts.isEmpty() || !unlikeItems.isEmpty()) {
            buildLikeField(builder, MoreLikeThisQueryParser.Field.UNLIKE.getPreferredName(), unlikeTexts, unlikeItems);
        }
        // only emit parameters that were explicitly set (sentinels: -1 / null)
        if (maxQueryTerms != -1) {
            builder.field(MoreLikeThisQueryParser.Field.MAX_QUERY_TERMS.getPreferredName(), maxQueryTerms);
        }
        if (minTermFreq != -1) {
            builder.field(MoreLikeThisQueryParser.Field.MIN_TERM_FREQ.getPreferredName(), minTermFreq);
        }
        if (minDocFreq != -1) {
            builder.field(MoreLikeThisQueryParser.Field.MIN_DOC_FREQ.getPreferredName(), minDocFreq);
        }
        if (maxDocFreq != -1) {
            builder.field(MoreLikeThisQueryParser.Field.MAX_DOC_FREQ.getPreferredName(), maxDocFreq);
        }
        if (minWordLength != -1) {
            builder.field(MoreLikeThisQueryParser.Field.MIN_WORD_LENGTH.getPreferredName(), minWordLength);
        }
        if (maxWordLength != -1) {
            builder.field(MoreLikeThisQueryParser.Field.MAX_WORD_LENGTH.getPreferredName(), maxWordLength);
        }
        if (stopWords != null && stopWords.length > 0) {
            builder.field(MoreLikeThisQueryParser.Field.STOP_WORDS.getPreferredName(), stopWords);
        }
        if (analyzer != null) {
            builder.field(MoreLikeThisQueryParser.Field.ANALYZER.getPreferredName(), analyzer);
        }
        if (minimumShouldMatch != null) {
            builder.field(MoreLikeThisQueryParser.Field.MINIMUM_SHOULD_MATCH.getPreferredName(), minimumShouldMatch);
        }
        if (boostTerms != -1) {
            builder.field(MoreLikeThisQueryParser.Field.BOOST_TERMS.getPreferredName(), boostTerms);
        }
        if (include != null) {
            builder.field(MoreLikeThisQueryParser.Field.INCLUDE.getPreferredName(), include);
        }
        if (failOnUnsupportedField != null) {
            builder.field(MoreLikeThisQueryParser.Field.FAIL_ON_UNSUPPORTED_FIELD.getPreferredName(), failOnUnsupportedField);
        }
        if (boost != -1) {
            builder.field("boost", boost);
        }
        if (queryName != null) {
            builder.field("_name", queryName);
        }
        builder.endObject();
    }

    /** Renders texts followed by items as a single XContent array under the given field name. */
    private static void buildLikeField(XContentBuilder builder, String fieldName,
                                       List<String> texts, List<Item> items) throws IOException {
        builder.startArray(fieldName);
        for (String text : texts) {
            builder.value(text);
        }
        for (Item item : items) {
            builder.value(item);
        }
        builder.endArray();
    }
}