/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.query;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.Version;
import org.elasticsearch.action.termvectors.MultiTermVectorsItemResponse;
import org.elasticsearch.action.termvectors.MultiTermVectorsRequest;
import org.elasticsearch.action.termvectors.MultiTermVectorsResponse;
import org.elasticsearch.action.termvectors.TermVectorsRequest;
import org.elasticsearch.action.termvectors.TermVectorsResponse;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.lucene.search.MoreLikeThisQuery;
import org.elasticsearch.common.xcontent.ToXContent;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.common.xcontent.json.JsonXContent;
import org.elasticsearch.index.VersionType;
import org.elasticsearch.index.query.MoreLikeThisQueryBuilder.Item;
import org.elasticsearch.search.internal.SearchContext;
import org.elasticsearch.test.AbstractQueryTestCase;
import org.junit.Before;
import java.io.IOException;
import java.util.Arrays;
import java.util.Base64;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.stream.Stream;
import static org.elasticsearch.index.query.QueryBuilders.moreLikeThisQuery;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.instanceOf;
public class MoreLikeThisQueryBuilderTests extends AbstractQueryTestCase<MoreLikeThisQueryBuilder> {
// Names of the XContent fields returned by shuffleProtectedFields(); currently only the item "doc" field.
private static final String[] SHUFFLE_PROTECTED_FIELDS = new String[]{Item.Field.DOC.getPreferredName()};
// Field names used for the random queries, artificial docs and per-field analyzers; reassigned in setup().
private static String[] randomFields;
// Pre-generated "like" items handed to the random query builder; reassigned in setup().
private static Item[] randomLikeItems;
// Pre-generated "unlike" items handed to the random query builder; reassigned in setup().
private static Item[] randomUnlikeItems;
/**
 * Regenerates the random field names and the pre-built like/unlike items
 * that the other helpers of this test read.
 */
@Before
public void setup() {
    // MLT only supports string fields; unsupported field types have their own test
    randomFields = randomStringFields();

    // preset between one and three "like" item requests
    final Item[] likes = new Item[randomIntBetween(1, 3)];
    Arrays.setAll(likes, position -> generateRandomItem());
    randomLikeItems = likes;

    // same again for the "unlike" items
    final Item[] unlikes = new Item[randomIntBetween(1, 3)];
    Arrays.setAll(unlikes, position -> generateRandomItem());
    randomUnlikeItems = unlikes;
}
/**
 * Builds the list of field names used by the test: the two mapped string
 * fields first, followed by a random array of additional (unmapped) names.
 */
private static String[] randomStringFields() {
    final String[] mapped = {STRING_FIELD_NAME, STRING_FIELD_NAME_2};
    final String[] unmapped = generateRandomStringArray(2, 5, false, false);
    // mapped names keep the leading positions, unmapped names are appended after them
    final String[] combined = Arrays.copyOf(mapped, mapped.length + unmapped.length);
    System.arraycopy(unmapped, 0, combined, mapped.length, unmapped.length);
    return combined;
}
/**
 * Creates a random {@link Item}: either a reference to an indexed document (random id)
 * or an artificial document, with each optional setting applied at random.
 */
private Item generateRandomItem() {
    final String indexName = randomBoolean() ? getIndex().getName() : null;
    // stick to a single type to avoid ambiguous types
    final String typeName = getRandomType();

    // indexed item (random id) or artificial document
    final Item result = randomBoolean()
        ? new Item(indexName, typeName, randomAlphaOfLength(10))
        : new Item(indexName, typeName, randomArtificialDoc());

    // when no field is given MLT falls back to all mapped fields for this item
    if (randomBoolean()) {
        result.fields(randomFrom(randomFields));
    }
    // analyzer override per field
    if (randomBoolean()) {
        result.perFieldAnalyzer(randomPerFieldAnalyzer());
    }
    if (randomBoolean()) {
        result.routing(randomAlphaOfLength(10));
    }
    if (randomBoolean()) {
        result.version(randomInt(5));
    }
    if (randomBoolean()) {
        result.versionType(randomFrom(VersionType.values()));
    }
    return result;
}
/**
 * Builds a JSON object holding one random 10-character value for every name in
 * {@code randomFields}.
 *
 * @return the builder containing the completed object
 * @throws ElasticsearchException wrapping the underlying {@link IOException} if the document cannot be built
 */
private XContentBuilder randomArtificialDoc() {
    XContentBuilder doc;
    try {
        doc = XContentFactory.jsonBuilder().startObject();
        for (String field : randomFields) {
            doc.field(field, randomAlphaOfLength(10));
        }
        doc.endObject();
    } catch (IOException e) {
        // keep the original failure as the cause so the root problem stays visible in the stack trace
        throw new ElasticsearchException("Unable to generate random artificial doc!", e);
    }
    return doc;
}
/**
 * Assigns a random analyzer name to every entry of {@code randomFields}.
 *
 * @return a mutable map from field name to analyzer name
 */
private Map<String, String> randomPerFieldAnalyzer() {
    final Map<String, String> analyzerByField = new HashMap<>();
    for (int i = 0; i < randomFields.length; i++) {
        analyzerByField.put(randomFields[i], randomAnalyzer());
    }
    return analyzerByField;
}
/**
 * Creates a random {@link MoreLikeThisQueryBuilder}: either like-texts or the pre-generated
 * like-items are always set, every other option is applied at random.
 */
@Override
protected MoreLikeThisQueryBuilder doCreateTestQueryBuilder() {
    // the "like" part is mandatory: either free texts or the preset items
    final boolean useTexts = randomBoolean();
    final String[] likeTexts = useTexts ? generateRandomStringArray(5, 5, false, false) : null;
    final Item[] likeItems = useTexts ? null : randomLikeItems;

    // without explicit fields the builder relies on the default field
    final MoreLikeThisQueryBuilder mlt = randomBoolean()
        ? new MoreLikeThisQueryBuilder(likeTexts, likeItems)
        : new MoreLikeThisQueryBuilder(randomFields, likeTexts, likeItems);

    if (randomBoolean()) {
        mlt.unlike(generateRandomStringArray(5, 5, false, false));
    }
    if (randomBoolean()) {
        mlt.unlike(randomUnlikeItems);
    }
    if (randomBoolean()) {
        mlt.maxQueryTerms(randomInt(25));
    }
    if (randomBoolean()) {
        mlt.minTermFreq(randomInt(5));
    }
    if (randomBoolean()) {
        mlt.minDocFreq(randomInt(5));
    }
    if (randomBoolean()) {
        mlt.maxDocFreq(randomInt(100));
    }
    if (randomBoolean()) {
        mlt.minWordLength(randomInt(5));
    }
    if (randomBoolean()) {
        mlt.maxWordLength(randomInt(25));
    }
    if (randomBoolean()) {
        mlt.stopWords(generateRandomStringArray(5, 5, false, false));
    }
    if (randomBoolean()) {
        mlt.analyzer(randomAnalyzer()); // fix the analyzer?
    }
    if (randomBoolean()) {
        mlt.minimumShouldMatch(randomMinimumShouldMatch());
    }
    if (randomBoolean()) {
        mlt.boostTerms(randomFloat() * 10);
    }
    if (randomBoolean()) {
        mlt.include(randomBoolean());
    }
    if (randomBoolean()) {
        mlt.failOnUnsupportedField(randomBoolean());
    }
    return mlt;
}
/**
 * We don't want to shuffle the "doc" field internally in {@link #testFromXContent()} because, even though the
 * documents would be functionally the same, their {@link BytesReference} representation isn't, and that's what we
 * compare when checking for equality of the original and the shuffled builder.
 *
 * @return the constant array holding the preferred name of the item "doc" field
 */
@Override
protected String[] shuffleProtectedFields() {
return SHUFFLE_PROTECTED_FIELDS;
}
/**
 * Names the objects whose content is free-form: the item "doc" accepts arbitrary
 * content, so adding anything to it does not raise an exception.
 */
@Override
protected Set<String> getObjectsHoldingArbitraryContent() {
    final String docFieldName = Item.Field.DOC.getPreferredName();
    return Collections.singleton(docFieldName);
}
/**
 * Answers a multi term vectors request without touching a real index: every item is reported
 * as existing and its term vectors are generated in memory. Artificial documents are analyzed
 * on all {@code randomFields}; indexed items use their id as the text of each selected field.
 */
@Override
protected MultiTermVectorsResponse executeMultiTermVectors(MultiTermVectorsRequest mtvRequest) {
    try {
        final MultiTermVectorsItemResponse[] itemResponses = new MultiTermVectorsItemResponse[mtvRequest.size()];
        int position = 0;
        for (TermVectorsRequest tvRequest : mtvRequest) {
            final TermVectorsResponse tvResponse =
                new TermVectorsResponse(tvRequest.index(), tvRequest.type(), tvRequest.id());
            tvResponse.setExists(true);

            // choose which fields to generate and which text to put in them
            final String[] fieldNames;
            final String text;
            if (tvRequest.doc() == null) {
                final Set<String> selected = tvRequest.selectedFields();
                fieldNames = selected.toArray(new String[selected.size()]);
                text = tvRequest.id();
            } else {
                fieldNames = randomFields;
                text = tvRequest.doc().utf8ToString();
            }
            final Fields termVectors = generateFields(fieldNames, text);

            final EnumSet<TermVectorsRequest.Flag> flags =
                EnumSet.of(TermVectorsRequest.Flag.Positions, TermVectorsRequest.Flag.Offsets);
            tvResponse.setFields(termVectors, tvRequest.selectedFields(), flags, termVectors);
            itemResponses[position] = new MultiTermVectorsItemResponse(tvResponse, null);
            position++;
        }
        return new MultiTermVectorsResponse(itemResponses);
    } catch (IOException ex) {
        throw new ElasticsearchException("boom", ex);
    }
}
/**
 * Here we could go overboard and use a pre-generated indexed random document for a given Item,
 * but for now we'd prefer to simply return the id as the content of the document and that for
 * every field.
 *
 * @param fieldNames names of the fields to add to the in-memory index
 * @param text       content indexed, whitespace-tokenized, under every field name
 * @return the fields exposed by a reader over the in-memory index
 * @throws IOException if reading the fields from the in-memory index fails
 */
private static Fields generateFields(String[] fieldNames, String text) throws IOException {
    MemoryIndex index = new MemoryIndex();
    // a single analyzer serves every field and is closed once all fields have been added
    try (WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer()) {
        for (String fieldName : fieldNames) {
            index.addField(fieldName, text, analyzer);
        }
    }
    return MultiFields.getFields(index.createSearcher().getIndexReader());
}
/**
 * Checks only the type of the generated Lucene query: a {@link BooleanQuery} when like-items
 * are present, otherwise a {@link MoreLikeThisQuery}.
 */
@Override
protected void doAssertLuceneQuery(MoreLikeThisQueryBuilder queryBuilder, Query query, SearchContext context) throws IOException {
    final Item[] items = queryBuilder.likeItems();
    final boolean hasLikeItems = items != null && items.length > 0;
    // we rely on integration tests for a deeper check of the text-only case
    final Class<? extends Query> expectedType = hasLikeItems ? BooleanQuery.class : MoreLikeThisQuery.class;
    assertThat(query, instanceOf(expectedType));
}
/**
 * An explicitly empty fields array must be rejected by the constructor.
 */
public void testValidateEmptyFields() {
    final String[] noFields = new String[0];
    final String[] likeTexts = new String[]{"likeText"};
    IllegalArgumentException failure = expectThrows(
        IllegalArgumentException.class,
        () -> new MoreLikeThisQueryBuilder(noFields, likeTexts, null));
    assertThat(failure.getMessage(), containsString("requires 'fields' to be specified"));
}
/**
 * The constructor must reject a query where both like-texts and like-items are null or empty.
 */
public void testValidateEmptyLike() {
    final String[] missingTexts = randomBoolean() ? null : new String[0];
    final Item[] missingItems = randomBoolean() ? null : new Item[0];
    IllegalArgumentException failure = expectThrows(
        IllegalArgumentException.class,
        () -> new MoreLikeThisQueryBuilder(missingTexts, missingItems));
    final String message = failure.getMessage();
    assertThat(message, containsString("requires either 'like' texts or items to be specified"));
}
/**
 * With {@code failOnUnsupportedField(true)}, building the query on a non-string field
 * must fail with an {@link IllegalArgumentException}.
 */
public void testUnsupportedFields() throws IOException {
    assumeTrue("test runs only when at least a type is registered", getCurrentTypes().length > 0);
    final String nonStringField = randomFrom(INT_FIELD_NAME, DOUBLE_FIELD_NAME, DATE_FIELD_NAME);
    final String[] fields = new String[] {nonStringField};
    final String[] likeTexts = new String[]{"some text"};
    final MoreLikeThisQueryBuilder strictBuilder =
        new MoreLikeThisQueryBuilder(fields, likeTexts, null).failOnUnsupportedField(true);
    IllegalArgumentException failure = expectThrows(
        IllegalArgumentException.class,
        () -> strictBuilder.toQuery(createShardContext()));
    assertThat(failure.getMessage(), containsString("more_like_this only supports text/keyword fields"));
}
/**
 * Builds a text-only query through {@code QueryBuilders.moreLikeThisQuery}, parses it back and
 * verifies the settings carried by the resulting {@link MoreLikeThisQuery}.
 */
public void testMoreLikeThisBuilder() throws Exception {
    final String[] fields = new String[]{"name.first", "name.last"};
    final String[] likeTexts = new String[]{"something"};
    final MoreLikeThisQueryBuilder builder = moreLikeThisQuery(fields, likeTexts, null)
        .minTermFreq(1)
        .maxQueryTerms(12);

    final Query parsedQuery = parseQuery(builder).toQuery(createShardContext());
    assertThat(parsedQuery, instanceOf(MoreLikeThisQuery.class));

    final MoreLikeThisQuery mlt = (MoreLikeThisQuery) parsedQuery;
    assertThat(mlt.getMoreLikeFields()[0], equalTo("name.first"));
    assertThat(mlt.getLikeText(), equalTo("something"));
    assertThat(mlt.getMinTermFrequency(), equalTo(1));
    assertThat(mlt.getMaxQueryTerms(), equalTo(12));
}
/**
 * Round-trips a random {@link Item} through the stream serialization and expects an equal item back.
 *
 * @throws IOException if writing to or reading from the stream fails
 */
public void testItemSerialization() throws IOException {
    Item expectedItem = generateRandomItem();
    // both streams are closed deterministically, matching testItemSerializationBwc
    try (BytesStreamOutput output = new BytesStreamOutput()) {
        expectedItem.writeTo(output);
        try (StreamInput in = output.bytes().streamInput()) {
            Item newItem = new Item(in);
            assertEquals(expectedItem, newItem);
        }
    }
}
/**
 * Renders a random {@link Item} as JSON, parses it back and expects an equal item.
 *
 * @throws IOException if rendering or parsing the JSON fails
 */
public void testItemFromXContent() throws IOException {
    Item expectedItem = generateRandomItem();
    String json = expectedItem.toXContent(XContentFactory.jsonBuilder(), ToXContent.EMPTY_PARAMS).string();
    // the parser is closed once the item has been read
    try (XContentParser parser = createParser(JsonXContent.jsonXContent, json)) {
        Item newItem = Item.parse(parser, new Item());
        assertEquals(expectedItem, newItem);
    }
}
/**
 * Reads an {@link Item} from a fixed, Base64-encoded byte sequence using one of the listed 5.x
 * stream versions, checks the decoded values, and verifies that writing the item back with the
 * same version reproduces exactly the same bytes.
 *
 * @throws IOException if reading from or writing to the streams fails
 */
public void testItemSerializationBwc() throws IOException {
    final byte[] data = Base64.getDecoder().decode("AQVpbmRleAEEdHlwZQEODXsiZm9vIjoiYmFyIn0A/wD//////////QAAAAAAAAAA");
    final Version version = randomFrom(Version.V_5_0_0, Version.V_5_0_1, Version.V_5_0_2,
        Version.V_5_0_3_UNRELEASED, Version.V_5_1_1_UNRELEASED, Version.V_5_1_2_UNRELEASED, Version.V_5_2_0_UNRELEASED);
    try (StreamInput in = StreamInput.wrap(data)) {
        in.setVersion(version);
        Item item = new Item(in);
        assertEquals(XContentType.JSON, item.xContentType());
        assertEquals("{\"foo\":\"bar\"}", item.doc().utf8ToString());
        assertEquals("index", item.index());
        assertEquals("type", item.type());
        try (BytesStreamOutput out = new BytesStreamOutput()) {
            out.setVersion(version);
            item.writeTo(out);
            // compare only the bytes actually written: the array behind toBytesRef() may be
            // larger than the valid region, so it must not be compared directly
            assertArrayEquals(data, BytesReference.toBytes(out.bytes()));
        }
    }
}
/**
 * A query is reported as cachable only when it has no like-items, because items are always fetched.
 */
@Override
protected boolean isCachable(MoreLikeThisQueryBuilder queryBuilder) {
    final Item[] items = queryBuilder.likeItems();
    final int itemCount = items.length;
    return itemCount == 0;
}
/**
 * Parses a hand-written {@code more_like_this} query, checks that it renders back to the same
 * JSON, and spot-checks the parsed fields and like-text.
 */
public void testFromJson() throws IOException {
    final String json =
        "{\n" +
        "  \"more_like_this\" : {\n" +
        "    \"fields\" : [ \"title\", \"description\" ],\n" +
        "    \"like\" : [ \"and potentially some more text here as well\", {\n" +
        "      \"_index\" : \"imdb\",\n" +
        "      \"_type\" : \"movies\",\n" +
        "      \"_id\" : \"1\"\n" +
        "    }, {\n" +
        "      \"_index\" : \"imdb\",\n" +
        "      \"_type\" : \"movies\",\n" +
        "      \"_id\" : \"2\"\n" +
        "    } ],\n" +
        "    \"max_query_terms\" : 12,\n" +
        "    \"min_term_freq\" : 1,\n" +
        "    \"min_doc_freq\" : 5,\n" +
        "    \"max_doc_freq\" : 2147483647,\n" +
        "    \"min_word_length\" : 0,\n" +
        "    \"max_word_length\" : 0,\n" +
        "    \"minimum_should_match\" : \"30%\",\n" +
        "    \"boost_terms\" : 0.0,\n" +
        "    \"include\" : false,\n" +
        "    \"fail_on_unsupported_field\" : true,\n" +
        "    \"boost\" : 1.0\n" +
        "  }\n" +
        "}";

    final MoreLikeThisQueryBuilder mlt = (MoreLikeThisQueryBuilder) parseQuery(json);
    checkGeneratedJson(json, mlt);

    final String[] parsedFields = mlt.fields();
    assertEquals(json, 2, parsedFields.length);
    final String firstLikeText = mlt.likeTexts()[0];
    assertEquals(json, "and potentially some more text here as well", firstLikeText);
}
}