/* * Licensed to Elasticsearch under one or more contributor * license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright * ownership. Elasticsearch licenses this file to you under * the Apache License, Version 2.0 (the "License"); you may * not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.elasticsearch.action.termvectors; import com.carrotsearch.hppc.ObjectIntHashMap; import org.apache.lucene.analysis.payloads.PayloadHelper; import org.apache.lucene.document.FieldType; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.Fields; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.BytesRef; import org.elasticsearch.ElasticsearchException; import org.elasticsearch.action.ActionFuture; import org.elasticsearch.action.admin.cluster.shards.ClusterSearchShardsResponse; import org.elasticsearch.action.admin.indices.alias.Alias; import org.elasticsearch.action.index.IndexRequestBuilder; import org.elasticsearch.common.Strings; import org.elasticsearch.common.lucene.uid.Versions; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.xcontent.ToXContent; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.index.engine.VersionConflictEngineException; import org.elasticsearch.index.mapper.FieldMapper; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.ExecutionException; import java.util.stream.Collectors; import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked; import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertThrows; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.notNullValue; import static org.hamcrest.Matchers.nullValue; public class GetTermVectorsIT extends AbstractTermVectorsTestCase { public void testNoSuchDoc() throws Exception { XContentBuilder mapping = jsonBuilder().startObject().startObject("type1") .startObject("properties") .startObject("field") .field("type", "text") .field("term_vector", "with_positions_offsets_payloads") .endObject() .endObject() .endObject().endObject(); assertAcked(prepareCreate("test").addAlias(new Alias("alias")).addMapping("type1", mapping)); client().prepareIndex("test", "type1", "666").setSource("field", "foo bar").execute().actionGet(); refresh(); for (int i = 0; i < 20; i++) { ActionFuture<TermVectorsResponse> termVector = client().termVectors(new TermVectorsRequest(indexOrAlias(), "type1", "" + i)); TermVectorsResponse actionGet = termVector.actionGet(); assertThat(actionGet, notNullValue()); assertThat(actionGet.getIndex(), equalTo("test")); assertThat(actionGet.isExists(), equalTo(false)); // check response is nevertheless serializable to json actionGet.toXContent(jsonBuilder(), ToXContent.EMPTY_PARAMS); } } public void testExistingFieldWithNoTermVectorsNoNPE() throws Exception { XContentBuilder mapping = jsonBuilder().startObject().startObject("type1") .startObject("properties") .startObject("existingfield") .field("type", "text") .field("term_vector", "with_positions_offsets_payloads") .endObject() .endObject() .endObject().endObject(); assertAcked(prepareCreate("test").addAlias(new Alias("alias")).addMapping("type1", mapping)); // when indexing a field that simply has a question mark, the term vectors will be null client().prepareIndex("test", "type1", "0").setSource("existingfield", "?").execute().actionGet(); refresh(); ActionFuture<TermVectorsResponse> termVector = client().termVectors(new TermVectorsRequest(indexOrAlias(), "type1", "0") .selectedFields(new String[]{"existingfield"})); // lets see if the null term vectors are caught... TermVectorsResponse actionGet = termVector.actionGet(); assertThat(actionGet, notNullValue()); assertThat(actionGet.isExists(), equalTo(true)); assertThat(actionGet.getIndex(), equalTo("test")); assertThat(actionGet.getFields().terms("existingfield"), nullValue()); } public void testExistingFieldButNotInDocNPE() throws Exception { XContentBuilder mapping = jsonBuilder().startObject().startObject("type1") .startObject("properties") .startObject("existingfield") .field("type", "text") .field("term_vector", "with_positions_offsets_payloads") .endObject() .endObject() .endObject().endObject(); assertAcked(prepareCreate("test").addAlias(new Alias("alias")).addMapping("type1", mapping)); // when indexing a field that simply has a question mark, the term vectors will be null client().prepareIndex("test", "type1", "0").setSource("anotherexistingfield", 1).execute().actionGet(); refresh(); ActionFuture<TermVectorsResponse> termVectors = client().termVectors(new TermVectorsRequest(indexOrAlias(), "type1", "0") .selectedFields(randomBoolean() ? new String[]{"existingfield"} : null) .termStatistics(true) .fieldStatistics(true)); // lets see if the null term vectors are caught... TermVectorsResponse actionGet = termVectors.actionGet(); assertThat(actionGet, notNullValue()); assertThat(actionGet.isExists(), equalTo(true)); assertThat(actionGet.getIndex(), equalTo("test")); assertThat(actionGet.getFields().terms("existingfield"), nullValue()); } public void testNotIndexedField() throws Exception { // must be of type string and indexed. assertAcked(prepareCreate("test") .addAlias(new Alias("alias")) .addMapping("type1", "field0", "type=integer,", // no tvs "field1", "type=text,index=false", // no tvs "field2", "type=text,index=false,store=true", // no tvs "field3", "type=text,index=false,term_vector=yes", // no tvs "field4", "type=keyword", // yes tvs "field5", "type=text,index=true")); // yes tvs List<IndexRequestBuilder> indexBuilders = new ArrayList<>(); for (int i = 0; i < 6; i++) { indexBuilders.add(client().prepareIndex() .setIndex("test") .setType("type1") .setId(String.valueOf(i)) .setSource("field" + i, i)); } indexRandom(true, indexBuilders); for (int i = 0; i < 4; i++) { TermVectorsResponse resp = client().prepareTermVectors(indexOrAlias(), "type1", String.valueOf(i)) .setSelectedFields("field" + i) .get(); assertThat(resp, notNullValue()); assertThat(resp.isExists(), equalTo(true)); assertThat(resp.getIndex(), equalTo("test")); assertThat("field" + i + " :", resp.getFields().terms("field" + i), nullValue()); } for (int i = 4; i < 6; i++) { TermVectorsResponse resp = client().prepareTermVectors(indexOrAlias(), "type1", String.valueOf(i)) .setSelectedFields("field" + i).get(); assertThat(resp.getIndex(), equalTo("test")); assertThat("field" + i + " :", resp.getFields().terms("field" + i), notNullValue()); } } public void testSimpleTermVectors() throws IOException { XContentBuilder mapping = jsonBuilder().startObject().startObject("type1") .startObject("properties") .startObject("field") .field("type", "text") .field("term_vector", "with_positions_offsets_payloads") .field("analyzer", "tv_test") .endObject() .endObject() .endObject().endObject(); assertAcked(prepareCreate("test").addMapping("type1", mapping) .addAlias(new Alias("alias")) .setSettings(Settings.builder() .put(indexSettings()) .put("index.analysis.analyzer.tv_test.tokenizer", "whitespace") .putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase"))); for (int i = 0; i < 10; i++) { client().prepareIndex("test", "type1", Integer.toString(i)) .setSource(jsonBuilder().startObject().field("field", "the quick brown fox jumps over the lazy dog") // 0the3 4quick9 10brown15 16fox19 20jumps25 26over30 // 31the34 35lazy39 40dog43 .endObject()).execute().actionGet(); refresh(); } for (int i = 0; i < 10; i++) { TermVectorsRequestBuilder resp = client().prepareTermVectors(indexOrAlias(), "type1", Integer.toString(i)).setPayloads(true) .setOffsets(true).setPositions(true).setSelectedFields(); TermVectorsResponse response = resp.execute().actionGet(); assertThat(response.getIndex(), equalTo("test")); assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true)); Fields fields = response.getFields(); assertThat(fields.size(), equalTo(1)); checkBrownFoxTermVector(fields, "field", true); } } public void testRandomSingleTermVectors() throws IOException { FieldType ft = new FieldType(); int config = randomInt(6); boolean storePositions = false; boolean storeOffsets = false; boolean storePayloads = false; boolean storeTermVectors = false; switch (config) { case 0: { // do nothing break; } case 1: { storeTermVectors = true; break; } case 2: { storeTermVectors = true; storePositions = true; break; } case 3: { storeTermVectors = true; storeOffsets = true; break; } case 4: { storeTermVectors = true; storePositions = true; storeOffsets = true; break; } case 5: { storeTermVectors = true; storePositions = true; storePayloads = true; break; } case 6: { storeTermVectors = true; storePositions = true; storeOffsets = true; storePayloads = true; break; } } ft.setStoreTermVectors(storeTermVectors); ft.setStoreTermVectorOffsets(storeOffsets); ft.setStoreTermVectorPayloads(storePayloads); ft.setStoreTermVectorPositions(storePositions); String optionString = FieldMapper.termVectorOptionsToString(ft); XContentBuilder mapping = jsonBuilder().startObject().startObject("type1") .startObject("properties") .startObject("field") .field("type", "text") .field("term_vector", optionString) .field("analyzer", "tv_test") .endObject() .endObject() .endObject().endObject(); assertAcked(prepareCreate("test").addMapping("type1", mapping) .setSettings(Settings.builder() .put("index.analysis.analyzer.tv_test.tokenizer", "whitespace") .putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase"))); for (int i = 0; i < 10; i++) { client().prepareIndex("test", "type1", Integer.toString(i)) .setSource(jsonBuilder().startObject().field("field", "the quick brown fox jumps over the lazy dog") // 0the3 4quick9 10brown15 16fox19 20jumps25 26over30 // 31the34 35lazy39 40dog43 .endObject()).execute().actionGet(); refresh(); } String[] values = {"brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the"}; int[] freq = {1, 1, 1, 1, 1, 1, 1, 2}; int[][] pos = {{2}, {8}, {3}, {4}, {7}, {5}, {1}, {0, 6}}; int[][] startOffset = {{10}, {40}, {16}, {20}, {35}, {26}, {4}, {0, 31}}; int[][] endOffset = {{15}, {43}, {19}, {25}, {39}, {30}, {9}, {3, 34}}; boolean isPayloadRequested = randomBoolean(); boolean isOffsetRequested = randomBoolean(); boolean isPositionsRequested = randomBoolean(); String infoString = createInfoString(isPositionsRequested, isOffsetRequested, isPayloadRequested, optionString); for (int i = 0; i < 10; i++) { TermVectorsRequestBuilder resp = client().prepareTermVectors("test", "type1", Integer.toString(i)) .setPayloads(isPayloadRequested).setOffsets(isOffsetRequested).setPositions(isPositionsRequested).setSelectedFields(); TermVectorsResponse response = resp.execute().actionGet(); assertThat(infoString + "doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true)); Fields fields = response.getFields(); assertThat(fields.size(), equalTo(ft.storeTermVectors() ? 1 : 0)); if (ft.storeTermVectors()) { Terms terms = fields.terms("field"); assertThat(terms.size(), equalTo(8L)); TermsEnum iterator = terms.iterator(); for (int j = 0; j < values.length; j++) { String string = values[j]; BytesRef next = iterator.next(); assertThat(infoString, next, notNullValue()); assertThat(infoString + "expected " + string, string, equalTo(next.utf8ToString())); assertThat(infoString, next, notNullValue()); // do not test ttf or doc frequency, because here we have // many shards and do not know how documents are distributed PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL); // docs and pos only returns something if positions or // payloads or offsets are stored / requestd Otherwise use // DocsEnum? assertThat(infoString, docsAndPositions.nextDoc(), equalTo(0)); assertThat(infoString, freq[j], equalTo(docsAndPositions.freq())); int[] termPos = pos[j]; int[] termStartOffset = startOffset[j]; int[] termEndOffset = endOffset[j]; if (isPositionsRequested && storePositions) { assertThat(infoString, termPos.length, equalTo(freq[j])); } if (isOffsetRequested && storeOffsets) { assertThat(termStartOffset.length, equalTo(freq[j])); assertThat(termEndOffset.length, equalTo(freq[j])); } for (int k = 0; k < freq[j]; k++) { int nextPosition = docsAndPositions.nextPosition(); // only return something useful if requested and stored if (isPositionsRequested && storePositions) { assertThat(infoString + "positions for term: " + string, nextPosition, equalTo(termPos[k])); } else { assertThat(infoString + "positions for term: ", nextPosition, equalTo(-1)); } // only return something useful if requested and stored if (isPayloadRequested && storePayloads) { assertThat(infoString + "payloads for term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef( "word"))); } else { assertThat(infoString + "payloads for term: " + string, docsAndPositions.getPayload(), equalTo(null)); } // only return something useful if requested and stored if (isOffsetRequested && storeOffsets) { assertThat(infoString + "startOffsets term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k])); assertThat(infoString + "endOffsets term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k])); } else { assertThat(infoString + "startOffsets term: " + string, docsAndPositions.startOffset(), equalTo(-1)); assertThat(infoString + "endOffsets term: " + string, docsAndPositions.endOffset(), equalTo(-1)); } } } assertThat(iterator.next(), nullValue()); } } } private String createInfoString(boolean isPositionsRequested, boolean isOffsetRequested, boolean isPayloadRequested, String optionString) { String ret = "Store config: " + optionString + "\n" + "Requested: pos-" + (isPositionsRequested ? "yes" : "no") + ", offsets-" + (isOffsetRequested ? "yes" : "no") + ", payload- " + (isPayloadRequested ? "yes" : "no") + "\n"; return ret; } public void testDuelESLucene() throws Exception { TestFieldSetting[] testFieldSettings = getFieldSettings(); createIndexBasedOnFieldSettings("test", "alias", testFieldSettings); //we generate as many docs as many shards we have TestDoc[] testDocs = generateTestDocs("test", testFieldSettings); DirectoryReader directoryReader = indexDocsWithLucene(testDocs); TestConfig[] testConfigs = generateTestConfigs(20, testDocs, testFieldSettings); for (TestConfig test : testConfigs) { TermVectorsRequestBuilder request = getRequestForConfig(test); if (test.expectedException != null) { assertThrows(request, test.expectedException); continue; } TermVectorsResponse response = request.get(); Fields luceneTermVectors = getTermVectorsFromLucene(directoryReader, test.doc); validateResponse(response, luceneTermVectors, test); } } public void testRandomPayloadWithDelimitedPayloadTokenFilter() throws IOException { //create the test document int encoding = randomIntBetween(0, 2); String encodingString = ""; if (encoding == 0) { encodingString = "float"; } if (encoding == 1) { encodingString = "int"; } if (encoding == 2) { encodingString = "identity"; } String[] tokens = crateRandomTokens(); Map<String, List<BytesRef>> payloads = createPayloads(tokens, encoding); String delimiter = createRandomDelimiter(tokens); String queryString = createString(tokens, payloads, encoding, delimiter.charAt(0)); //create the mapping XContentBuilder mapping = jsonBuilder().startObject().startObject("type1").startObject("properties") .startObject("field").field("type", "text").field("term_vector", "with_positions_offsets_payloads") .field("analyzer", "payload_test").endObject().endObject().endObject().endObject(); assertAcked(prepareCreate("test").addMapping("type1", mapping).setSettings( Settings.builder() .put(indexSettings()) .put("index.analysis.analyzer.payload_test.tokenizer", "whitespace") .putArray("index.analysis.analyzer.payload_test.filter", "my_delimited_payload_filter") .put("index.analysis.filter.my_delimited_payload_filter.delimiter", delimiter) .put("index.analysis.filter.my_delimited_payload_filter.encoding", encodingString) .put("index.analysis.filter.my_delimited_payload_filter.type", "delimited_payload_filter"))); client().prepareIndex("test", "type1", Integer.toString(1)) .setSource(jsonBuilder().startObject().field("field", queryString).endObject()).execute().actionGet(); refresh(); TermVectorsRequestBuilder resp = client().prepareTermVectors("test", "type1", Integer.toString(1)).setPayloads(true).setOffsets(true) .setPositions(true).setSelectedFields(); TermVectorsResponse response = resp.execute().actionGet(); assertThat("doc id 1 doesn't exists but should", response.isExists(), equalTo(true)); Fields fields = response.getFields(); assertThat(fields.size(), equalTo(1)); Terms terms = fields.terms("field"); TermsEnum iterator = terms.iterator(); while (iterator.next() != null) { String term = iterator.term().utf8ToString(); PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL); assertThat(docsAndPositions.nextDoc(), equalTo(0)); List<BytesRef> curPayloads = payloads.get(term); assertThat(term, curPayloads, notNullValue()); assertNotNull(docsAndPositions); for (int k = 0; k < docsAndPositions.freq(); k++) { docsAndPositions.nextPosition(); if (docsAndPositions.getPayload()!=null){ String infoString = "\nterm: " + term + " has payload \n"+ docsAndPositions.getPayload().toString() + "\n but should have payload \n"+curPayloads.get(k).toString(); assertThat(infoString, docsAndPositions.getPayload(), equalTo(curPayloads.get(k))); } else { String infoString = "\nterm: " + term + " has no payload but should have payload \n"+curPayloads.get(k).toString(); assertThat(infoString, curPayloads.get(k).length, equalTo(0)); } } } assertThat(iterator.next(), nullValue()); } private String createRandomDelimiter(String[] tokens) { String delimiter = ""; boolean isTokenOrWhitespace = true; while(isTokenOrWhitespace) { isTokenOrWhitespace = false; delimiter = randomUnicodeOfLength(1); for(String token:tokens) { if(token.contains(delimiter)) { isTokenOrWhitespace = true; } } if(Character.isWhitespace(delimiter.charAt(0))) { isTokenOrWhitespace = true; } } return delimiter; } private String createString(String[] tokens, Map<String, List<BytesRef>> payloads, int encoding, char delimiter) { String resultString = ""; ObjectIntHashMap<String> payloadCounter = new ObjectIntHashMap<>(); for (String token : tokens) { if (!payloadCounter.containsKey(token)) { payloadCounter.putIfAbsent(token, 0); } else { payloadCounter.put(token, payloadCounter.get(token) + 1); } resultString = resultString + token; BytesRef payload = payloads.get(token).get(payloadCounter.get(token)); if (payload.length > 0) { resultString = resultString + delimiter; switch (encoding) { case 0: { resultString = resultString + Float.toString(PayloadHelper.decodeFloat(payload.bytes, payload.offset)); break; } case 1: { resultString = resultString + Integer.toString(PayloadHelper.decodeInt(payload.bytes, payload.offset)); break; } case 2: { resultString = resultString + payload.utf8ToString(); break; } default: { throw new ElasticsearchException("unsupported encoding type"); } } } resultString = resultString + " "; } return resultString; } private Map<String, List<BytesRef>> createPayloads(String[] tokens, int encoding) { Map<String, List<BytesRef>> payloads = new HashMap<>(); for (String token : tokens) { if (payloads.get(token) == null) { payloads.put(token, new ArrayList<BytesRef>()); } boolean createPayload = randomBoolean(); if (createPayload) { switch (encoding) { case 0: { float theFloat = randomFloat(); payloads.get(token).add(new BytesRef(PayloadHelper.encodeFloat(theFloat))); break; } case 1: { payloads.get(token).add(new BytesRef(PayloadHelper.encodeInt(randomInt()))); break; } case 2: { String payload = randomUnicodeOfLengthBetween(50, 100); for (int c = 0; c < payload.length(); c++) { if (Character.isWhitespace(payload.charAt(c))) { payload = payload.replace(payload.charAt(c), 'w'); } } payloads.get(token).add(new BytesRef(payload)); break; } default: { throw new ElasticsearchException("unsupported encoding type"); } } } else { payloads.get(token).add(new BytesRef()); } } return payloads; } private String[] crateRandomTokens() { String[] tokens = { "the", "quick", "brown", "fox" }; int numTokensWithDuplicates = randomIntBetween(3, 15); String[] finalTokens = new String[numTokensWithDuplicates]; for (int i = 0; i < numTokensWithDuplicates; i++) { finalTokens[i] = tokens[randomIntBetween(0, tokens.length - 1)]; } return finalTokens; } // like testSimpleTermVectors but we create fields with no term vectors public void testSimpleTermVectorsWithGenerate() throws IOException { String[] fieldNames = new String[10]; for (int i = 0; i < fieldNames.length; i++) { fieldNames[i] = "field" + String.valueOf(i); } XContentBuilder mapping = jsonBuilder().startObject().startObject("type1").startObject("properties"); XContentBuilder source = jsonBuilder().startObject(); for (String field : fieldNames) { mapping.startObject(field) .field("type", "text") .field("term_vector", randomBoolean() ? "with_positions_offsets_payloads" : "no") .field("analyzer", "tv_test") .endObject(); source.field(field, "the quick brown fox jumps over the lazy dog"); } mapping.endObject().endObject().endObject(); source.endObject(); assertAcked(prepareCreate("test") .addMapping("type1", mapping) .setSettings(Settings.builder() .put(indexSettings()) .put("index.analysis.analyzer.tv_test.tokenizer", "whitespace") .putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase"))); ensureGreen(); for (int i = 0; i < 10; i++) { client().prepareIndex("test", "type1", Integer.toString(i)) .setSource(source) .execute().actionGet(); refresh(); } for (int i = 0; i < 10; i++) { TermVectorsResponse response = client().prepareTermVectors("test", "type1", Integer.toString(i)) .setPayloads(true) .setOffsets(true) .setPositions(true) .setSelectedFields(fieldNames) .execute().actionGet(); assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true)); Fields fields = response.getFields(); assertThat(fields.size(), equalTo(fieldNames.length)); for (String fieldName : fieldNames) { // MemoryIndex does not support payloads checkBrownFoxTermVector(fields, fieldName, false); } } } private void checkBrownFoxTermVector(Fields fields, String fieldName, boolean withPayloads) throws IOException { String[] values = {"brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the"}; int[] freq = {1, 1, 1, 1, 1, 1, 1, 2}; int[][] pos = {{2}, {8}, {3}, {4}, {7}, {5}, {1}, {0, 6}}; int[][] startOffset = {{10}, {40}, {16}, {20}, {35}, {26}, {4}, {0, 31}}; int[][] endOffset = {{15}, {43}, {19}, {25}, {39}, {30}, {9}, {3, 34}}; Terms terms = fields.terms(fieldName); assertThat(terms.size(), equalTo(8L)); TermsEnum iterator = terms.iterator(); for (int j = 0; j < values.length; j++) { String string = values[j]; BytesRef next = iterator.next(); assertThat(next, notNullValue()); assertThat("expected " + string, string, equalTo(next.utf8ToString())); assertThat(next, notNullValue()); // do not test ttf or doc frequency, because here we have many // shards and do not know how documents are distributed PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL); assertThat(docsAndPositions.nextDoc(), equalTo(0)); assertThat(freq[j], equalTo(docsAndPositions.freq())); int[] termPos = pos[j]; int[] termStartOffset = startOffset[j]; int[] termEndOffset = endOffset[j]; assertThat(termPos.length, equalTo(freq[j])); assertThat(termStartOffset.length, equalTo(freq[j])); assertThat(termEndOffset.length, equalTo(freq[j])); for (int k = 0; k < freq[j]; k++) { int nextPosition = docsAndPositions.nextPosition(); assertThat("term: " + string, nextPosition, equalTo(termPos[k])); assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k])); assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k])); if (withPayloads) { assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word"))); } } } assertThat(iterator.next(), nullValue()); } public void testDuelWithAndWithoutTermVectors() throws IOException, ExecutionException, InterruptedException { // setup indices String[] indexNames = new String[] {"with_tv", "without_tv"}; assertAcked(prepareCreate(indexNames[0]) .addMapping("type1", "field1", "type=text,term_vector=with_positions_offsets,analyzer=keyword")); assertAcked(prepareCreate(indexNames[1]) .addMapping("type1", "field1", "type=text,term_vector=no,analyzer=keyword")); ensureGreen(); // index documents with and without term vectors String[] content = new String[]{ "Generating a random permutation of a sequence (such as when shuffling cards).", "Selecting a random sample of a population (important in statistical sampling).", "Allocating experimental units via random assignment to a treatment or control condition.", "Generating random numbers: see Random number generation.", "Selecting a random sample of a population (important in statistical sampling).", "Allocating experimental units via random assignment to a treatment or control condition.", "Transforming a data stream (such as when using a scrambler in telecommunications)."}; List<IndexRequestBuilder> indexBuilders = new ArrayList<>(); for (String indexName : indexNames) { for (int id = 0; id < content.length; id++) { indexBuilders.add(client().prepareIndex() .setIndex(indexName) .setType("type1") .setId(String.valueOf(id)) .setSource("field1", content[id])); } } indexRandom(true, indexBuilders); // request tvs and compare from each index for (int id = 0; id < content.length; id++) { Fields[] fields = new Fields[2]; for (int j = 0; j < indexNames.length; j++) { TermVectorsResponse resp = client().prepareTermVector(indexNames[j], "type1", String.valueOf(id)) .setOffsets(true) .setPositions(true) .setSelectedFields("field1") .get(); assertThat("doc with index: " + indexNames[j] + ", type1 and id: " + id, resp.isExists(), equalTo(true)); fields[j] = resp.getFields(); } compareTermVectors("field1", fields[0], fields[1]); } } private void compareTermVectors(String fieldName, Fields fields0, Fields fields1) throws IOException { Terms terms0 = fields0.terms(fieldName); Terms terms1 = fields1.terms(fieldName); assertThat(terms0, notNullValue()); assertThat(terms1, notNullValue()); assertThat(terms0.size(), equalTo(terms1.size())); TermsEnum iter0 = terms0.iterator(); TermsEnum iter1 = terms1.iterator(); for (int i = 0; i < terms0.size(); i++) { BytesRef next0 = iter0.next(); assertThat(next0, notNullValue()); BytesRef next1 = iter1.next(); assertThat(next1, notNullValue()); // compare field value String string0 = next0.utf8ToString(); String string1 = next1.utf8ToString(); assertThat("expected: " + string0, string0, equalTo(string1)); // compare df and ttf assertThat("term: " + string0, iter0.docFreq(), equalTo(iter1.docFreq())); assertThat("term: " + string0, iter0.totalTermFreq(), equalTo(iter1.totalTermFreq())); // compare freq and docs PostingsEnum docsAndPositions0 = iter0.postings(null, PostingsEnum.ALL); PostingsEnum docsAndPositions1 = iter1.postings(null, PostingsEnum.ALL); assertThat("term: " + string0, docsAndPositions0.nextDoc(), equalTo(docsAndPositions1.nextDoc())); assertThat("term: " + string0, docsAndPositions0.freq(), equalTo(docsAndPositions1.freq())); // compare position, start offsets and end offsets for (int j = 0; j < docsAndPositions0.freq(); j++) { assertThat("term: " + string0, docsAndPositions0.nextPosition(), equalTo(docsAndPositions1.nextPosition())); assertThat("term: " + string0, docsAndPositions0.startOffset(), equalTo(docsAndPositions1.startOffset())); assertThat("term: " + string0, docsAndPositions0.endOffset(), equalTo(docsAndPositions1.endOffset())); } } assertThat(iter0.next(), nullValue()); assertThat(iter1.next(), nullValue()); } public void testSimpleWildCards() throws IOException { int numFields = 25; XContentBuilder mapping = jsonBuilder().startObject().startObject("type1").startObject("properties"); XContentBuilder source = jsonBuilder().startObject(); for (int i = 0; i < numFields; i++) { mapping.startObject("field" + i) .field("type", "text") .field("term_vector", randomBoolean() ? "yes" : "no") .endObject(); source.field("field" + i, "some text here"); } source.endObject(); mapping.endObject().endObject().endObject(); assertAcked(prepareCreate("test").addAlias(new Alias("alias")).addMapping("type1", mapping)); ensureGreen(); client().prepareIndex("test", "type1", "0").setSource(source).get(); refresh(); TermVectorsResponse response = client().prepareTermVectors(indexOrAlias(), "type1", "0").setSelectedFields("field*").get(); assertThat("Doc doesn't exists but should", response.isExists(), equalTo(true)); assertThat(response.getIndex(), equalTo("test")); assertThat("All term vectors should have been generated", response.getFields().size(), equalTo(numFields)); } public void testArtificialVsExisting() throws ExecutionException, InterruptedException, IOException { // setup indices Settings.Builder settings = Settings.builder() .put(indexSettings()) .put("index.analysis.analyzer", "standard"); assertAcked(prepareCreate("test") .setSettings(settings) .addMapping("type1", "field1", "type=text,term_vector=with_positions_offsets")); ensureGreen(); // index documents existing document String[] content = new String[]{ "Generating a random permutation of a sequence (such as when shuffling cards).", "Selecting a random sample of a population (important in statistical sampling).", "Allocating experimental units via random assignment to a treatment or control condition.", "Generating random numbers: see Random number generation."}; List<IndexRequestBuilder> indexBuilders = new ArrayList<>(); for (int i = 0; i < content.length; i++) { indexBuilders.add(client().prepareIndex() .setIndex("test") .setType("type1") .setId(String.valueOf(i)) .setSource("field1", content[i])); } indexRandom(true, indexBuilders); for (int i = 0; i < content.length; i++) { // request tvs from existing document TermVectorsResponse respExisting = client().prepareTermVectors("test", "type1", String.valueOf(i)) .setOffsets(true) .setPositions(true) .setFieldStatistics(true) .setTermStatistics(true) .get(); assertThat("doc with index: test, type1 and id: existing", respExisting.isExists(), equalTo(true)); // request tvs from artificial document TermVectorsResponse respArtificial = client().prepareTermVectors() .setIndex("test") .setType("type1") .setRouting(String.valueOf(i)) // ensure we get the stats from the same shard as existing doc .setDoc(jsonBuilder() .startObject() .field("field1", content[i]) .endObject()) .setOffsets(true) .setPositions(true) .setFieldStatistics(true) .setTermStatistics(true) .get(); assertThat("doc with index: test, type1 and id: " + String.valueOf(i), respArtificial.isExists(), equalTo(true)); // compare existing tvs with artificial compareTermVectors("field1", respExisting.getFields(), respArtificial.getFields()); } } public void testArtificialNoDoc() throws IOException { // setup indices Settings.Builder settings = Settings.builder() .put(indexSettings()) .put("index.analysis.analyzer", "standard"); assertAcked(prepareCreate("test") .setSettings(settings) .addMapping("type1", "field1", "type=text")); ensureGreen(); // request tvs from artificial document String text = "the quick brown fox jumps over the lazy dog"; TermVectorsResponse resp = client().prepareTermVectors() .setIndex("test") .setType("type1") .setDoc(jsonBuilder() .startObject() .field("field1", text) .endObject()) .setOffsets(true) .setPositions(true) .setFieldStatistics(true) .setTermStatistics(true) .get(); assertThat(resp.isExists(), equalTo(true)); checkBrownFoxTermVector(resp.getFields(), "field1", false); // Since the index is empty, all of artificial document's "term_statistics" should be 0/absent Terms terms = resp.getFields().terms("field1"); assertEquals("sumDocFreq should be 0 for a non-existing field!", 0, terms.getSumDocFreq()); assertEquals("sumTotalTermFreq should be 0 for a non-existing field!", 0, terms.getSumTotalTermFreq()); TermsEnum termsEnum = terms.iterator(); // we're guaranteed to receive terms for that field while (termsEnum.next() != null) { String term = termsEnum.term().utf8ToString(); assertEquals("term [" + term + "] does not exist in the index; ttf should be 0!", 0, termsEnum.totalTermFreq()); } } public void testPerFieldAnalyzer() throws IOException { int numFields = 25; // setup mapping and document source Set<String> withTermVectors = new HashSet<>(); XContentBuilder mapping = jsonBuilder().startObject().startObject("type1").startObject("properties"); XContentBuilder source = jsonBuilder().startObject(); for (int i = 0; i < numFields; i++) { String fieldName = "field" + i; if (randomBoolean()) { withTermVectors.add(fieldName); } mapping.startObject(fieldName) .field("type", "text") .field("term_vector", withTermVectors.contains(fieldName) ? "yes" : "no") .endObject(); source.field(fieldName, "some text here"); } source.endObject(); mapping.endObject().endObject().endObject(); // setup indices with mapping Settings.Builder settings = Settings.builder() .put(indexSettings()) .put("index.analysis.analyzer", "standard"); assertAcked(prepareCreate("test") .addAlias(new Alias("alias")) .setSettings(settings) .addMapping("type1", mapping)); ensureGreen(); // index a single document with prepared source client().prepareIndex("test", "type1", "0").setSource(source).get(); refresh(); // create random per_field_analyzer and selected fields Map<String, String> perFieldAnalyzer = new HashMap<>(); Set<String> selectedFields = new HashSet<>(); for (int i = 0; i < numFields; i++) { if (randomBoolean()) { perFieldAnalyzer.put("field" + i, "keyword"); } if (randomBoolean()) { perFieldAnalyzer.put("non_existing" + i, "keyword"); } if (randomBoolean()) { selectedFields.add("field" + i); } if (randomBoolean()) { selectedFields.add("non_existing" + i); } } // selected fields not specified TermVectorsResponse response = client().prepareTermVectors(indexOrAlias(), "type1", "0") .setPerFieldAnalyzer(perFieldAnalyzer) .get(); // should return all fields that have terms vectors, some with overridden analyzer checkAnalyzedFields(response.getFields(), withTermVectors, perFieldAnalyzer); // selected fields specified including some not in the mapping response = client().prepareTermVectors(indexOrAlias(), "type1", "0") .setSelectedFields(selectedFields.toArray(Strings.EMPTY_ARRAY)) .setPerFieldAnalyzer(perFieldAnalyzer) .get(); // should return only the specified valid fields, with some with overridden analyzer checkAnalyzedFields(response.getFields(), selectedFields, perFieldAnalyzer); } private void checkAnalyzedFields(Fields fieldsObject, Set<String> fieldNames, Map<String, String> perFieldAnalyzer) throws IOException { Set<String> validFields = new HashSet<>(); for (String fieldName : fieldNames){ if (fieldName.startsWith("non_existing")) { assertThat("Non existing field\"" + fieldName + "\" should not be returned!", fieldsObject.terms(fieldName), nullValue()); continue; } Terms terms = fieldsObject.terms(fieldName); assertThat("Existing field " + fieldName + "should have been returned", terms, notNullValue()); // check overridden by keyword analyzer ... if (perFieldAnalyzer.containsKey(fieldName)) { TermsEnum iterator = terms.iterator(); assertThat("Analyzer for " + fieldName + " should have been overridden!", iterator.next().utf8ToString(), equalTo("some text here")); assertThat(iterator.next(), nullValue()); } validFields.add(fieldName); } // ensure no other fields are returned assertThat("More fields than expected are returned!", fieldsObject.size(), equalTo(validFields.size())); } private static String indexOrAlias() { return randomBoolean() ? "test" : "alias"; } public void testTermVectorsWithVersion() { assertAcked(prepareCreate("test").addAlias(new Alias("alias")) .setSettings(Settings.builder().put("index.refresh_interval", -1))); ensureGreen(); TermVectorsResponse response = client().prepareTermVectors("test", "type1", "1").get(); assertThat(response.isExists(), equalTo(false)); logger.info("--> index doc 1"); client().prepareIndex("test", "type1", "1").setSource("field1", "value1", "field2", "value2").get(); // From translog: // version 0 means ignore version, which is the default response = client().prepareTermVectors(indexOrAlias(), "type1", "1").setVersion(Versions.MATCH_ANY).get(); assertThat(response.isExists(), equalTo(true)); assertThat(response.getId(), equalTo("1")); assertThat(response.getVersion(), equalTo(1L)); response = client().prepareTermVectors(indexOrAlias(), "type1", "1").setVersion(1).get(); assertThat(response.isExists(), equalTo(true)); assertThat(response.getId(), equalTo("1")); assertThat(response.getVersion(), equalTo(1L)); try { client().prepareGet(indexOrAlias(), "type1", "1").setVersion(2).get(); fail(); } catch (VersionConflictEngineException e) { //all good } // From Lucene index: refresh(); // version 0 means ignore version, which is the default response = client().prepareTermVectors(indexOrAlias(), "type1", "1").setVersion(Versions.MATCH_ANY).setRealtime(false).get(); assertThat(response.isExists(), equalTo(true)); assertThat(response.getId(), equalTo("1")); assertThat(response.getIndex(), equalTo("test")); assertThat(response.getVersion(), equalTo(1L)); response = client().prepareTermVectors(indexOrAlias(), "type1", "1").setVersion(1).setRealtime(false).get(); assertThat(response.isExists(), equalTo(true)); assertThat(response.getId(), equalTo("1")); assertThat(response.getIndex(), equalTo("test")); assertThat(response.getVersion(), equalTo(1L)); try { client().prepareGet(indexOrAlias(), "type1", "1").setVersion(2).setRealtime(false).get(); fail(); } catch (VersionConflictEngineException e) { //all good } logger.info("--> index doc 1 again, so increasing the version"); client().prepareIndex("test", "type1", "1").setSource("field1", "value1", "field2", "value2").get(); // From translog: // version 0 means ignore version, which is the default response = client().prepareTermVectors(indexOrAlias(), "type1", "1").setVersion(Versions.MATCH_ANY).get(); assertThat(response.isExists(), equalTo(true)); assertThat(response.getId(), equalTo("1")); assertThat(response.getIndex(), equalTo("test")); assertThat(response.getVersion(), equalTo(2L)); try { client().prepareGet(indexOrAlias(), "type1", "1").setVersion(1).get(); fail(); } catch (VersionConflictEngineException e) { //all good } response = client().prepareTermVectors(indexOrAlias(), "type1", "1").setVersion(2).get(); assertThat(response.isExists(), equalTo(true)); assertThat(response.getId(), equalTo("1")); assertThat(response.getIndex(), equalTo("test")); assertThat(response.getVersion(), equalTo(2L)); // From Lucene index: refresh(); // version 0 means ignore version, which is the default response = client().prepareTermVectors(indexOrAlias(), "type1", "1").setVersion(Versions.MATCH_ANY).setRealtime(false).get(); assertThat(response.isExists(), equalTo(true)); assertThat(response.getId(), equalTo("1")); assertThat(response.getIndex(), equalTo("test")); assertThat(response.getVersion(), equalTo(2L)); try { client().prepareGet(indexOrAlias(), "type1", "1").setVersion(1).setRealtime(false).get(); fail(); } catch (VersionConflictEngineException e) { //all good } response = client().prepareTermVectors(indexOrAlias(), "type1", "1").setVersion(2).setRealtime(false).get(); assertThat(response.isExists(), equalTo(true)); assertThat(response.getId(), equalTo("1")); assertThat(response.getIndex(), equalTo("test")); assertThat(response.getVersion(), equalTo(2L)); } public void testFilterLength() throws ExecutionException, InterruptedException, IOException { logger.info("Setting up the index ..."); Settings.Builder settings = Settings.builder() .put(indexSettings()) .put("index.analysis.analyzer", "keyword"); assertAcked(prepareCreate("test") .setSettings(settings) .addMapping("type1", "tags", "type=text")); int numTerms = scaledRandomIntBetween(10, 50); logger.info("Indexing one document with tags of increasing length ..."); List<String> tags = new ArrayList<>(); for (int i = 0; i < numTerms; i++) { String tag = "a"; for (int j = 0; j < i; j++) { tag += "a"; } tags.add(tag); } indexRandom(true, client().prepareIndex("test", "type1", "1").setSource("tags", tags)); logger.info("Checking best tags by longest to shortest size ..."); TermVectorsRequest.FilterSettings filterSettings = new TermVectorsRequest.FilterSettings(); filterSettings.maxNumTerms = numTerms; TermVectorsResponse response; for (int i = 0; i < numTerms; i++) { filterSettings.minWordLength = numTerms - i; response = client().prepareTermVectors("test", "type1", "1") .setSelectedFields("tags") .setFieldStatistics(true) .setTermStatistics(true) .setFilterSettings(filterSettings) .get(); checkBestTerms(response.getFields().terms("tags"), tags.subList((numTerms - i - 1), numTerms)); } } public void testFilterTermFreq() throws ExecutionException, InterruptedException, IOException { logger.info("Setting up the index ..."); Settings.Builder settings = Settings.builder() .put(indexSettings()) .put("index.analysis.analyzer", "keyword"); assertAcked(prepareCreate("test") .setSettings(settings) .addMapping("type1", "tags", "type=text")); logger.info("Indexing one document with tags of increasing frequencies ..."); int numTerms = scaledRandomIntBetween(10, 50); List<String> tags = new ArrayList<>(); List<String> uniqueTags = new ArrayList<>(); String tag; for (int i = 0; i < numTerms; i++) { tag = "tag_" + i; tags.add(tag); for (int j = 0; j < i; j++) { tags.add(tag); } uniqueTags.add(tag); } indexRandom(true, client().prepareIndex("test", "type1", "1").setSource("tags", tags)); logger.info("Checking best tags by highest to lowest term freq ..."); TermVectorsRequest.FilterSettings filterSettings = new TermVectorsRequest.FilterSettings(); TermVectorsResponse response; for (int i = 0; i < numTerms; i++) { filterSettings.maxNumTerms = i + 1; response = client().prepareTermVectors("test", "type1", "1") .setSelectedFields("tags") .setFieldStatistics(true) .setTermStatistics(true) .setFilterSettings(filterSettings) .get(); checkBestTerms(response.getFields().terms("tags"), uniqueTags.subList((numTerms - i - 1), numTerms)); } } public void testFilterDocFreq() throws ExecutionException, InterruptedException, IOException { logger.info("Setting up the index ..."); Settings.Builder settings = Settings.builder() .put(indexSettings()) .put("index.analysis.analyzer", "keyword") .put("index.number_of_shards", 1); // no dfs assertAcked(prepareCreate("test") .setSettings(settings) .addMapping("type1", "tags", "type=text")); int numDocs = scaledRandomIntBetween(10, 50); // as many terms as there are docs logger.info("Indexing {} documents with tags of increasing dfs ...", numDocs); List<IndexRequestBuilder> builders = new ArrayList<>(); List<String> tags = new ArrayList<>(); for (int i = 0; i < numDocs; i++) { tags.add("tag_" + i); builders.add(client().prepareIndex("test", "type1", i + "").setSource("tags", tags)); } indexRandom(true, builders); logger.info("Checking best terms by highest to lowest idf ..."); TermVectorsRequest.FilterSettings filterSettings = new TermVectorsRequest.FilterSettings(); TermVectorsResponse response; for (int i = 0; i < numDocs; i++) { filterSettings.maxNumTerms = i + 1; response = client().prepareTermVectors("test", "type1", (numDocs - 1) + "") .setSelectedFields("tags") .setFieldStatistics(true) .setTermStatistics(true) .setFilterSettings(filterSettings) .get(); checkBestTerms(response.getFields().terms("tags"), tags.subList((numDocs - i - 1), numDocs)); } } public void testArtificialDocWithPreference() throws ExecutionException, InterruptedException, IOException { // setup indices Settings.Builder settings = Settings.builder() .put(indexSettings()) .put("index.analysis.analyzer", "standard"); assertAcked(prepareCreate("test") .setSettings(settings) .addMapping("type1", "field1", "type=text,term_vector=with_positions_offsets")); ensureGreen(); // index document indexRandom(true, client().prepareIndex("test", "type1", "1").setSource("field1", "random permutation")); // Get search shards ClusterSearchShardsResponse searchShardsResponse = client().admin().cluster().prepareSearchShards("test").get(); List<Integer> shardIds = Arrays.stream(searchShardsResponse.getGroups()).map(s -> s.getShardId().id()).collect(Collectors.toList()); // request termvectors of artificial document from each shard int sumTotalTermFreq = 0; int sumDocFreq = 0; for (Integer shardId : shardIds) { TermVectorsResponse tvResponse = client().prepareTermVectors() .setIndex("test") .setType("type1") .setPreference("_shards:" + shardId) .setDoc(jsonBuilder().startObject().field("field1", "random permutation").endObject()) .setFieldStatistics(true) .setTermStatistics(true) .get(); Fields fields = tvResponse.getFields(); Terms terms = fields.terms("field1"); assertNotNull(terms); TermsEnum termsEnum = terms.iterator(); while (termsEnum.next() != null) { sumTotalTermFreq += termsEnum.totalTermFreq(); sumDocFreq += termsEnum.docFreq(); } } assertEquals("expected to find term statistics in exactly one shard!", 2, sumTotalTermFreq); assertEquals("expected to find term statistics in exactly one shard!", 2, sumDocFreq); } private void checkBestTerms(Terms terms, List<String> expectedTerms) throws IOException { final TermsEnum termsEnum = terms.iterator(); List<String> bestTerms = new ArrayList<>(); BytesRef text; while((text = termsEnum.next()) != null) { bestTerms.add(text.utf8ToString()); } Collections.sort(expectedTerms); Collections.sort(bestTerms); assertArrayEquals(expectedTerms.toArray(), bestTerms.toArray()); } }