/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.action.termvectors;
import com.carrotsearch.hppc.ObjectIntHashMap;
import org.apache.lucene.analysis.payloads.PayloadHelper;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.action.ActionFuture;
import org.elasticsearch.action.admin.cluster.shards.ClusterSearchShardsResponse;
import org.elasticsearch.action.admin.indices.alias.Alias;
import org.elasticsearch.action.index.IndexRequestBuilder;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.lucene.uid.Versions;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.ToXContent;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.index.engine.VersionConflictEngineException;
import org.elasticsearch.index.mapper.FieldMapper;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.stream.Collectors;
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertThrows;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.notNullValue;
import static org.hamcrest.Matchers.nullValue;
public class GetTermVectorsIT extends AbstractTermVectorsTestCase {
/**
 * Requests term vectors for ids 0..19 while the only indexed document has id "666", and checks that every
 * response is non-null, reports index "test", flags the document as missing and can still be rendered as JSON.
 */
public void testNoSuchDoc() throws Exception {
    XContentBuilder typeMapping = jsonBuilder();
    typeMapping.startObject().startObject("type1").startObject("properties");
    typeMapping.startObject("field");
    typeMapping.field("type", "text");
    typeMapping.field("term_vector", "with_positions_offsets_payloads");
    typeMapping.endObject();
    typeMapping.endObject().endObject().endObject();
    assertAcked(prepareCreate("test").addAlias(new Alias("alias")).addMapping("type1", typeMapping));

    // the single indexed document uses an id that is never requested below
    client().prepareIndex("test", "type1", "666").setSource("field", "foo bar").get();
    refresh();

    for (int missingId = 0; missingId < 20; missingId++) {
        TermVectorsResponse response = client()
                .termVectors(new TermVectorsRequest(indexOrAlias(), "type1", Integer.toString(missingId)))
                .actionGet();
        assertThat(response, notNullValue());
        assertThat(response.getIndex(), equalTo("test"));
        assertThat(response.isExists(), equalTo(false));
        // a response for a missing document must still serialize to json
        response.toXContent(jsonBuilder(), ToXContent.EMPTY_PARAMS);
    }
}
/**
 * Indexes a document whose mapped field contains only "?" and checks that asking for that field's term vectors
 * yields an existing document with no terms for the field.
 */
public void testExistingFieldWithNoTermVectorsNoNPE() throws Exception {
    XContentBuilder typeMapping = jsonBuilder();
    typeMapping.startObject().startObject("type1").startObject("properties");
    typeMapping.startObject("existingfield");
    typeMapping.field("type", "text");
    typeMapping.field("term_vector", "with_positions_offsets_payloads");
    typeMapping.endObject();
    typeMapping.endObject().endObject().endObject();
    assertAcked(prepareCreate("test").addAlias(new Alias("alias")).addMapping("type1", typeMapping));

    // a field value that is just a question mark ends up with null term vectors
    client().prepareIndex("test", "type1", "0").setSource("existingfield", "?").get();
    refresh();

    // the request must cope with the null term vectors
    TermVectorsResponse response = client()
            .termVectors(new TermVectorsRequest(indexOrAlias(), "type1", "0")
                    .selectedFields(new String[]{"existingfield"}))
            .actionGet();
    assertThat(response, notNullValue());
    assertThat(response.isExists(), equalTo(true));
    assertThat(response.getIndex(), equalTo("test"));
    assertThat(response.getFields().terms("existingfield"), nullValue());
}
/**
 * Requests term vectors, including term and field statistics, for a mapped field that the indexed document
 * does not contain, and checks that the response exists but carries no terms for that field.
 */
public void testExistingFieldButNotInDocNPE() throws Exception {
    XContentBuilder typeMapping = jsonBuilder();
    typeMapping.startObject().startObject("type1").startObject("properties");
    typeMapping.startObject("existingfield");
    typeMapping.field("type", "text");
    typeMapping.field("term_vector", "with_positions_offsets_payloads");
    typeMapping.endObject();
    typeMapping.endObject().endObject().endObject();
    assertAcked(prepareCreate("test").addAlias(new Alias("alias")).addMapping("type1", typeMapping));

    // the document only carries "anotherexistingfield", so the mapped "existingfield" is absent from it
    client().prepareIndex("test", "type1", "0").setSource("anotherexistingfield", 1).get();
    refresh();

    // randomly either select the absent field explicitly or pass no field selection at all
    TermVectorsResponse response = client()
            .termVectors(new TermVectorsRequest(indexOrAlias(), "type1", "0")
                    .selectedFields(randomBoolean() ? new String[]{"existingfield"} : null)
                    .termStatistics(true)
                    .fieldStatistics(true))
            .actionGet();
    assertThat(response, notNullValue());
    assertThat(response.isExists(), equalTo(true));
    assertThat(response.getIndex(), equalTo("test"));
    assertThat(response.getFields().terms("existingfield"), nullValue());
}
/**
 * Indexes one document per field (field0..field5, each with a different mapping) and checks that term vectors
 * come back as null for field0..field3 and as non-null for field4 and field5.
 */
public void testNotIndexedField() throws Exception {
    // must be of type string and indexed.
    assertAcked(prepareCreate("test")
            .addAlias(new Alias("alias"))
            .addMapping("type1",
                    "field0", "type=integer,", // no tvs
                    "field1", "type=text,index=false", // no tvs
                    "field2", "type=text,index=false,store=true", // no tvs
                    "field3", "type=text,index=false,term_vector=yes", // no tvs
                    "field4", "type=keyword", // yes tvs
                    "field5", "type=text,index=true")); // yes tvs

    // document n only contains "field<n>" with the value n
    List<IndexRequestBuilder> pendingDocs = new ArrayList<>();
    for (int n = 0; n < 6; n++) {
        pendingDocs.add(client().prepareIndex("test", "type1", Integer.toString(n)).setSource("field" + n, n));
    }
    indexRandom(true, pendingDocs);

    // fields 0-3: no term vectors expected
    for (int n = 0; n < 4; n++) {
        String field = "field" + n;
        TermVectorsResponse response = client().prepareTermVectors(indexOrAlias(), "type1", Integer.toString(n))
                .setSelectedFields(field)
                .get();
        assertThat(response, notNullValue());
        assertThat(response.isExists(), equalTo(true));
        assertThat(response.getIndex(), equalTo("test"));
        assertThat(field + " :", response.getFields().terms(field), nullValue());
    }

    // fields 4-5: term vectors expected
    for (int n = 4; n < 6; n++) {
        String field = "field" + n;
        TermVectorsResponse response = client().prepareTermVectors(indexOrAlias(), "type1", Integer.toString(n))
                .setSelectedFields(field)
                .get();
        assertThat(response.getIndex(), equalTo("test"));
        assertThat(field + " :", response.getFields().terms(field), notNullValue());
    }
}
/**
 * Indexes the "brown fox" sentence ten times into a field storing positions, offsets and payloads, then
 * verifies the returned term vectors of every document, payloads included.
 */
public void testSimpleTermVectors() throws IOException {
    XContentBuilder typeMapping = jsonBuilder();
    typeMapping.startObject().startObject("type1").startObject("properties");
    typeMapping.startObject("field");
    typeMapping.field("type", "text");
    typeMapping.field("term_vector", "with_positions_offsets_payloads");
    typeMapping.field("analyzer", "tv_test");
    typeMapping.endObject();
    typeMapping.endObject().endObject().endObject();
    assertAcked(prepareCreate("test").addMapping("type1", typeMapping)
            .addAlias(new Alias("alias"))
            .setSettings(Settings.builder()
                    .put(indexSettings())
                    .put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
                    .putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")));

    for (int docId = 0; docId < 10; docId++) {
        // character offsets of the sentence:
        // 0the3 4quick9 10brown15 16fox19 20jumps25 26over30
        // 31the34 35lazy39 40dog43
        client().prepareIndex("test", "type1", Integer.toString(docId))
                .setSource(jsonBuilder().startObject()
                        .field("field", "the quick brown fox jumps over the lazy dog")
                        .endObject())
                .get();
        refresh();
    }

    for (int docId = 0; docId < 10; docId++) {
        TermVectorsResponse response = client().prepareTermVectors(indexOrAlias(), "type1", Integer.toString(docId))
                .setPayloads(true)
                .setOffsets(true)
                .setPositions(true)
                .setSelectedFields()
                .get();
        assertThat(response.getIndex(), equalTo("test"));
        assertThat("doc id: " + docId + " doesn't exists but should", response.isExists(), equalTo(true));
        Fields returnedFields = response.getFields();
        assertThat(returnedFields.size(), equalTo(1));
        checkBrownFoxTermVector(returnedFields, "field", true);
    }
}
/**
 * Indexes the "brown fox" sentence ten times into a field whose term vector storage options are picked at
 * random, then fetches the term vectors with a random combination of positions, offsets and payloads requested.
 * Positions, offsets and payloads are expected to carry real values only when they are both stored and
 * requested; otherwise the postings must report {@code -1} (positions, offsets) or {@code null} (payloads).
 */
public void testRandomSingleTermVectors() throws IOException {
    FieldType ft = new FieldType();
    int config = randomInt(6);
    boolean storePositions = false;
    boolean storeOffsets = false;
    boolean storePayloads = false;
    boolean storeTermVectors = false;
    switch (config) {
        case 0: {
            // do nothing
            break;
        }
        case 1: {
            storeTermVectors = true;
            break;
        }
        case 2: {
            storeTermVectors = true;
            storePositions = true;
            break;
        }
        case 3: {
            storeTermVectors = true;
            storeOffsets = true;
            break;
        }
        case 4: {
            storeTermVectors = true;
            storePositions = true;
            storeOffsets = true;
            break;
        }
        case 5: {
            storeTermVectors = true;
            storePositions = true;
            storePayloads = true;
            break;
        }
        case 6: {
            storeTermVectors = true;
            storePositions = true;
            storeOffsets = true;
            storePayloads = true;
            break;
        }
    }
    ft.setStoreTermVectors(storeTermVectors);
    ft.setStoreTermVectorOffsets(storeOffsets);
    ft.setStoreTermVectorPayloads(storePayloads);
    ft.setStoreTermVectorPositions(storePositions);

    String optionString = FieldMapper.termVectorOptionsToString(ft);
    XContentBuilder mapping = jsonBuilder().startObject().startObject("type1")
            .startObject("properties")
                .startObject("field")
                    .field("type", "text")
                    .field("term_vector", optionString)
                    .field("analyzer", "tv_test")
                .endObject()
            .endObject()
            .endObject().endObject();
    assertAcked(prepareCreate("test").addMapping("type1", mapping)
            .setSettings(Settings.builder()
                    .put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
                    .putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")));
    for (int i = 0; i < 10; i++) {
        client().prepareIndex("test", "type1", Integer.toString(i))
                .setSource(jsonBuilder().startObject().field("field", "the quick brown fox jumps over the lazy dog")
                        // 0the3 4quick9 10brown15 16fox19 20jumps25 26over30
                        // 31the34 35lazy39 40dog43
                        .endObject()).execute().actionGet();
        refresh();
    }

    // expected terms in iteration order, with their frequency, positions and offsets
    String[] values = {"brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the"};
    int[] freq = {1, 1, 1, 1, 1, 1, 1, 2};
    int[][] pos = {{2}, {8}, {3}, {4}, {7}, {5}, {1}, {0, 6}};
    int[][] startOffset = {{10}, {40}, {16}, {20}, {35}, {26}, {4}, {0, 31}};
    int[][] endOffset = {{15}, {43}, {19}, {25}, {39}, {30}, {9}, {3, 34}};

    boolean isPayloadRequested = randomBoolean();
    boolean isOffsetRequested = randomBoolean();
    boolean isPositionsRequested = randomBoolean();
    String infoString = createInfoString(isPositionsRequested, isOffsetRequested, isPayloadRequested, optionString);
    for (int i = 0; i < 10; i++) {
        TermVectorsRequestBuilder resp = client().prepareTermVectors("test", "type1", Integer.toString(i))
                .setPayloads(isPayloadRequested).setOffsets(isOffsetRequested).setPositions(isPositionsRequested).setSelectedFields();
        TermVectorsResponse response = resp.execute().actionGet();
        assertThat(infoString + "doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true));
        Fields fields = response.getFields();
        assertThat(fields.size(), equalTo(ft.storeTermVectors() ? 1 : 0));
        if (ft.storeTermVectors()) {
            Terms terms = fields.terms("field");
            // fail with a readable assertion instead of a NullPointerException when the field is missing
            assertThat(infoString + "no terms returned for field", terms, notNullValue());
            assertThat(terms.size(), equalTo(8L));
            TermsEnum iterator = terms.iterator();
            for (int j = 0; j < values.length; j++) {
                String string = values[j];
                BytesRef next = iterator.next();
                assertThat(infoString, next, notNullValue());
                assertThat(infoString + "expected " + string, next.utf8ToString(), equalTo(string));
                // do not test ttf or doc frequency, because here we have
                // many shards and do not know how documents are distributed
                PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL);
                // docs and pos only returns something if positions or
                // payloads or offsets are stored / requested. Otherwise use
                // DocsEnum?
                assertThat(infoString, docsAndPositions.nextDoc(), equalTo(0));
                assertThat(infoString, docsAndPositions.freq(), equalTo(freq[j]));
                int[] termPos = pos[j];
                int[] termStartOffset = startOffset[j];
                int[] termEndOffset = endOffset[j];
                if (isPositionsRequested && storePositions) {
                    assertThat(infoString, termPos.length, equalTo(freq[j]));
                }
                if (isOffsetRequested && storeOffsets) {
                    assertThat(termStartOffset.length, equalTo(freq[j]));
                    assertThat(termEndOffset.length, equalTo(freq[j]));
                }
                for (int k = 0; k < freq[j]; k++) {
                    int nextPosition = docsAndPositions.nextPosition();
                    // only return something useful if requested and stored
                    if (isPositionsRequested && storePositions) {
                        assertThat(infoString + "positions for term: " + string, nextPosition, equalTo(termPos[k]));
                    } else {
                        assertThat(infoString + "positions for term: " + string, nextPosition, equalTo(-1));
                    }
                    // only return something useful if requested and stored
                    if (isPayloadRequested && storePayloads) {
                        assertThat(infoString + "payloads for term: " + string, docsAndPositions.getPayload(),
                                equalTo(new BytesRef("word")));
                    } else {
                        assertThat(infoString + "payloads for term: " + string, docsAndPositions.getPayload(), equalTo(null));
                    }
                    // only return something useful if requested and stored
                    if (isOffsetRequested && storeOffsets) {
                        assertThat(infoString + "startOffsets term: " + string, docsAndPositions.startOffset(),
                                equalTo(termStartOffset[k]));
                        assertThat(infoString + "endOffsets term: " + string, docsAndPositions.endOffset(),
                                equalTo(termEndOffset[k]));
                    } else {
                        assertThat(infoString + "startOffsets term: " + string, docsAndPositions.startOffset(), equalTo(-1));
                        assertThat(infoString + "endOffsets term: " + string, docsAndPositions.endOffset(), equalTo(-1));
                    }
                }
            }
            assertThat(iterator.next(), nullValue());
        }
    }
}
/**
 * Builds the two-line prefix used in assertion messages: the stored term vector options on the first line and
 * which of positions, offsets and payloads were requested on the second. Each line ends with a newline.
 */
private String createInfoString(boolean isPositionsRequested, boolean isOffsetRequested, boolean isPayloadRequested,
        String optionString) {
    StringBuilder info = new StringBuilder();
    info.append("Store config: ").append(optionString).append("\n");
    info.append("Requested: pos-").append(isPositionsRequested ? "yes" : "no");
    info.append(", offsets-").append(isOffsetRequested ? "yes" : "no");
    info.append(", payload- ").append(isPayloadRequested ? "yes" : "no");
    info.append("\n");
    return info.toString();
}
/**
 * Indexes the same generated documents through Elasticsearch and directly through Lucene, then for each
 * generated test configuration either expects the configured exception or validates the Elasticsearch
 * response against the term vectors read from the Lucene reader.
 */
public void testDuelESLucene() throws Exception {
    TestFieldSetting[] testFieldSettings = getFieldSettings();
    createIndexBasedOnFieldSettings("test", "alias", testFieldSettings);
    //we generate as many docs as many shards we have
    TestDoc[] testDocs = generateTestDocs("test", testFieldSettings);

    // the reader is only needed for the comparison loop; close it afterwards so it is not leaked
    try (DirectoryReader directoryReader = indexDocsWithLucene(testDocs)) {
        TestConfig[] testConfigs = generateTestConfigs(20, testDocs, testFieldSettings);

        for (TestConfig test : testConfigs) {
            TermVectorsRequestBuilder request = getRequestForConfig(test);
            if (test.expectedException != null) {
                assertThrows(request, test.expectedException);
                continue;
            }

            TermVectorsResponse response = request.get();
            Fields luceneTermVectors = getTermVectorsFromLucene(directoryReader, test.doc);
            validateResponse(response, luceneTermVectors, test);
        }
    }
}
/**
 * Indexes one document whose tokens carry randomly generated payloads, attached through a
 * {@code delimited_payload_filter} with a random delimiter and a random encoding (float, int or identity),
 * and verifies that the payloads returned with the term vectors equal the generated ones.
 */
public void testRandomPayloadWithDelimitedPayloadTokenFilter() throws IOException {
    //create the test document
    int encoding = randomIntBetween(0, 2);
    String encodingString = "";
    if (encoding == 0) {
        encodingString = "float";
    } else if (encoding == 1) {
        encodingString = "int";
    } else if (encoding == 2) {
        encodingString = "identity";
    }
    String[] tokens = crateRandomTokens();
    Map<String, List<BytesRef>> payloads = createPayloads(tokens, encoding);
    String delimiter = createRandomDelimiter(tokens);
    String queryString = createString(tokens, payloads, encoding, delimiter.charAt(0));
    //create the mapping
    XContentBuilder mapping = jsonBuilder().startObject().startObject("type1").startObject("properties")
            .startObject("field").field("type", "text").field("term_vector", "with_positions_offsets_payloads")
            .field("analyzer", "payload_test").endObject().endObject().endObject().endObject();
    assertAcked(prepareCreate("test").addMapping("type1", mapping).setSettings(
            Settings.builder()
                    .put(indexSettings())
                    .put("index.analysis.analyzer.payload_test.tokenizer", "whitespace")
                    .putArray("index.analysis.analyzer.payload_test.filter", "my_delimited_payload_filter")
                    .put("index.analysis.filter.my_delimited_payload_filter.delimiter", delimiter)
                    .put("index.analysis.filter.my_delimited_payload_filter.encoding", encodingString)
                    .put("index.analysis.filter.my_delimited_payload_filter.type", "delimited_payload_filter")));

    client().prepareIndex("test", "type1", Integer.toString(1))
            .setSource(jsonBuilder().startObject().field("field", queryString).endObject()).execute().actionGet();
    refresh();
    TermVectorsRequestBuilder resp = client().prepareTermVectors("test", "type1", Integer.toString(1))
            .setPayloads(true).setOffsets(true).setPositions(true).setSelectedFields();
    TermVectorsResponse response = resp.execute().actionGet();
    assertThat("doc id 1 doesn't exists but should", response.isExists(), equalTo(true));
    Fields fields = response.getFields();
    assertThat(fields.size(), equalTo(1));
    Terms terms = fields.terms("field");
    // fail with a readable assertion instead of a NullPointerException when the field is missing
    assertThat("no term vectors returned for field", terms, notNullValue());
    TermsEnum iterator = terms.iterator();
    while (iterator.next() != null) {
        String term = iterator.term().utf8ToString();
        PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL);
        // the null check has to come before the first dereference to be of any use
        assertNotNull(docsAndPositions);
        assertThat(docsAndPositions.nextDoc(), equalTo(0));
        List<BytesRef> curPayloads = payloads.get(term);
        assertThat(term, curPayloads, notNullValue());
        for (int k = 0; k < docsAndPositions.freq(); k++) {
            docsAndPositions.nextPosition();
            BytesRef actualPayload = docsAndPositions.getPayload();
            BytesRef expectedPayload = curPayloads.get(k);
            if (actualPayload != null) {
                String infoString = "\nterm: " + term + " has payload \n" + actualPayload
                        + "\n but should have payload \n" + expectedPayload;
                assertThat(infoString, actualPayload, equalTo(expectedPayload));
            } else {
                // a missing payload is only acceptable when an empty one was generated for this occurrence
                String infoString = "\nterm: " + term + " has no payload but should have payload \n" + expectedPayload;
                assertThat(infoString, expectedPayload.length, equalTo(0));
            }
        }
    }
    assertThat(iterator.next(), nullValue());
}
/**
 * Draws random one-character strings until it finds one that is not whitespace and does not occur in any of
 * the given tokens, and returns it.
 */
private String createRandomDelimiter(String[] tokens) {
    String candidate;
    boolean rejected;
    do {
        candidate = randomUnicodeOfLength(1);
        rejected = false;
        for (String token : tokens) {
            if (token.contains(candidate)) {
                rejected = true;
                break;
            }
        }
        if (Character.isWhitespace(candidate.charAt(0))) {
            rejected = true;
        }
    } while (rejected);
    return candidate;
}
/**
 * Renders the tokens as one space-terminated sequence, appending {@code delimiter} plus the decoded payload to
 * every token occurrence whose payload is non-empty.
 *
 * @param tokens    the tokens in document order, possibly with duplicates
 * @param payloads  per token, the payloads of its occurrences in document order
 * @param encoding  0 = float, 1 = int, 2 = identity (payload bytes read as UTF-8)
 * @param delimiter character placed between a token and its payload
 * @return the text to index; every token, including the last, is followed by a single space
 * @throws ElasticsearchException if {@code encoding} is none of 0, 1 or 2 and a non-empty payload is met
 */
private String createString(String[] tokens, Map<String, List<BytesRef>> payloads, int encoding, char delimiter) {
    StringBuilder result = new StringBuilder();
    // tracks, per token, the index of the occurrence currently being rendered
    ObjectIntHashMap<String> payloadCounter = new ObjectIntHashMap<>();
    for (String token : tokens) {
        if (!payloadCounter.containsKey(token)) {
            payloadCounter.put(token, 0);
        } else {
            payloadCounter.put(token, payloadCounter.get(token) + 1);
        }
        result.append(token);
        BytesRef payload = payloads.get(token).get(payloadCounter.get(token));
        if (payload.length > 0) {
            result.append(delimiter);
            switch (encoding) {
                case 0: {
                    result.append(Float.toString(PayloadHelper.decodeFloat(payload.bytes, payload.offset)));
                    break;
                }
                case 1: {
                    result.append(Integer.toString(PayloadHelper.decodeInt(payload.bytes, payload.offset)));
                    break;
                }
                case 2: {
                    result.append(payload.utf8ToString());
                    break;
                }
                default: {
                    throw new ElasticsearchException("unsupported encoding type");
                }
            }
        }
        result.append(" ");
    }
    return result.toString();
}
/**
 * Generates one payload per token occurrence. Each occurrence randomly gets either an empty payload or a
 * random payload in the requested encoding.
 *
 * @param tokens   the tokens in document order, possibly with duplicates
 * @param encoding 0 = random float, 1 = random int, 2 = random unicode string with whitespace replaced by 'w'
 * @return per token, the payloads of its occurrences in the order they appear in {@code tokens}
 * @throws ElasticsearchException if a payload is to be created and {@code encoding} is none of 0, 1 or 2
 */
private Map<String, List<BytesRef>> createPayloads(String[] tokens, int encoding) {
    Map<String, List<BytesRef>> payloads = new HashMap<>();
    for (String token : tokens) {
        List<BytesRef> tokenPayloads = payloads.computeIfAbsent(token, key -> new ArrayList<>());
        boolean createPayload = randomBoolean();
        if (createPayload) {
            switch (encoding) {
                case 0: {
                    float theFloat = randomFloat();
                    tokenPayloads.add(new BytesRef(PayloadHelper.encodeFloat(theFloat)));
                    break;
                }
                case 1: {
                    tokenPayloads.add(new BytesRef(PayloadHelper.encodeInt(randomInt())));
                    break;
                }
                case 2: {
                    String payload = randomUnicodeOfLengthBetween(50, 100);
                    // strip whitespace from the payload by substituting every whitespace character with 'w'
                    for (int c = 0; c < payload.length(); c++) {
                        if (Character.isWhitespace(payload.charAt(c))) {
                            payload = payload.replace(payload.charAt(c), 'w');
                        }
                    }
                    tokenPayloads.add(new BytesRef(payload));
                    break;
                }
                default: {
                    throw new ElasticsearchException("unsupported encoding type");
                }
            }
        } else {
            // an empty payload marks an occurrence that gets no payload
            tokenPayloads.add(new BytesRef());
        }
    }
    return payloads;
}
/**
 * Returns between 3 and 15 tokens, each drawn at random (duplicates allowed) from
 * "the", "quick", "brown" and "fox".
 */
private String[] crateRandomTokens() {
    final String[] vocabulary = { "the", "quick", "brown", "fox" };
    String[] picked = new String[randomIntBetween(3, 15)];
    for (int slot = 0; slot < picked.length; slot++) {
        int choice = randomIntBetween(0, vocabulary.length - 1);
        picked[slot] = vocabulary[choice];
    }
    return picked;
}
// like testSimpleTermVectors but we create fields with no term vectors
/**
 * Maps ten fields, each randomly with or without stored term vectors, indexes the "brown fox" sentence into
 * all of them and checks that term vectors come back for every field. Payloads are not checked.
 */
public void testSimpleTermVectorsWithGenerate() throws IOException {
    String[] fieldNames = new String[10];
    for (int n = 0; n < fieldNames.length; n++) {
        fieldNames[n] = "field" + n;
    }

    XContentBuilder typeMapping = jsonBuilder().startObject().startObject("type1").startObject("properties");
    XContentBuilder docSource = jsonBuilder().startObject();
    for (String name : fieldNames) {
        typeMapping.startObject(name);
        typeMapping.field("type", "text");
        typeMapping.field("term_vector", randomBoolean() ? "with_positions_offsets_payloads" : "no");
        typeMapping.field("analyzer", "tv_test");
        typeMapping.endObject();
        docSource.field(name, "the quick brown fox jumps over the lazy dog");
    }
    typeMapping.endObject().endObject().endObject();
    docSource.endObject();

    assertAcked(prepareCreate("test")
            .addMapping("type1", typeMapping)
            .setSettings(Settings.builder()
                    .put(indexSettings())
                    .put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
                    .putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")));
    ensureGreen();

    for (int docId = 0; docId < 10; docId++) {
        client().prepareIndex("test", "type1", Integer.toString(docId)).setSource(docSource).get();
        refresh();
    }

    for (int docId = 0; docId < 10; docId++) {
        TermVectorsResponse response = client().prepareTermVectors("test", "type1", Integer.toString(docId))
                .setPayloads(true)
                .setOffsets(true)
                .setPositions(true)
                .setSelectedFields(fieldNames)
                .get();
        assertThat("doc id: " + docId + " doesn't exists but should", response.isExists(), equalTo(true));
        Fields returnedFields = response.getFields();
        assertThat(returnedFields.size(), equalTo(fieldNames.length));
        for (String name : fieldNames) {
            // MemoryIndex does not support payloads
            checkBrownFoxTermVector(returnedFields, name, false);
        }
    }
}
/**
 * Asserts that the term vector of {@code fieldName} is exactly the one expected for the sentence
 * "the quick brown fox jumps over the lazy dog": eight terms in sorted order with the expected frequency,
 * positions and character offsets.
 *
 * @param fields       the fields of a term vectors response
 * @param fieldName    the field whose terms are checked
 * @param withPayloads whether every position must additionally carry the payload "word"
 */
private void checkBrownFoxTermVector(Fields fields, String fieldName, boolean withPayloads) throws IOException {
    String[] values = {"brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the"};
    int[] freq = {1, 1, 1, 1, 1, 1, 1, 2};
    int[][] pos = {{2}, {8}, {3}, {4}, {7}, {5}, {1}, {0, 6}};
    int[][] startOffset = {{10}, {40}, {16}, {20}, {35}, {26}, {4}, {0, 31}};
    int[][] endOffset = {{15}, {43}, {19}, {25}, {39}, {30}, {9}, {3, 34}};

    Terms terms = fields.terms(fieldName);
    // fail with a readable assertion instead of a NullPointerException when the field is missing
    assertThat("no term vectors returned for field [" + fieldName + "]", terms, notNullValue());
    assertThat(terms.size(), equalTo(8L));
    TermsEnum iterator = terms.iterator();
    for (int j = 0; j < values.length; j++) {
        String string = values[j];
        BytesRef next = iterator.next();
        assertThat(next, notNullValue());
        assertThat("expected " + string, next.utf8ToString(), equalTo(string));
        // do not test ttf or doc frequency, because here we have many
        // shards and do not know how documents are distributed
        PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL);
        assertThat(docsAndPositions.nextDoc(), equalTo(0));
        assertThat(docsAndPositions.freq(), equalTo(freq[j]));
        int[] termPos = pos[j];
        int[] termStartOffset = startOffset[j];
        int[] termEndOffset = endOffset[j];
        // sanity check of the expectation tables themselves
        assertThat(termPos.length, equalTo(freq[j]));
        assertThat(termStartOffset.length, equalTo(freq[j]));
        assertThat(termEndOffset.length, equalTo(freq[j]));
        for (int k = 0; k < freq[j]; k++) {
            int nextPosition = docsAndPositions.nextPosition();
            assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
            assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
            assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
            if (withPayloads) {
                assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
            }
        }
    }
    assertThat(iterator.next(), nullValue());
}
/**
 * Indexes the same documents into one index that stores term vectors and one that does not, then checks that
 * the term vectors returned for each document are identical for both indices.
 */
public void testDuelWithAndWithoutTermVectors() throws IOException, ExecutionException, InterruptedException {
    // setup indices
    String[] indexNames = new String[] {"with_tv", "without_tv"};
    assertAcked(prepareCreate(indexNames[0])
            .addMapping("type1", "field1", "type=text,term_vector=with_positions_offsets,analyzer=keyword"));
    assertAcked(prepareCreate(indexNames[1])
            .addMapping("type1", "field1", "type=text,term_vector=no,analyzer=keyword"));
    ensureGreen();

    // index documents with and without term vectors
    String[] content = new String[]{
            "Generating a random permutation of a sequence (such as when shuffling cards).",
            "Selecting a random sample of a population (important in statistical sampling).",
            "Allocating experimental units via random assignment to a treatment or control condition.",
            "Generating random numbers: see Random number generation.",
            "Selecting a random sample of a population (important in statistical sampling).",
            "Allocating experimental units via random assignment to a treatment or control condition.",
            "Transforming a data stream (such as when using a scrambler in telecommunications)."};

    List<IndexRequestBuilder> indexBuilders = new ArrayList<>();
    for (String indexName : indexNames) {
        for (int id = 0; id < content.length; id++) {
            indexBuilders.add(client().prepareIndex()
                    .setIndex(indexName)
                    .setType("type1")
                    .setId(String.valueOf(id))
                    .setSource("field1", content[id]));
        }
    }
    indexRandom(true, indexBuilders);

    // request tvs and compare from each index
    for (int id = 0; id < content.length; id++) {
        Fields[] fields = new Fields[2];
        for (int j = 0; j < indexNames.length; j++) {
            // use prepareTermVectors like every other test in this class
            TermVectorsResponse resp = client().prepareTermVectors(indexNames[j], "type1", String.valueOf(id))
                    .setOffsets(true)
                    .setPositions(true)
                    .setSelectedFields("field1")
                    .get();
            assertThat("doc with index: " + indexNames[j] + ", type1 and id: " + id, resp.isExists(), equalTo(true));
            fields[j] = resp.getFields();
        }
        compareTermVectors("field1", fields[0], fields[1]);
    }
}
/**
 * Walks the terms of {@code fieldName} in both field sets in lockstep and asserts that they agree on the
 * number of terms, each term's text, document frequency, total term frequency, first document, frequency and
 * every position with its start and end offset. Both iterators must be exhausted at the end.
 */
private void compareTermVectors(String fieldName, Fields fields0, Fields fields1) throws IOException {
    Terms leftTerms = fields0.terms(fieldName);
    Terms rightTerms = fields1.terms(fieldName);
    assertThat(leftTerms, notNullValue());
    assertThat(rightTerms, notNullValue());
    assertThat(leftTerms.size(), equalTo(rightTerms.size()));

    TermsEnum leftEnum = leftTerms.iterator();
    TermsEnum rightEnum = rightTerms.iterator();
    for (int seen = 0; seen < leftTerms.size(); seen++) {
        BytesRef leftTerm = leftEnum.next();
        assertThat(leftTerm, notNullValue());
        BytesRef rightTerm = rightEnum.next();
        assertThat(rightTerm, notNullValue());

        // term text
        String leftText = leftTerm.utf8ToString();
        String rightText = rightTerm.utf8ToString();
        assertThat("expected: " + leftText, leftText, equalTo(rightText));
        String label = "term: " + leftText;

        // term statistics
        assertThat(label, leftEnum.docFreq(), equalTo(rightEnum.docFreq()));
        assertThat(label, leftEnum.totalTermFreq(), equalTo(rightEnum.totalTermFreq()));

        // first document and frequency
        PostingsEnum leftPostings = leftEnum.postings(null, PostingsEnum.ALL);
        PostingsEnum rightPostings = rightEnum.postings(null, PostingsEnum.ALL);
        assertThat(label, leftPostings.nextDoc(), equalTo(rightPostings.nextDoc()));
        assertThat(label, leftPostings.freq(), equalTo(rightPostings.freq()));

        // every occurrence: position, then start and end offset
        for (int occurrence = 0; occurrence < leftPostings.freq(); occurrence++) {
            assertThat(label, leftPostings.nextPosition(), equalTo(rightPostings.nextPosition()));
            assertThat(label, leftPostings.startOffset(), equalTo(rightPostings.startOffset()));
            assertThat(label, leftPostings.endOffset(), equalTo(rightPostings.endOffset()));
        }
    }
    assertThat(leftEnum.next(), nullValue());
    assertThat(rightEnum.next(), nullValue());
}
/**
 * Maps 25 fields, each randomly with or without stored term vectors, indexes one document containing all of
 * them and checks that selecting "field*" returns term vectors for every field.
 */
public void testSimpleWildCards() throws IOException {
    int numFields = 25;

    XContentBuilder typeMapping = jsonBuilder().startObject().startObject("type1").startObject("properties");
    XContentBuilder docSource = jsonBuilder().startObject();
    for (int n = 0; n < numFields; n++) {
        String name = "field" + n;
        typeMapping.startObject(name);
        typeMapping.field("type", "text");
        typeMapping.field("term_vector", randomBoolean() ? "yes" : "no");
        typeMapping.endObject();
        docSource.field(name, "some text here");
    }
    docSource.endObject();
    typeMapping.endObject().endObject().endObject();

    assertAcked(prepareCreate("test").addAlias(new Alias("alias")).addMapping("type1", typeMapping));
    ensureGreen();

    client().prepareIndex("test", "type1", "0").setSource(docSource).get();
    refresh();

    TermVectorsResponse response = client().prepareTermVectors(indexOrAlias(), "type1", "0")
            .setSelectedFields("field*")
            .get();
    assertThat("Doc doesn't exists but should", response.isExists(), equalTo(true));
    assertThat(response.getIndex(), equalTo("test"));
    assertThat("All term vectors should have been generated", response.getFields().size(), equalTo(numFields));
}
/**
 * For each indexed document, fetches its term vectors once by id and once by submitting the same content as
 * an artificial document routed with the same id, and checks that both results are identical, statistics
 * included.
 */
public void testArtificialVsExisting() throws ExecutionException, InterruptedException, IOException {
    // setup indices
    Settings.Builder indexConfig = Settings.builder()
            .put(indexSettings())
            .put("index.analysis.analyzer", "standard");
    assertAcked(prepareCreate("test")
            .setSettings(indexConfig)
            .addMapping("type1", "field1", "type=text,term_vector=with_positions_offsets"));
    ensureGreen();

    // index documents existing document
    String[] texts = {
            "Generating a random permutation of a sequence (such as when shuffling cards).",
            "Selecting a random sample of a population (important in statistical sampling).",
            "Allocating experimental units via random assignment to a treatment or control condition.",
            "Generating random numbers: see Random number generation."};

    List<IndexRequestBuilder> pendingDocs = new ArrayList<>();
    for (int docId = 0; docId < texts.length; docId++) {
        pendingDocs.add(client().prepareIndex("test", "type1", Integer.toString(docId))
                .setSource("field1", texts[docId]));
    }
    indexRandom(true, pendingDocs);

    for (int docId = 0; docId < texts.length; docId++) {
        // request tvs from existing document
        TermVectorsResponse fromIndex = client().prepareTermVectors("test", "type1", Integer.toString(docId))
                .setOffsets(true)
                .setPositions(true)
                .setFieldStatistics(true)
                .setTermStatistics(true)
                .get();
        assertThat("doc with index: test, type1 and id: existing", fromIndex.isExists(), equalTo(true));

        // request tvs from artificial document
        TermVectorsResponse fromArtificial = client().prepareTermVectors()
                .setIndex("test")
                .setType("type1")
                .setRouting(Integer.toString(docId)) // ensure we get the stats from the same shard as existing doc
                .setDoc(jsonBuilder()
                        .startObject()
                            .field("field1", texts[docId])
                        .endObject())
                .setOffsets(true)
                .setPositions(true)
                .setFieldStatistics(true)
                .setTermStatistics(true)
                .get();
        assertThat("doc with index: test, type1 and id: " + docId, fromArtificial.isExists(), equalTo(true));

        // compare existing tvs with artificial
        compareTermVectors("field1", fromIndex.getFields(), fromArtificial.getFields());
    }
}
/**
 * Requests term vectors for an artificial document against an index that holds no documents, and checks that
 * the terms are generated as expected while the field and term statistics are all zero.
 */
public void testArtificialNoDoc() throws IOException {
    // setup indices
    Settings.Builder indexConfig = Settings.builder()
            .put(indexSettings())
            .put("index.analysis.analyzer", "standard");
    assertAcked(prepareCreate("test")
            .setSettings(indexConfig)
            .addMapping("type1", "field1", "type=text"));
    ensureGreen();

    // request tvs from artificial document
    String sentence = "the quick brown fox jumps over the lazy dog";
    TermVectorsResponse response = client().prepareTermVectors()
            .setIndex("test")
            .setType("type1")
            .setDoc(jsonBuilder()
                    .startObject()
                        .field("field1", sentence)
                    .endObject())
            .setOffsets(true)
            .setPositions(true)
            .setFieldStatistics(true)
            .setTermStatistics(true)
            .get();
    assertThat(response.isExists(), equalTo(true));
    checkBrownFoxTermVector(response.getFields(), "field1", false);

    // Since the index is empty, all of artificial document's "term_statistics" should be 0/absent
    Terms fieldTerms = response.getFields().terms("field1");
    assertEquals("sumDocFreq should be 0 for a non-existing field!", 0, fieldTerms.getSumDocFreq());
    assertEquals("sumTotalTermFreq should be 0 for a non-existing field!", 0, fieldTerms.getSumTotalTermFreq());

    // we're guaranteed to receive terms for that field
    TermsEnum cursor = fieldTerms.iterator();
    while (cursor.next() != null) {
        String termText = cursor.term().utf8ToString();
        assertEquals("term [" + termText + "] does not exist in the index; ttf should be 0!", 0, cursor.totalTermFreq());
    }
}
/**
 * Indexes one document with 25 text fields, a random subset of which store term vectors, and then
 * requests term vectors with a randomly built per-field analyzer override, once without and once
 * with an explicit field selection. Both the override map and the selection deliberately contain
 * names that are not part of the mapping.
 */
public void testPerFieldAnalyzer() throws IOException {
    final int numFields = 25;

    // build the mapping and the document source side by side
    Set<String> withTermVectors = new HashSet<>();
    XContentBuilder mapping = jsonBuilder().startObject().startObject("type1").startObject("properties");
    XContentBuilder source = jsonBuilder().startObject();
    for (int fieldNo = 0; fieldNo < numFields; fieldNo++) {
        final String name = "field" + fieldNo;
        final boolean storeTermVector = randomBoolean();
        if (storeTermVector) {
            withTermVectors.add(name);
        }
        mapping.startObject(name);
        mapping.field("type", "text");
        mapping.field("term_vector", storeTermVector ? "yes" : "no");
        mapping.endObject();
        source.field(name, "some text here");
    }
    source.endObject();
    mapping.endObject().endObject().endObject();

    // create the index (reachable through an alias as well) using that mapping
    Settings.Builder indexConfig = Settings.builder();
    indexConfig.put(indexSettings());
    indexConfig.put("index.analysis.analyzer", "standard");
    assertAcked(prepareCreate("test").addAlias(new Alias("alias")).setSettings(indexConfig).addMapping("type1", mapping));
    ensureGreen();

    // a single document is enough
    client().prepareIndex("test", "type1", "0").setSource(source).get();
    refresh();

    // randomly choose analyzer overrides and selected fields, for mapped and for unmapped names
    Map<String, String> perFieldAnalyzer = new HashMap<>();
    Set<String> selectedFields = new HashSet<>();
    for (int fieldNo = 0; fieldNo < numFields; fieldNo++) {
        final String mapped = "field" + fieldNo;
        final String unmapped = "non_existing" + fieldNo;
        if (randomBoolean()) {
            perFieldAnalyzer.put(mapped, "keyword");
        }
        if (randomBoolean()) {
            perFieldAnalyzer.put(unmapped, "keyword");
        }
        if (randomBoolean()) {
            selectedFields.add(mapped);
        }
        if (randomBoolean()) {
            selectedFields.add(unmapped);
        }
    }

    // without a selection: expect every field that stores term vectors, some analyzed with the override
    TermVectorsResponse withoutSelection = client().prepareTermVectors(indexOrAlias(), "type1", "0")
        .setPerFieldAnalyzer(perFieldAnalyzer)
        .get();
    checkAnalyzedFields(withoutSelection.getFields(), withTermVectors, perFieldAnalyzer);

    // with a selection: expect exactly the selected fields that exist, some analyzed with the override
    TermVectorsResponse withSelection = client().prepareTermVectors(indexOrAlias(), "type1", "0")
        .setSelectedFields(selectedFields.toArray(Strings.EMPTY_ARRAY))
        .setPerFieldAnalyzer(perFieldAnalyzer)
        .get();
    checkAnalyzedFields(withSelection.getFields(), selectedFields, perFieldAnalyzer);
}
/**
 * Verifies which fields a term vectors response contains and how they were analyzed.
 *
 * @param fieldsObject     the fields returned by the term vectors request
 * @param fieldNames       the field names that were expected or requested; names starting with
 *                         {@code non_existing} must be absent from the response, all others present
 * @param perFieldAnalyzer the analyzer overrides that were sent with the request; a present field
 *                         listed here must consist of the single term {@code "some text here"}
 * @throws IOException if reading the returned terms fails
 */
private void checkAnalyzedFields(Fields fieldsObject, Set<String> fieldNames, Map<String, String> perFieldAnalyzer) throws IOException {
    Set<String> validFields = new HashSet<>();
    for (String fieldName : fieldNames) {
        if (fieldName.startsWith("non_existing")) {
            assertThat("Non existing field \"" + fieldName + "\" should not be returned!", fieldsObject.terms(fieldName), nullValue());
            continue;
        }
        Terms terms = fieldsObject.terms(fieldName);
        assertThat("Existing field " + fieldName + " should have been returned", terms, notNullValue());
        // an overridden field must have been emitted as one single, untokenized term
        if (perFieldAnalyzer.containsKey(fieldName)) {
            TermsEnum iterator = terms.iterator();
            assertThat("Analyzer for " + fieldName + " should have been overridden!", iterator.next().utf8ToString(), equalTo("some text here"));
            assertThat(iterator.next(), nullValue());
        }
        validFields.add(fieldName);
    }
    // ensure no other fields are returned
    assertThat("More fields than expected are returned!", fieldsObject.size(), equalTo(validFields.size()));
}
/**
 * Randomly picks the name to address the test index by.
 *
 * @return either {@code "test"} or {@code "alias"}
 */
private static String indexOrAlias() {
    if (randomBoolean()) {
        return "test";
    }
    return "alias";
}
/**
 * Checks the version handling of term vectors requests for a document that is indexed twice:
 * requests that ignore the version or name the current version succeed, requests naming any
 * other version fail with a {@link VersionConflictEngineException}. Each case is exercised
 * both before a refresh (the "From translog" cases) and after one with realtime disabled
 * (the "From Lucene index" cases).
 */
public void testTermVectorsWithVersion() {
    assertAcked(prepareCreate("test").addAlias(new Alias("alias"))
        .setSettings(Settings.builder().put("index.refresh_interval", -1)));
    ensureGreen();

    TermVectorsResponse response = client().prepareTermVectors("test", "type1", "1").get();
    assertThat(response.isExists(), equalTo(false));

    logger.info("--> index doc 1");
    client().prepareIndex("test", "type1", "1").setSource("field1", "value1", "field2", "value2").get();

    // From translog:
    // Versions.MATCH_ANY asks to ignore the version
    response = client().prepareTermVectors(indexOrAlias(), "type1", "1").setVersion(Versions.MATCH_ANY).get();
    assertDocOneAtVersion(response, 1L);
    response = client().prepareTermVectors(indexOrAlias(), "type1", "1").setVersion(1).get();
    assertDocOneAtVersion(response, 1L);
    assertTermVectorsVersionConflict(2, true);

    // From Lucene index:
    refresh();
    response = client().prepareTermVectors(indexOrAlias(), "type1", "1").setVersion(Versions.MATCH_ANY).setRealtime(false).get();
    assertDocOneAtVersion(response, 1L);
    response = client().prepareTermVectors(indexOrAlias(), "type1", "1").setVersion(1).setRealtime(false).get();
    assertDocOneAtVersion(response, 1L);
    assertTermVectorsVersionConflict(2, false);

    logger.info("--> index doc 1 again, so increasing the version");
    client().prepareIndex("test", "type1", "1").setSource("field1", "value1", "field2", "value2").get();

    // From translog:
    response = client().prepareTermVectors(indexOrAlias(), "type1", "1").setVersion(Versions.MATCH_ANY).get();
    assertDocOneAtVersion(response, 2L);
    assertTermVectorsVersionConflict(1, true);
    response = client().prepareTermVectors(indexOrAlias(), "type1", "1").setVersion(2).get();
    assertDocOneAtVersion(response, 2L);

    // From Lucene index:
    refresh();
    response = client().prepareTermVectors(indexOrAlias(), "type1", "1").setVersion(Versions.MATCH_ANY).setRealtime(false).get();
    assertDocOneAtVersion(response, 2L);
    assertTermVectorsVersionConflict(1, false);
    response = client().prepareTermVectors(indexOrAlias(), "type1", "1").setVersion(2).setRealtime(false).get();
    assertDocOneAtVersion(response, 2L);
}

/** Asserts that the response describes the existing document "1" of index "test" at the given version. */
private void assertDocOneAtVersion(TermVectorsResponse response, long expectedVersion) {
    assertThat(response.isExists(), equalTo(true));
    assertThat(response.getId(), equalTo("1"));
    // the concrete index name is expected even when the request went through the alias
    assertThat(response.getIndex(), equalTo("test"));
    assertThat(response.getVersion(), equalTo(expectedVersion));
}

/**
 * Asserts that requesting the term vectors of document "1" with a version that is not the current
 * one is rejected. When {@code realtime} is false the request explicitly disables realtime,
 * otherwise the realtime setting of the request is left untouched.
 */
private void assertTermVectorsVersionConflict(long staleVersion, boolean realtime) {
    try {
        if (realtime) {
            client().prepareTermVectors(indexOrAlias(), "type1", "1").setVersion(staleVersion).get();
        } else {
            client().prepareTermVectors(indexOrAlias(), "type1", "1").setVersion(staleVersion).setRealtime(false).get();
        }
        fail("expected a version conflict when requesting term vectors with version [" + staleVersion + "]");
    } catch (VersionConflictEngineException e) {
        // expected: the requested version does not match the current one
    }
}
/**
 * Indexes one document whose tags are "a", "aa", "aaa", ... and checks that filtering the term
 * vectors by a decreasing minimum word length returns exactly the tags that are long enough.
 */
public void testFilterLength() throws ExecutionException, InterruptedException, IOException {
    logger.info("Setting up the index ...");
    Settings.Builder indexConfig = Settings.builder();
    indexConfig.put(indexSettings());
    indexConfig.put("index.analysis.analyzer", "keyword");
    assertAcked(prepareCreate("test").setSettings(indexConfig).addMapping("type1", "tags", "type=text"));

    final int numTerms = scaledRandomIntBetween(10, 50);
    logger.info("Indexing one document with tags of increasing length ...");
    // the tag at position p consists of p + 1 times the letter 'a'
    List<String> tags = new ArrayList<>();
    StringBuilder growingTag = new StringBuilder();
    for (int position = 0; position < numTerms; position++) {
        growingTag.append("a");
        tags.add(growingTag.toString());
    }
    indexRandom(true, client().prepareIndex("test", "type1", "1").setSource("tags", tags));

    logger.info("Checking best tags by longest to shortest size ...");
    TermVectorsRequest.FilterSettings filterSettings = new TermVectorsRequest.FilterSettings();
    filterSettings.maxNumTerms = numTerms;
    // lower the minimum length step by step; every step admits one more (shorter) tag
    for (int minLength = numTerms; minLength >= 1; minLength--) {
        filterSettings.minWordLength = minLength;
        TermVectorsResponse filtered = client().prepareTermVectors("test", "type1", "1")
            .setSelectedFields("tags")
            .setFieldStatistics(true)
            .setTermStatistics(true)
            .setFilterSettings(filterSettings)
            .get();
        checkBestTerms(filtered.getFields().terms("tags"), tags.subList(minLength - 1, numTerms));
    }
}
/**
 * Indexes one document in which tag "tag_i" occurs i + 1 times and checks that limiting the
 * number of returned terms to k yields the k most frequent tags.
 */
public void testFilterTermFreq() throws ExecutionException, InterruptedException, IOException {
    logger.info("Setting up the index ...");
    Settings.Builder indexConfig = Settings.builder();
    indexConfig.put(indexSettings());
    indexConfig.put("index.analysis.analyzer", "keyword");
    assertAcked(prepareCreate("test").setSettings(indexConfig).addMapping("type1", "tags", "type=text"));

    logger.info("Indexing one document with tags of increasing frequencies ...");
    final int numTerms = scaledRandomIntBetween(10, 50);
    List<String> tags = new ArrayList<>();
    List<String> uniqueTags = new ArrayList<>();
    for (int tagNo = 0; tagNo < numTerms; tagNo++) {
        final String tag = "tag_" + tagNo;
        // one occurrence plus tagNo repetitions
        tags.addAll(Collections.nCopies(tagNo + 1, tag));
        uniqueTags.add(tag);
    }
    indexRandom(true, client().prepareIndex("test", "type1", "1").setSource("tags", tags));

    logger.info("Checking best tags by highest to lowest term freq ...");
    TermVectorsRequest.FilterSettings filterSettings = new TermVectorsRequest.FilterSettings();
    for (int limit = 1; limit <= numTerms; limit++) {
        filterSettings.maxNumTerms = limit;
        TermVectorsResponse filtered = client().prepareTermVectors("test", "type1", "1")
            .setSelectedFields("tags")
            .setFieldStatistics(true)
            .setTermStatistics(true)
            .setFilterSettings(filterSettings)
            .get();
        // the most frequent tags are the last ones that were generated
        checkBestTerms(filtered.getFields().terms("tags"), uniqueTags.subList(numTerms - limit, numTerms));
    }
}
/**
 * Indexes documents such that document i carries the tags "tag_0" .. "tag_i", so earlier tags
 * appear in more documents than later ones, and checks that limiting the number of returned
 * terms for the last document to k yields the k tags that were added last.
 */
public void testFilterDocFreq() throws ExecutionException, InterruptedException, IOException {
    logger.info("Setting up the index ...");
    Settings.Builder indexConfig = Settings.builder();
    indexConfig.put(indexSettings());
    indexConfig.put("index.analysis.analyzer", "keyword");
    indexConfig.put("index.number_of_shards", 1); // no dfs
    assertAcked(prepareCreate("test").setSettings(indexConfig).addMapping("type1", "tags", "type=text"));

    final int numDocs = scaledRandomIntBetween(10, 50); // as many terms as there are docs
    logger.info("Indexing {} documents with tags of increasing dfs ...", numDocs);
    List<IndexRequestBuilder> builders = new ArrayList<>();
    List<String> tags = new ArrayList<>();
    for (int docNo = 0; docNo < numDocs; docNo++) {
        // every document gets all tags created so far plus one new tag
        tags.add("tag_" + docNo);
        builders.add(client().prepareIndex("test", "type1", String.valueOf(docNo)).setSource("tags", tags));
    }
    indexRandom(true, builders);

    logger.info("Checking best terms by highest to lowest idf ...");
    TermVectorsRequest.FilterSettings filterSettings = new TermVectorsRequest.FilterSettings();
    // the last document is the one that contains every tag
    final String lastDocId = String.valueOf(numDocs - 1);
    for (int limit = 1; limit <= numDocs; limit++) {
        filterSettings.maxNumTerms = limit;
        TermVectorsResponse filtered = client().prepareTermVectors("test", "type1", lastDocId)
            .setSelectedFields("tags")
            .setFieldStatistics(true)
            .setTermStatistics(true)
            .setFilterSettings(filterSettings)
            .get();
        checkBestTerms(filtered.getFields().terms("tags"), tags.subList(numDocs - limit, numDocs));
    }
}
/**
 * Indexes a single two-word document and then requests the term vectors of an identical
 * artificial document from every shard in turn, using a shard preference. The term statistics
 * summed over all shards must equal those contributed by the one indexed document.
 */
public void testArtificialDocWithPreference() throws ExecutionException, InterruptedException, IOException {
    // set up the index
    Settings.Builder indexConfig = Settings.builder();
    indexConfig.put(indexSettings());
    indexConfig.put("index.analysis.analyzer", "standard");
    assertAcked(prepareCreate("test")
        .setSettings(indexConfig)
        .addMapping("type1", "field1", "type=text,term_vector=with_positions_offsets"));
    ensureGreen();

    // the only real document
    indexRandom(true, client().prepareIndex("test", "type1", "1").setSource("field1", "random permutation"));

    // collect the ids of all shards of the index
    ClusterSearchShardsResponse shardsInfo = client().admin().cluster().prepareSearchShards("test").get();
    List<Integer> shardNumbers = Arrays.stream(shardsInfo.getGroups())
        .map(group -> group.getShardId().id())
        .collect(Collectors.toList());

    // query each shard separately and accumulate the statistics it reports
    int totalTermFreqOverShards = 0;
    int docFreqOverShards = 0;
    for (Integer shardNumber : shardNumbers) {
        TermVectorsResponse fromShard = client().prepareTermVectors()
            .setIndex("test")
            .setType("type1")
            .setPreference("_shards:" + shardNumber)
            .setDoc(jsonBuilder().startObject().field("field1", "random permutation").endObject())
            .setFieldStatistics(true)
            .setTermStatistics(true)
            .get();
        Terms fieldTerms = fromShard.getFields().terms("field1");
        assertNotNull(fieldTerms);
        TermsEnum cursor = fieldTerms.iterator();
        while (cursor.next() != null) {
            totalTermFreqOverShards += cursor.totalTermFreq();
            docFreqOverShards += cursor.docFreq();
        }
    }

    // two terms, each seen once in the single indexed document
    assertEquals("expected to find term statistics in exactly one shard!", 2, totalTermFreqOverShards);
    assertEquals("expected to find term statistics in exactly one shard!", 2, docFreqOverShards);
}
/**
 * Asserts that the given terms are exactly the expected ones, ignoring order.
 *
 * @param terms         the terms returned for a field by a term vectors request
 * @param expectedTerms the terms that should have been returned; this list is not modified
 * @throws IOException if iterating over the returned terms fails
 */
private void checkBestTerms(Terms terms, List<String> expectedTerms) throws IOException {
    final TermsEnum termsEnum = terms.iterator();
    List<String> bestTerms = new ArrayList<>();
    BytesRef text;
    while ((text = termsEnum.next()) != null) {
        bestTerms.add(text.utf8ToString());
    }
    // sort a copy: callers hand in subList views of their own lists, which must not be reordered
    List<String> sortedExpected = new ArrayList<>(expectedTerms);
    Collections.sort(sortedExpected);
    Collections.sort(bestTerms);
    assertArrayEquals(sortedExpected.toArray(), bestTerms.toArray());
}
}