/* * Licensed to Elasticsearch under one or more contributor * license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright * ownership. Elasticsearch licenses this file to you under * the Apache License, Version 2.0 (the "License"); you may * not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.elasticsearch.action.termvectors; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.Fields; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.elasticsearch.action.admin.indices.alias.Alias; import org.elasticsearch.common.inject.internal.Join; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.test.ESIntegTestCase; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.Locale; import java.util.Map; import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked; import static org.hamcrest.Matchers.equalTo; public abstract class AbstractTermVectorsTestCase extends ESIntegTestCase { protected static class TestFieldSetting { public final String name; public final boolean storedOffset; public final boolean storedPayloads; public final boolean storedPositions; public TestFieldSetting(String name, boolean storedOffset, boolean storedPayloads, boolean storedPositions) { this.name = name; this.storedOffset = storedOffset; this.storedPayloads = storedPayloads; this.storedPositions = storedPositions; } public void addToMappings(XContentBuilder mappingsBuilder) throws IOException { mappingsBuilder.startObject(name); mappingsBuilder.field("type", "text"); String tv_settings; if (storedPositions && storedOffset && storedPayloads) { tv_settings = "with_positions_offsets_payloads"; } else if (storedPositions && storedOffset) { tv_settings = "with_positions_offsets"; } else if (storedPayloads) { tv_settings = "with_positions_payloads"; } else if (storedPositions) { tv_settings = "with_positions"; } else if (storedOffset) { tv_settings = "with_offsets"; } else { tv_settings = "yes"; } mappingsBuilder.field("term_vector", tv_settings); if (storedPayloads) { mappingsBuilder.field("analyzer", "tv_test"); } mappingsBuilder.endObject(); } @Override public String toString() { StringBuilder sb = new StringBuilder("name: ").append(name).append(" tv_with:"); if (storedPayloads) { sb.append("payloads,"); } if (storedOffset) { sb.append("offsets,"); } if (storedPositions) { sb.append("positions,"); } return sb.toString(); } } protected static class TestDoc { public final String id; public final TestFieldSetting[] fieldSettings; public final String[] fieldContent; public String index = "test"; public String alias = "alias"; public String type = "type1"; public TestDoc(String id, TestFieldSetting[] fieldSettings, String[] fieldContent) { this.id = id; assertEquals(fieldSettings.length, fieldContent.length); this.fieldSettings = fieldSettings; this.fieldContent = fieldContent; } public TestDoc index(String index) { this.index = index; return this; } public TestDoc alias(String alias) { this.alias = alias; return this; } @Override public String toString() { StringBuilder sb = new StringBuilder("index:").append(index).append(" type:").append(type).append(" id:").append(id); for (int i = 0; i < fieldSettings.length; i++) { TestFieldSetting f = fieldSettings[i]; sb.append("\n").append("Field: ").append(f).append("\n content:").append(fieldContent[i]); } sb.append("\n"); return sb.toString(); } } protected static class TestConfig { public final TestDoc doc; public final String[] selectedFields; public final boolean requestPositions; public final boolean requestOffsets; public final boolean requestPayloads; public Class expectedException = null; public TestConfig(TestDoc doc, String[] selectedFields, boolean requestPositions, boolean requestOffsets, boolean requestPayloads) { this.doc = doc; this.selectedFields = selectedFields; this.requestPositions = requestPositions; this.requestOffsets = requestOffsets; this.requestPayloads = requestPayloads; } public TestConfig expectedException(Class exceptionClass) { this.expectedException = exceptionClass; return this; } @Override public String toString() { String requested = ""; if (requestOffsets) { requested += "offsets,"; } if (requestPositions) { requested += "position,"; } if (requestPayloads) { requested += "payload,"; } Locale aLocale = new Locale("en", "US"); return String.format(aLocale, "(doc: %s\n requested: %s, fields: %s)", doc, requested, selectedFields == null ? "NULL" : Join.join(",", selectedFields)); } } protected void createIndexBasedOnFieldSettings(String index, String alias, TestFieldSetting[] fieldSettings) throws IOException { XContentBuilder mappingBuilder = jsonBuilder(); mappingBuilder.startObject().startObject("type1").startObject("properties"); for (TestFieldSetting field : fieldSettings) { field.addToMappings(mappingBuilder); } mappingBuilder.endObject().endObject().endObject(); Settings.Builder settings = Settings.builder() .put(indexSettings()) .put("index.analysis.analyzer.tv_test.tokenizer", "standard") .putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase"); assertAcked(prepareCreate(index).addMapping("type1", mappingBuilder).setSettings(settings).addAlias(new Alias(alias))); } /** * Generate test documentsThe returned documents are already indexed. */ protected TestDoc[] generateTestDocs(String index, TestFieldSetting[] fieldSettings) { String[] fieldContentOptions = new String[]{"Generating a random permutation of a sequence (such as when shuffling cards).", "Selecting a random sample of a population (important in statistical sampling).", "Allocating experimental units via random assignment to a treatment or control condition.", "Generating random numbers: see Random number generation.", "Transforming a data stream (such as when using a scrambler in telecommunications)."}; String[] contentArray = new String[fieldSettings.length]; Map<String, Object> docSource = new HashMap<>(); int totalShards = getNumShards(index).numPrimaries; TestDoc[] testDocs = new TestDoc[totalShards]; // this methods wants to send one doc to each shard for (int i = 0; i < totalShards; i++) { docSource.clear(); for (int j = 0; j < contentArray.length; j++) { contentArray[j] = fieldContentOptions[randomInt(fieldContentOptions.length - 1)]; docSource.put(fieldSettings[j].name, contentArray[j]); } final String id = routingKeyForShard(index, i); TestDoc doc = new TestDoc(id, fieldSettings, contentArray.clone()); index(doc.index, doc.type, doc.id, docSource); testDocs[i] = doc; } refresh(); return testDocs; } protected TestConfig[] generateTestConfigs(int numberOfTests, TestDoc[] testDocs, TestFieldSetting[] fieldSettings) { ArrayList<TestConfig> configs = new ArrayList<>(); for (int i = 0; i < numberOfTests; i++) { ArrayList<String> selectedFields = null; if (randomBoolean()) { // used field selection selectedFields = new ArrayList<>(); if (randomBoolean()) { selectedFields.add("Doesnt_exist"); // this will be ignored. } for (TestFieldSetting field : fieldSettings) if (randomBoolean()) { selectedFields.add(field.name); } if (selectedFields.size() == 0) { selectedFields = null; // 0 length set is not supported. } } TestConfig config = new TestConfig(testDocs[randomInt(testDocs.length - 1)], selectedFields == null ? null : selectedFields.toArray(new String[]{}), randomBoolean(), randomBoolean(), randomBoolean()); configs.add(config); } // always adds a test that fails configs.add(new TestConfig(new TestDoc("doesnt_exist", new TestFieldSetting[]{}, new String[]{}).index("doesn't_exist").alias("doesn't_exist"), new String[]{"doesnt_exist"}, true, true, true).expectedException(org.elasticsearch.index.IndexNotFoundException.class)); refresh(); return configs.toArray(new TestConfig[configs.size()]); } protected TestFieldSetting[] getFieldSettings() { return new TestFieldSetting[]{new TestFieldSetting("field_with_positions", false, false, true), new TestFieldSetting("field_with_offsets", true, false, false), new TestFieldSetting("field_with_only_tv", false, false, false), new TestFieldSetting("field_with_positions_offsets", false, false, true), new TestFieldSetting("field_with_positions_payloads", false, true, true) }; } protected DirectoryReader indexDocsWithLucene(TestDoc[] testDocs) throws IOException { Map<String, Analyzer> mapping = new HashMap<>(); for (TestFieldSetting field : testDocs[0].fieldSettings) { if (field.storedPayloads) { mapping.put(field.name, new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new StandardTokenizer(); TokenFilter filter = new LowerCaseFilter(tokenizer); filter = new TypeAsPayloadTokenFilter(filter); return new TokenStreamComponents(tokenizer, filter); } }); } } PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new StandardAnalyzer(CharArraySet.EMPTY_SET), mapping); Directory dir = new RAMDirectory(); IndexWriterConfig conf = new IndexWriterConfig(wrapper); conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE); IndexWriter writer = new IndexWriter(dir, conf); for (TestDoc doc : testDocs) { Document d = new Document(); d.add(new Field("id", doc.id, StringField.TYPE_STORED)); for (int i = 0; i < doc.fieldContent.length; i++) { FieldType type = new FieldType(TextField.TYPE_STORED); TestFieldSetting fieldSetting = doc.fieldSettings[i]; type.setStoreTermVectorOffsets(fieldSetting.storedOffset); type.setStoreTermVectorPayloads(fieldSetting.storedPayloads); type.setStoreTermVectorPositions(fieldSetting.storedPositions || fieldSetting.storedPayloads || fieldSetting.storedOffset); type.setStoreTermVectors(true); type.freeze(); d.add(new Field(fieldSetting.name, doc.fieldContent[i], type)); } writer.updateDocument(new Term("id", doc.id), d); writer.commit(); } writer.close(); return DirectoryReader.open(dir); } protected void validateResponse(TermVectorsResponse esResponse, Fields luceneFields, TestConfig testConfig) throws IOException { assertThat(esResponse.getIndex(), equalTo(testConfig.doc.index)); TestDoc testDoc = testConfig.doc; HashSet<String> selectedFields = testConfig.selectedFields == null ? null : new HashSet<>( Arrays.asList(testConfig.selectedFields)); Fields esTermVectorFields = esResponse.getFields(); for (TestFieldSetting field : testDoc.fieldSettings) { Terms esTerms = esTermVectorFields.terms(field.name); if (selectedFields != null && !selectedFields.contains(field.name)) { assertNull(esTerms); continue; } assertNotNull(esTerms); Terms luceneTerms = luceneFields.terms(field.name); TermsEnum esTermEnum = esTerms.iterator(); TermsEnum luceneTermEnum = luceneTerms.iterator(); while (esTermEnum.next() != null) { assertNotNull(luceneTermEnum.next()); assertThat(esTermEnum.totalTermFreq(), equalTo(luceneTermEnum.totalTermFreq())); PostingsEnum esDocsPosEnum = esTermEnum.postings(null, PostingsEnum.POSITIONS); PostingsEnum luceneDocsPosEnum = luceneTermEnum.postings(null, PostingsEnum.POSITIONS); if (luceneDocsPosEnum == null) { // test we expect that... assertFalse(field.storedOffset); assertFalse(field.storedPayloads); assertFalse(field.storedPositions); continue; } String currentTerm = esTermEnum.term().utf8ToString(); assertThat("Token mismatch for field: " + field.name, currentTerm, equalTo(luceneTermEnum.term().utf8ToString())); esDocsPosEnum.nextDoc(); luceneDocsPosEnum.nextDoc(); int freq = esDocsPosEnum.freq(); assertThat(freq, equalTo(luceneDocsPosEnum.freq())); for (int i = 0; i < freq; i++) { String failDesc = " (field:" + field.name + " term:" + currentTerm + ")"; int lucenePos = luceneDocsPosEnum.nextPosition(); int esPos = esDocsPosEnum.nextPosition(); if (field.storedPositions && testConfig.requestPositions) { assertThat("Position test failed" + failDesc, lucenePos, equalTo(esPos)); } else { assertThat("Missing position test failed" + failDesc, esPos, equalTo(-1)); } if (field.storedOffset && testConfig.requestOffsets) { assertThat("Offset test failed" + failDesc, luceneDocsPosEnum.startOffset(), equalTo(esDocsPosEnum.startOffset())); assertThat("Offset test failed" + failDesc, luceneDocsPosEnum.endOffset(), equalTo(esDocsPosEnum.endOffset())); } else { assertThat("Missing offset test failed" + failDesc, esDocsPosEnum.startOffset(), equalTo(-1)); assertThat("Missing offset test failed" + failDesc, esDocsPosEnum.endOffset(), equalTo(-1)); } if (field.storedPayloads && testConfig.requestPayloads) { assertThat("Payload test failed" + failDesc, luceneDocsPosEnum.getPayload(), equalTo(esDocsPosEnum.getPayload())); } else { assertThat("Missing payload test failed" + failDesc, esDocsPosEnum.getPayload(), equalTo(null)); } } } assertNull("Es returned terms are done but lucene isn't", luceneTermEnum.next()); } } protected TermVectorsRequestBuilder getRequestForConfig(TestConfig config) { return client().prepareTermVectors(randomBoolean() ? config.doc.index : config.doc.alias, config.doc.type, config.doc.id).setPayloads(config.requestPayloads) .setOffsets(config.requestOffsets).setPositions(config.requestPositions).setFieldStatistics(true).setTermStatistics(true) .setSelectedFields(config.selectedFields).setRealtime(false); } protected Fields getTermVectorsFromLucene(DirectoryReader directoryReader, TestDoc doc) throws IOException { IndexSearcher searcher = new IndexSearcher(directoryReader); TopDocs search = searcher.search(new TermQuery(new Term("id", doc.id)), 1); ScoreDoc[] scoreDocs = search.scoreDocs; assertEquals(1, scoreDocs.length); return directoryReader.getTermVectors(scoreDocs[0].doc); } }