/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.common.lucene.index;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.NoMergePolicy;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.TermInSetQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.elasticsearch.common.lucene.search.Queries;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.test.ESTestCase;
import org.junit.After;
import org.junit.Before;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.is;
public class FreqTermsEnumTests extends ESTestCase {
private String[] terms;
private IndexWriter iw;
private IndexReader reader;
private Map<String, FreqHolder> referenceAll;
private Map<String, FreqHolder> referenceNotDeleted;
private Map<String, FreqHolder> referenceFilter;
private Query filter;
static class FreqHolder {
int docFreq;
long totalTermFreq;
}
@Before
@Override
public void setUp() throws Exception {
super.setUp();
referenceAll = new HashMap<>();
referenceNotDeleted = new HashMap<>();
referenceFilter = new HashMap<>();
Directory dir = newDirectory();
IndexWriterConfig conf = newIndexWriterConfig(new KeywordAnalyzer()); // use keyword analyzer we rely on the stored field holding the exact term.
if (frequently()) {
// we don't want to do any merges, so we won't expunge deletes
conf.setMergePolicy(NoMergePolicy.INSTANCE);
}
iw = new IndexWriter(dir, conf);
terms = new String[scaledRandomIntBetween(10, 300)];
for (int i = 0; i < terms.length; i++) {
terms[i] = randomAlphaOfLength(5);
}
int numberOfDocs = scaledRandomIntBetween(30, 300);
Document[] docs = new Document[numberOfDocs];
for (int i = 0; i < numberOfDocs; i++) {
Document doc = new Document();
doc.add(new StringField("id", Integer.toString(i), Field.Store.YES));
docs[i] = doc;
for (String term : terms) {
if (randomBoolean()) {
continue;
}
int freq = randomIntBetween(1, 3);
for (int j = 0; j < freq; j++) {
doc.add(new TextField("field", term, Field.Store.YES));
}
}
}
// add all docs
for (int i = 0; i < docs.length; i++) {
Document doc = docs[i];
iw.addDocument(doc);
if (rarely()) {
iw.commit();
}
}
Set<String> deletedIds = new HashSet<>();
for (int i = 0; i < docs.length; i++) {
Document doc = docs[i];
if (randomInt(5) == 2) {
Term idTerm = new Term("id", doc.getField("id").stringValue());
deletedIds.add(idTerm.text());
iw.deleteDocuments(idTerm);
}
}
for (String term : terms) {
referenceAll.put(term, new FreqHolder());
referenceFilter.put(term, new FreqHolder());
referenceNotDeleted.put(term, new FreqHolder());
}
// now go over each doc, build the relevant references and filter
reader = DirectoryReader.open(iw);
List<BytesRef> filterTerms = new ArrayList<>();
for (int docId = 0; docId < reader.maxDoc(); docId++) {
Document doc = reader.document(docId);
addFreqs(doc, referenceAll);
if (!deletedIds.contains(doc.getField("id").stringValue())) {
addFreqs(doc, referenceNotDeleted);
if (randomBoolean()) {
filterTerms.add(new BytesRef(doc.getField("id").stringValue()));
addFreqs(doc, referenceFilter);
}
}
}
filter = new TermInSetQuery("id",filterTerms);
}
private void addFreqs(Document doc, Map<String, FreqHolder> reference) {
Set<String> addedDocFreq = new HashSet<>();
for (IndexableField field : doc.getFields("field")) {
String term = field.stringValue();
FreqHolder freqHolder = reference.get(term);
if (!addedDocFreq.contains(term)) {
freqHolder.docFreq++;
addedDocFreq.add(term);
}
freqHolder.totalTermFreq++;
}
}
@After
@Override
public void tearDown() throws Exception {
IOUtils.close(reader, iw, iw.getDirectory());
super.tearDown();
}
public void testAllFreqs() throws Exception {
assertAgainstReference(true, true, null, referenceAll);
assertAgainstReference(true, false, null, referenceAll);
assertAgainstReference(false, true, null, referenceAll);
}
public void testNonDeletedFreqs() throws Exception {
assertAgainstReference(true, true, Queries.newMatchAllQuery(), referenceNotDeleted);
assertAgainstReference(true, false, Queries.newMatchAllQuery(), referenceNotDeleted);
assertAgainstReference(false, true, Queries.newMatchAllQuery(), referenceNotDeleted);
}
public void testFilterFreqs() throws Exception {
assertAgainstReference(true, true, filter, referenceFilter);
assertAgainstReference(true, false, filter, referenceFilter);
assertAgainstReference(false, true, filter, referenceFilter);
}
private void assertAgainstReference(boolean docFreq, boolean totalTermFreq, Query filter, Map<String, FreqHolder> reference) throws Exception {
FreqTermsEnum freqTermsEnum = new FreqTermsEnum(reader, "field", docFreq, totalTermFreq, filter, BigArrays.NON_RECYCLING_INSTANCE);
assertAgainstReference(freqTermsEnum, reference, docFreq, totalTermFreq);
}
private void assertAgainstReference(FreqTermsEnum termsEnum, Map<String, FreqHolder> reference, boolean docFreq, boolean totalTermFreq) throws Exception {
int cycles = randomIntBetween(1, 5);
for (int i = 0; i < cycles; i++) {
List<String> terms = new ArrayList<>(Arrays.asList(this.terms));
Collections.shuffle(terms, random());
for (String term : terms) {
if (!termsEnum.seekExact(new BytesRef(term))) {
assertThat("term : " + term, reference.get(term).docFreq, is(0));
continue;
}
if (docFreq) {
assertThat("cycle " + i + ", term " + term + ", docFreq", termsEnum.docFreq(), equalTo(reference.get(term).docFreq));
}
if (totalTermFreq) {
assertThat("cycle " + i + ", term " + term + ", totalTermFreq", termsEnum.totalTermFreq(), equalTo(reference.get(term).totalTermFreq));
}
}
}
}
}