package org.apache.lucene.search.grouping;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInfo.DocValuesType;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.StoredDocument;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.function.valuesource.BytesRefFieldSource;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.grouping.function.FunctionDistinctValuesCollector;
import org.apache.lucene.search.grouping.function.FunctionFirstPassGroupingCollector;
import org.apache.lucene.search.grouping.term.TermDistinctValuesCollector;
import org.apache.lucene.search.grouping.term.TermFirstPassGroupingCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.mutable.MutableValue;
import org.apache.lucene.util.mutable.MutableValueStr;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Random;
import java.util.Set;

public class DistinctValuesCollectorTest extends AbstractGroupingTestCase {

  private final static NullComparator nullComparator = new NullComparator();

  private final String groupField = "author";
  private final String dvGroupField = "author_dv";
  private final String countField = "publisher";
  private final String dvCountField = "publisher_dv";

  public void testSimple() throws Exception {
    Random random = random();
    DocValuesType[] dvTypes = new DocValuesType[]{
        DocValuesType.NUMERIC,
        DocValuesType.BINARY,
        DocValuesType.SORTED,
    };
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(
        random,
        dir,
        newIndexWriterConfig(TEST_VERSION_CURRENT,
            new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
    boolean canUseDV = true;
    DocValuesType dvType = canUseDV ? dvTypes[random.nextInt(dvTypes.length)] : null;
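
    // Index seven documents: authors "1", "2", "3" plus one document without an
    // author field. The commit after the third document forces a second segment,
    // so the collectors are also exercised across segment boundaries.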
    Document doc = new Document();
    addField(doc, groupField, "1", dvType);
    addField(doc, countField, "1", dvType);
    doc.add(new TextField("content", "random text", Field.Store.NO));
    doc.add(new StringField("id", "1", Field.Store.NO));
    w.addDocument(doc);

    // 1
    doc = new Document();
    addField(doc, groupField, "1", dvType);
    addField(doc, countField, "1", dvType);
    doc.add(new TextField("content", "some more random text blob", Field.Store.NO));
    doc.add(new StringField("id", "2", Field.Store.NO));
    w.addDocument(doc);

    // 2
    doc = new Document();
    addField(doc, groupField, "1", dvType);
    addField(doc, countField, "2", dvType);
    doc.add(new TextField("content", "some more random textual data", Field.Store.NO));
    doc.add(new StringField("id", "3", Field.Store.NO));
    w.addDocument(doc);
    w.commit(); // To ensure a second segment

    // 3
    doc = new Document();
    addField(doc, groupField, "2", dvType);
    doc.add(new TextField("content", "some random text", Field.Store.NO));
    doc.add(new StringField("id", "4", Field.Store.NO));
    w.addDocument(doc);

    // 4
    doc = new Document();
    addField(doc, groupField, "3", dvType);
    addField(doc, countField, "1", dvType);
    doc.add(new TextField("content", "some more random text", Field.Store.NO));
    doc.add(new StringField("id", "5", Field.Store.NO));
    w.addDocument(doc);

    // 5
    doc = new Document();
    addField(doc, groupField, "3", dvType);
    addField(doc, countField, "1", dvType);
    doc.add(new TextField("content", "random blob", Field.Store.NO));
    doc.add(new StringField("id", "6", Field.Store.NO));
    w.addDocument(doc);

    // 6 -- no author field
    doc = new Document();
    doc.add(new TextField("content", "random word stuck in alot of other text", Field.Store.YES));
    addField(doc, countField, "1", dvType);
    doc.add(new StringField("id", "6", Field.Store.NO));
    w.addDocument(doc);

    IndexSearcher indexSearcher = newSearcher(w.getReader());
    w.close();

    Comparator<AbstractDistinctValuesCollector.GroupCount<Comparable<Object>>> cmp =
        new Comparator<AbstractDistinctValuesCollector.GroupCount<Comparable<Object>>>() {

      @Override
      public int compare(AbstractDistinctValuesCollector.GroupCount<Comparable<Object>> groupCount1,
                         AbstractDistinctValuesCollector.GroupCount<Comparable<Object>> groupCount2) {
        if (groupCount1.groupValue == null) {
          if (groupCount2.groupValue == null) {
            return 0;
          }
          return -1;
        } else if (groupCount2.groupValue == null) {
          return 1;
        } else {
          return groupCount1.groupValue.compareTo(groupCount2.groupValue);
        }
      }

    };

    // === Search for content:random
    AbstractFirstPassGroupingCollector<Comparable<Object>> firstCollector =
        createRandomFirstPassCollector(dvType, new Sort(), groupField, 10);
    indexSearcher.search(new TermQuery(new Term("content", "random")), firstCollector);
    AbstractDistinctValuesCollector<? extends AbstractDistinctValuesCollector.GroupCount<Comparable<Object>>> distinctValuesCollector =
        createDistinctCountCollector(firstCollector, groupField, countField, dvType);
    indexSearcher.search(new TermQuery(new Term("content", "random")), distinctValuesCollector);

    List<? extends AbstractDistinctValuesCollector.GroupCount<Comparable<Object>>> gcs = distinctValuesCollector.getGroups();
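
    // All seven documents contain "random": expect the missing-author group plus
    // authors "1", "2" and "3", whose distinct publisher values are {"1"}, {"1","2"},
    // {null} and {"1"} respectively, as asserted below.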
    Collections.sort(gcs, cmp);
    assertEquals(4, gcs.size());

    compareNull(gcs.get(0).groupValue);
    List<Comparable<?>> countValues = new ArrayList<Comparable<?>>(gcs.get(0).uniqueValues);
    assertEquals(1, countValues.size());
    compare("1", countValues.get(0));

    compare("1", gcs.get(1).groupValue);
    countValues = new ArrayList<Comparable<?>>(gcs.get(1).uniqueValues);
    Collections.sort(countValues, nullComparator);
    assertEquals(2, countValues.size());
    compare("1", countValues.get(0));
    compare("2", countValues.get(1));

    compare("2", gcs.get(2).groupValue);
    countValues = new ArrayList<Comparable<?>>(gcs.get(2).uniqueValues);
    assertEquals(1, countValues.size());
    compareNull(countValues.get(0));

    compare("3", gcs.get(3).groupValue);
    countValues = new ArrayList<Comparable<?>>(gcs.get(3).uniqueValues);
    assertEquals(1, countValues.size());
    compare("1", countValues.get(0));

    // === Search for content:some
    firstCollector = createRandomFirstPassCollector(dvType, new Sort(), groupField, 10);
    indexSearcher.search(new TermQuery(new Term("content", "some")), firstCollector);
    distinctValuesCollector = createDistinctCountCollector(firstCollector, groupField, countField, dvType);
    indexSearcher.search(new TermQuery(new Term("content", "some")), distinctValuesCollector);
    gcs = distinctValuesCollector.getGroups();
    Collections.sort(gcs, cmp);
    assertEquals(3, gcs.size());

    compare("1", gcs.get(0).groupValue);
    countValues = new ArrayList<Comparable<?>>(gcs.get(0).uniqueValues);
    assertEquals(2, countValues.size());
    Collections.sort(countValues, nullComparator);
    compare("1", countValues.get(0));
    compare("2", countValues.get(1));

    compare("2", gcs.get(1).groupValue);
    countValues = new ArrayList<Comparable<?>>(gcs.get(1).uniqueValues);
    assertEquals(1, countValues.size());
    compareNull(countValues.get(0));

    compare("3", gcs.get(2).groupValue);
    countValues = new ArrayList<Comparable<?>>(gcs.get(2).uniqueValues);
    assertEquals(1, countValues.size());
    compare("1", countValues.get(0));

    // === Search for content:blob
    firstCollector = createRandomFirstPassCollector(dvType, new Sort(), groupField, 10);
    indexSearcher.search(new TermQuery(new Term("content", "blob")), firstCollector);
    distinctValuesCollector = createDistinctCountCollector(firstCollector, groupField, countField, dvType);
    indexSearcher.search(new TermQuery(new Term("content", "blob")), distinctValuesCollector);
    gcs = distinctValuesCollector.getGroups();
    Collections.sort(gcs, cmp);
    assertEquals(2, gcs.size());

    compare("1", gcs.get(0).groupValue);
    countValues = new ArrayList<Comparable<?>>(gcs.get(0).uniqueValues);
    // Because only one document in the author "1" group matches "blob"
    assertEquals(1, countValues.size());
    compare("1", countValues.get(0));

    compare("3", gcs.get(1).groupValue);
    countValues = new ArrayList<Comparable<?>>(gcs.get(1).uniqueValues);
    assertEquals(1, countValues.size());
    compare("1", countValues.get(0));

    indexSearcher.getIndexReader().close();
    dir.close();
  }

  public void testRandom() throws Exception {
    Random random = random();
    int numberOfRuns = TestUtil.nextInt(random, 3, 6);
    for (int indexIter = 0; indexIter < numberOfRuns; indexIter++) {
      IndexContext context = createIndexContext();
      for (int searchIter = 0; searchIter < 100; searchIter++) {
        final IndexSearcher searcher = newSearcher(context.indexReader);
        boolean useDv = context.dvType != null && random.nextBoolean();
        DocValuesType dvType = useDv ? context.dvType : null;
        String term = context.contentStrings[random.nextInt(context.contentStrings.length)];
        Sort groupSort = new Sort(new SortField("id", SortField.Type.STRING));
        int topN = 1 + random.nextInt(10);

        List<AbstractDistinctValuesCollector.GroupCount<Comparable<?>>> expectedResult = createExpectedResult(context, term, groupSort, topN);

        AbstractFirstPassGroupingCollector<Comparable<?>> firstCollector = createRandomFirstPassCollector(dvType, groupSort, groupField, topN);
        searcher.search(new TermQuery(new Term("content", term)), firstCollector);
        AbstractDistinctValuesCollector<? extends AbstractDistinctValuesCollector.GroupCount<Comparable<?>>> distinctValuesCollector =
            createDistinctCountCollector(firstCollector, groupField, countField, dvType);
        searcher.search(new TermQuery(new Term("content", term)), distinctValuesCollector);
        @SuppressWarnings("unchecked")
        List<AbstractDistinctValuesCollector.GroupCount<Comparable<?>>> actualResult =
            (List<AbstractDistinctValuesCollector.GroupCount<Comparable<?>>>) distinctValuesCollector.getGroups();

        if (VERBOSE) {
          System.out.println("Index iter=" + indexIter);
          System.out.println("Search iter=" + searchIter);
          System.out.println("1st pass collector class name=" + firstCollector.getClass().getName());
          System.out.println("2nd pass collector class name=" + distinctValuesCollector.getClass().getName());
          System.out.println("Search term=" + term);
          System.out.println("DVType=" + dvType);
          System.out.println("1st pass groups=" + firstCollector.getTopGroups(0, false));
          System.out.println("Expected:");
          printGroups(expectedResult);
          System.out.println("Actual:");
          printGroups(actualResult);
        }

        assertEquals(expectedResult.size(), actualResult.size());
        for (int i = 0; i < expectedResult.size(); i++) {
          AbstractDistinctValuesCollector.GroupCount<Comparable<?>> expected = expectedResult.get(i);
          AbstractDistinctValuesCollector.GroupCount<Comparable<?>> actual = actualResult.get(i);
          assertValues(expected.groupValue, actual.groupValue);
          assertEquals(expected.uniqueValues.size(), actual.uniqueValues.size());
          List<Comparable<?>> expectedUniqueValues = new ArrayList<>(expected.uniqueValues);
          Collections.sort(expectedUniqueValues, nullComparator);
          List<Comparable<?>> actualUniqueValues = new ArrayList<>(actual.uniqueValues);
          Collections.sort(actualUniqueValues, nullComparator);
          for (int j = 0; j < expectedUniqueValues.size(); j++) {
            assertValues(expectedUniqueValues.get(j), actualUniqueValues.get(j));
          }
        }
      }
      context.indexReader.close();
      context.directory.close();
    }
  }

  private void printGroups(List<AbstractDistinctValuesCollector.GroupCount<Comparable<?>>> results) {
    for (int i = 0; i < results.size(); i++) {
      AbstractDistinctValuesCollector.GroupCount<Comparable<?>> group = results.get(i);
      Object gv = group.groupValue;
      if (gv instanceof BytesRef) {
        System.out.println(i + ": groupValue=" + ((BytesRef) gv).utf8ToString());
      } else {
        System.out.println(i + ": groupValue=" + gv);
      }
      for (Object o : group.uniqueValues) {
        if (o instanceof BytesRef) {
          System.out.println("  " + ((BytesRef) o).utf8ToString());
        } else {
          System.out.println("  " + o);
        }
      }
    }
  }

  private void assertValues(Object expected, Object actual) {
    if (expected == null) {
      compareNull(actual);
    } else {
      compare(((BytesRef) expected).utf8ToString(), actual);
    }
  }
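
  // The helpers below normalize a String expectation to whatever value type the
  // collector under test produces: BytesRef for the term/sorted-DV variants, Long or
  // Double for numeric DV, and MutableValueStr for the function-based variant.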
  private void compare(String expected, Object groupValue) {
    if (BytesRef.class.isAssignableFrom(groupValue.getClass())) {
      assertEquals(expected, ((BytesRef) groupValue).utf8ToString());
    } else if (Double.class.isAssignableFrom(groupValue.getClass())) {
      assertEquals(Double.parseDouble(expected), groupValue);
    } else if (Long.class.isAssignableFrom(groupValue.getClass())) {
      assertEquals(Long.parseLong(expected), groupValue);
    } else if (MutableValue.class.isAssignableFrom(groupValue.getClass())) {
      MutableValueStr mutableValue = new MutableValueStr();
      mutableValue.value = new BytesRef(expected);
      assertEquals(mutableValue, groupValue);
    } else {
      fail();
    }
  }

  private void compareNull(Object groupValue) {
    if (groupValue == null) {
      return; // term based impl...
    }
    // DV based impls..
    if (BytesRef.class.isAssignableFrom(groupValue.getClass())) {
      assertEquals("", ((BytesRef) groupValue).utf8ToString());
    } else if (Double.class.isAssignableFrom(groupValue.getClass())) {
      assertEquals(0.0d, groupValue);
    } else if (Long.class.isAssignableFrom(groupValue.getClass())) {
      assertEquals(0L, groupValue);
      // Function based impl
    } else if (MutableValue.class.isAssignableFrom(groupValue.getClass())) {
      assertFalse(((MutableValue) groupValue).exists());
    } else {
      fail();
    }
  }

  private void addField(Document doc, String field, String value, DocValuesType type) {
    doc.add(new StringField(field, value, Field.Store.YES));
    if (type == null) {
      return;
    }
    String dvField = field + "_dv";

    Field valuesField = null;
    switch (type) {
      case NUMERIC:
        valuesField = new NumericDocValuesField(dvField, Integer.parseInt(value));
        break;
      case BINARY:
        valuesField = new BinaryDocValuesField(dvField, new BytesRef(value));
        break;
      case SORTED:
        valuesField = new SortedDocValuesField(dvField, new BytesRef(value));
        break;
    }
    doc.add(valuesField);
  }

  @SuppressWarnings({"unchecked","rawtypes"})
  private <T extends Comparable> AbstractDistinctValuesCollector<AbstractDistinctValuesCollector.GroupCount<T>> createDistinctCountCollector(AbstractFirstPassGroupingCollector<T> firstPassGroupingCollector,
                                                                                                                                             String groupField,
                                                                                                                                             String countField,
                                                                                                                                             DocValuesType dvType) {
    Random random = random();
    Collection<SearchGroup<T>> searchGroups = firstPassGroupingCollector.getTopGroups(0, false);
    if (FunctionFirstPassGroupingCollector.class.isAssignableFrom(firstPassGroupingCollector.getClass())) {
      return (AbstractDistinctValuesCollector) new FunctionDistinctValuesCollector(new HashMap<>(), new BytesRefFieldSource(groupField), new BytesRefFieldSource(countField), (Collection) searchGroups);
    } else {
      return (AbstractDistinctValuesCollector) new TermDistinctValuesCollector(groupField, countField, (Collection) searchGroups);
    }
  }

  @SuppressWarnings({"unchecked","rawtypes"})
  private <T> AbstractFirstPassGroupingCollector<T> createRandomFirstPassCollector(DocValuesType dvType, Sort groupSort, String groupField, int topNGroups) throws IOException {
    Random random = random();
    if (dvType != null) {
      if (random.nextBoolean()) {
        return (AbstractFirstPassGroupingCollector<T>) new FunctionFirstPassGroupingCollector(new BytesRefFieldSource(groupField), new HashMap<>(), groupSort, topNGroups);
      } else {
        return (AbstractFirstPassGroupingCollector<T>) new TermFirstPassGroupingCollector(groupField, groupSort, topNGroups);
      }
    } else {
      if (random.nextBoolean()) {
        return (AbstractFirstPassGroupingCollector<T>) new FunctionFirstPassGroupingCollector(new BytesRefFieldSource(groupField), new HashMap<>(), groupSort, topNGroups);
      } else {
        return (AbstractFirstPassGroupingCollector<T>) new TermFirstPassGroupingCollector(groupField, groupSort, topNGroups);
      }
    }
  }
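
  // Builds the expected GroupCount list for a term. Groups come back in first-encounter
  // (docid) order from the LinkedHashMap, which matches the "id" group sort because ids
  // are assigned in increasing, zero-padded order; only the first topN groups are kept.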
  @SuppressWarnings({"unchecked","rawtypes"})
  private List<AbstractDistinctValuesCollector.GroupCount<Comparable<?>>> createExpectedResult(IndexContext context, String term, Sort groupSort, int topN) {
    class GroupCount extends AbstractDistinctValuesCollector.GroupCount<BytesRef> {
      GroupCount(BytesRef groupValue, Collection<BytesRef> uniqueValues) {
        super(groupValue);
        this.uniqueValues.addAll(uniqueValues);
      }
    }

    List result = new ArrayList();
    Map<String, Set<String>> groupCounts = context.searchTermToGroupCounts.get(term);
    int i = 0;
    for (String group : groupCounts.keySet()) {
      if (topN <= i++) {
        break;
      }
      Set<BytesRef> uniqueValues = new HashSet<>();
      for (String val : groupCounts.get(group)) {
        uniqueValues.add(val != null ? new BytesRef(val) : null);
      }
      result.add(new GroupCount(group != null ? new BytesRef(group) : null, uniqueValues));
    }
    return result;
  }
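
  // Builds a random index whose documents get random group ("author") and count
  // ("publisher") values; roughly 1 in 23 documents omits the group value and 1 in 21
  // omits the count value. The searchTermToGroupCounts map records, per content term,
  // the distinct count values seen for each group, which testRandom uses as the oracle.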
  private IndexContext createIndexContext() throws Exception {
    Random random = random();
    DocValuesType[] dvTypes = new DocValuesType[]{
        DocValuesType.BINARY,
        DocValuesType.SORTED
    };

    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(
        random,
        dir,
        newIndexWriterConfig(TEST_VERSION_CURRENT,
            new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy())
    );

    boolean canUseDV = true;
    DocValuesType dvType = canUseDV ? dvTypes[random.nextInt(dvTypes.length)] : null;

    int numDocs = 86 + random.nextInt(1087) * RANDOM_MULTIPLIER;
    String[] groupValues = new String[numDocs / 5];
    String[] countValues = new String[numDocs / 10];
    for (int i = 0; i < groupValues.length; i++) {
      groupValues[i] = generateRandomNonEmptyString();
    }
    for (int i = 0; i < countValues.length; i++) {
      countValues[i] = generateRandomNonEmptyString();
    }
    List<String> contentStrings = new ArrayList<>();
    Map<String, Map<String, Set<String>>> searchTermToGroupCounts = new HashMap<>();
    for (int i = 1; i <= numDocs; i++) {
      String groupValue = random.nextInt(23) == 14 ? null : groupValues[random.nextInt(groupValues.length)];
      String countValue = random.nextInt(21) == 13 ? null : countValues[random.nextInt(countValues.length)];
      String content = "random" + random.nextInt(numDocs / 20);
      Map<String, Set<String>> groupToCounts = searchTermToGroupCounts.get(content);
      if (groupToCounts == null) {
        // Groups sort always DOCID asc...
        searchTermToGroupCounts.put(content, groupToCounts = new LinkedHashMap<>());
        contentStrings.add(content);
      }
      Set<String> countsVals = groupToCounts.get(groupValue);
      if (countsVals == null) {
        groupToCounts.put(groupValue, countsVals = new HashSet<>());
      }
      countsVals.add(countValue);

      Document doc = new Document();
      doc.add(new StringField("id", String.format(Locale.ROOT, "%09d", i), Field.Store.YES));
      if (groupValue != null) {
        addField(doc, groupField, groupValue, dvType);
      }
      if (countValue != null) {
        addField(doc, countField, countValue, dvType);
      }
      doc.add(new TextField("content", content, Field.Store.YES));
      w.addDocument(doc);
    }

    DirectoryReader reader = w.getReader();
    if (VERBOSE) {
      for (int docID = 0; docID < reader.maxDoc(); docID++) {
        StoredDocument doc = reader.document(docID);
        System.out.println("docID=" + docID + " id=" + doc.get("id") + " content=" + doc.get("content") + " author=" + doc.get("author") + " publisher=" + doc.get("publisher"));
      }
    }

    w.close();
    return new IndexContext(dir, reader, dvType, searchTermToGroupCounts, contentStrings.toArray(new String[contentStrings.size()]));
  }

  private static class IndexContext {

    final Directory directory;
    final DirectoryReader indexReader;
    final DocValuesType dvType;
    final Map<String, Map<String, Set<String>>> searchTermToGroupCounts;
    final String[] contentStrings;

    IndexContext(Directory directory, DirectoryReader indexReader, DocValuesType dvType,
                 Map<String, Map<String, Set<String>>> searchTermToGroupCounts, String[] contentStrings) {
      this.directory = directory;
      this.indexReader = indexReader;
      this.dvType = dvType;
      this.searchTermToGroupCounts = searchTermToGroupCounts;
      this.contentStrings = contentStrings;
    }
  }

  private static class NullComparator implements Comparator<Comparable<?>> {

    @Override
    @SuppressWarnings({"unchecked","rawtypes"})
    public int compare(Comparable a, Comparable b) {
      if (a == b) {
        return 0;
      } else if (a == null) {
        return -1;
      } else if (b == null) {
        return 1;
      } else {
        return a.compareTo(b);
      }
    }

  }

}