/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.search.grouping; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Random; import java.util.Set; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.SortedDocValuesField; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.queries.function.valuesource.BytesRefFieldSource; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.TestUtil; import 
org.apache.lucene.util.mutable.MutableValue;
import org.apache.lucene.util.mutable.MutableValueStr;

/**
 * Tests for {@link DistinctValuesCollector}.
 *
 * <p>Every search in this class is a two-pass grouping: a {@link FirstPassGroupingCollector}
 * selects the top groups on the {@code author} field, then a {@link DistinctValuesCollector}
 * gathers the distinct {@code publisher} values found inside each of those groups. The group
 * selector implementation (term based or value-source based) is picked at random for every
 * search, which is why the assertion helpers accept {@link BytesRef}, {@link Double},
 * {@link Long} and {@link MutableValue} representations of a value.
 */
public class DistinctValuesCollectorTest extends AbstractGroupingTestCase {

  /** Natural ordering with {@code null} sorted first; used to make unique-value sets comparable. */
  private static final NullComparator nullComparator = new NullComparator();

  /** Field the documents are grouped on. */
  private static final String GROUP_FIELD = "author";

  /** Field whose distinct values are collected per group. */
  private static final String COUNT_FIELD = "publisher";

  /** Orders group counts by group value, placing a {@code null} group value first. */
  private static final Comparator<DistinctValuesCollector.GroupCount<Comparable<Object>, Comparable<Object>>>
      GROUP_VALUE_ORDER = (left, right) -> {
        if (left.groupValue == null) {
          return right.groupValue == null ? 0 : -1;
        }
        if (right.groupValue == null) {
          return 1;
        }
        return left.groupValue.compareTo(right.groupValue);
      };

  /**
   * Indexes seven hand-written documents spread over two segments and checks the distinct
   * {@code publisher} values reported per {@code author} group for three different queries.
   */
  public void testSimple() throws Exception {
    Random random = random();
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(
        random,
        dir,
        newIndexWriterConfig(new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));

    // 0
    Document doc = new Document();
    addField(doc, GROUP_FIELD, "1");
    addField(doc, COUNT_FIELD, "1");
    doc.add(new TextField("content", "random text", Field.Store.NO));
    doc.add(new StringField("id", "1", Field.Store.NO));
    w.addDocument(doc);

    // 1
    doc = new Document();
    addField(doc, GROUP_FIELD, "1");
    addField(doc, COUNT_FIELD, "1");
    doc.add(new TextField("content", "some more random text blob", Field.Store.NO));
    doc.add(new StringField("id", "2", Field.Store.NO));
    w.addDocument(doc);

    // 2
    doc = new Document();
    addField(doc, GROUP_FIELD, "1");
    addField(doc, COUNT_FIELD, "2");
    doc.add(new TextField("content", "some more random textual data", Field.Store.NO));
    doc.add(new StringField("id", "3", Field.Store.NO));
    w.addDocument(doc);
    w.commit(); // To ensure a second segment

    // 3 -- no count field
    doc = new Document();
    addField(doc, GROUP_FIELD, "2");
    doc.add(new TextField("content", "some random text", Field.Store.NO));
    doc.add(new StringField("id", "4", Field.Store.NO));
    w.addDocument(doc);

    // 4
    doc = new Document();
    addField(doc, GROUP_FIELD, "3");
    addField(doc, COUNT_FIELD, "1");
    doc.add(new TextField("content", "some more random text", Field.Store.NO));
    doc.add(new StringField("id", "5", Field.Store.NO));
    w.addDocument(doc);

    // 5
    doc = new Document();
    addField(doc, GROUP_FIELD, "3");
    addField(doc, COUNT_FIELD, "1");
    doc.add(new TextField("content", "random blob", Field.Store.NO));
    doc.add(new StringField("id", "6", Field.Store.NO));
    w.addDocument(doc);

    // 6 -- no author field
    doc = new Document();
    doc.add(new TextField("content", "random word stuck in alot of other text", Field.Store.YES));
    addField(doc, COUNT_FIELD, "1");
    // The id used to repeat "6" from the previous document; every document now has its own id.
    doc.add(new StringField("id", "7", Field.Store.NO));
    w.addDocument(doc);

    IndexSearcher indexSearcher = newSearcher(w.getReader());
    w.close();

    // === Search for content:random
    List<DistinctValuesCollector.GroupCount<Comparable<Object>, Comparable<Object>>> gcs =
        searchSortedByGroup(indexSearcher, "random");
    assertEquals(4, gcs.size());
    assertGroupCount(gcs.get(0), null, "1");
    assertGroupCount(gcs.get(1), "1", "1", "2");
    assertGroupCount(gcs.get(2), "2", (String) null);
    assertGroupCount(gcs.get(3), "3", "1");

    // === Search for content:some
    gcs = searchSortedByGroup(indexSearcher, "some");
    assertEquals(3, gcs.size());
    assertGroupCount(gcs.get(0), "1", "1", "2");
    assertGroupCount(gcs.get(1), "2", (String) null);
    assertGroupCount(gcs.get(2), "3", "1");

    // === Search for content:blob
    gcs = searchSortedByGroup(indexSearcher, "blob");
    assertEquals(2, gcs.size());
    // B/c the only one document matched with blob inside the author 1 group
    assertGroupCount(gcs.get(0), "1", "1");
    assertGroupCount(gcs.get(1), "3", "1");

    indexSearcher.getIndexReader().close();
    dir.close();
  }

  /**
   * Builds several random indexes and, for each, runs 100 random single-term searches whose
   * collected group counts are compared against a model computed while indexing.
   */
  public void testRandom() throws Exception {
    Random random = random();
    int numberOfRuns = TestUtil.nextInt(random, 3, 6);
    for (int indexIter = 0; indexIter < numberOfRuns; indexIter++) {
      IndexContext context = createIndexContext();
      for (int searchIter = 0; searchIter < 100; searchIter++) {
        final IndexSearcher searcher = newSearcher(context.indexReader);
        String term = context.contentStrings[random.nextInt(context.contentStrings.length)];
        Sort groupSort = new Sort(new SortField("id", SortField.Type.STRING));
        int topN = 1 + random.nextInt(10);

        List<DistinctValuesCollector.GroupCount<Comparable<Object>, Comparable<Object>>> expectedResult =
            createExpectedResult(context, term, groupSort, topN);

        FirstPassGroupingCollector<Comparable<Object>> firstCollector =
            createRandomFirstPassCollector(groupSort, GROUP_FIELD, topN);
        searcher.search(new TermQuery(new Term("content", term)), firstCollector);
        DistinctValuesCollector<Comparable<Object>, Comparable<Object>> distinctValuesCollector =
            createDistinctCountCollector(firstCollector, COUNT_FIELD);
        searcher.search(new TermQuery(new Term("content", term)), distinctValuesCollector);
        List<DistinctValuesCollector.GroupCount<Comparable<Object>, Comparable<Object>>> actualResult =
            distinctValuesCollector.getGroups();

        if (VERBOSE) {
          System.out.println("Index iter=" + indexIter);
          System.out.println("Search iter=" + searchIter);
          System.out.println("1st pass collector class name=" + firstCollector.getClass().getName());
          System.out.println("2nd pass collector class name=" + distinctValuesCollector.getClass().getName());
          System.out.println("Search term=" + term);
          System.out.println("1st pass groups=" + firstCollector.getTopGroups(0, false));
          System.out.println("Expected:");
          printGroups(expectedResult);
          System.out.println("Actual:");
          printGroups(actualResult);
        }

        assertEquals("number of groups for term=" + term, expectedResult.size(), actualResult.size());
        for (int i = 0; i < expectedResult.size(); i++) {
          DistinctValuesCollector.GroupCount<Comparable<Object>, Comparable<Object>> expected = expectedResult.get(i);
          DistinctValuesCollector.GroupCount<Comparable<Object>, Comparable<Object>> actual = actualResult.get(i);
          assertValues(expected.groupValue, actual.groupValue);
          assertEquals(
              "number of unique values in group " + i + " for term=" + term,
              expected.uniqueValues.size(),
              actual.uniqueValues.size());

          // Sets carry no order, so both sides are sorted the same way before the pairwise check.
          List<Comparable<?>> expectedUniqueValues = sortedUniqueValues(expected);
          List<Comparable<?>> actualUniqueValues = sortedUniqueValues(actual);
          for (int j = 0; j < expectedUniqueValues.size(); j++) {
            assertValues(expectedUniqueValues.get(j), actualUniqueValues.get(j));
          }
        }
      }
      context.indexReader.close();
      context.directory.close();
    }
  }

  /**
   * Runs the two-pass grouping search for {@code content:term} (top 10 groups, default sort) and
   * returns the collected group counts ordered by {@link #GROUP_VALUE_ORDER}.
   */
  private List<DistinctValuesCollector.GroupCount<Comparable<Object>, Comparable<Object>>> searchSortedByGroup(
      IndexSearcher searcher, String term) throws IOException {
    FirstPassGroupingCollector<Comparable<Object>> firstCollector =
        createRandomFirstPassCollector(new Sort(), GROUP_FIELD, 10);
    searcher.search(new TermQuery(new Term("content", term)), firstCollector);
    DistinctValuesCollector<Comparable<Object>, Comparable<Object>> distinctValuesCollector =
        createDistinctCountCollector(firstCollector, COUNT_FIELD);
    searcher.search(new TermQuery(new Term("content", term)), distinctValuesCollector);

    List<DistinctValuesCollector.GroupCount<Comparable<Object>, Comparable<Object>>> groups =
        distinctValuesCollector.getGroups();
    Collections.sort(groups, GROUP_VALUE_ORDER);
    return groups;
  }

  /** Returns the unique values of {@code group} as a list sorted with {@code null} first. */
  private static List<Comparable<?>> sortedUniqueValues(
      DistinctValuesCollector.GroupCount<Comparable<Object>, Comparable<Object>> group) {
    List<Comparable<?>> values = new ArrayList<>(group.uniqueValues);
    Collections.sort(values, nullComparator);
    return values;
  }

  /**
   * Asserts the group value and the complete set of unique values of one collected group.
   *
   * @param group the collected group to check
   * @param expectedGroupValue expected group value, or {@code null} for the "missing" group
   * @param expectedUniqueValues expected unique values in ascending order; a {@code null} element
   *     stands for a "missing" value
   */
  private void assertGroupCount(
      DistinctValuesCollector.GroupCount<Comparable<Object>, Comparable<Object>> group,
      String expectedGroupValue,
      String... expectedUniqueValues) {
    compareOrNull(expectedGroupValue, group.groupValue);
    List<Comparable<?>> actualUniqueValues = sortedUniqueValues(group);
    assertEquals(expectedUniqueValues.length, actualUniqueValues.size());
    for (int i = 0; i < expectedUniqueValues.length; i++) {
      compareOrNull(expectedUniqueValues[i], actualUniqueValues.get(i));
    }
  }

  /** Dispatches to {@link #compareNull(Object)} or {@link #compare(String, Object)}. */
  private void compareOrNull(String expected, Object actual) {
    if (expected == null) {
      compareNull(actual);
    } else {
      compare(expected, actual);
    }
  }

  /** Prints every group value followed by its unique values; only used for verbose output. */
  private void printGroups(List<DistinctValuesCollector.GroupCount<Comparable<Object>, Comparable<Object>>> results) {
    for (int i = 0; i < results.size(); i++) {
      DistinctValuesCollector.GroupCount<Comparable<Object>, Comparable<Object>> group = results.get(i);
      Object gv = group.groupValue;
      if (gv instanceof BytesRef) {
        System.out.println(i + ": groupValue=" + ((BytesRef) gv).utf8ToString());
      } else {
        System.out.println(i + ": groupValue=" + gv);
      }
      for (Object o : group.uniqueValues) {
        if (o instanceof BytesRef) {
          System.out.println("  " + ((BytesRef) o).utf8ToString());
        } else {
          System.out.println("  " + o);
        }
      }
    }
  }

  /**
   * Compares a model value with a collected value.
   *
   * @param expected a {@link BytesRef} from the model, or {@code null} for "missing"
   * @param actual the collected value in whatever representation the selector produced
   */
  private void assertValues(Object expected, Object actual) {
    if (expected == null) {
      compareNull(actual);
    } else {
      compare(((BytesRef) expected).utf8ToString(), actual);
    }
  }

  /**
   * Asserts that {@code groupValue} represents {@code expected}, whichever of the supported
   * representations it uses.
   */
  private void compare(String expected, Object groupValue) {
    // Without this guard a missing value surfaced as a NullPointerException rather than as a
    // test failure naming the value that was expected.
    assertNotNull("expected <" + expected + "> but the collected value was null", groupValue);
    if (groupValue instanceof BytesRef) {
      assertEquals(expected, ((BytesRef) groupValue).utf8ToString());
    } else if (groupValue instanceof Double) {
      assertEquals(Double.parseDouble(expected), groupValue);
    } else if (groupValue instanceof Long) {
      assertEquals(Long.parseLong(expected), groupValue);
    } else if (groupValue instanceof MutableValue) {
      MutableValueStr mutableValue = new MutableValueStr();
      mutableValue.value.copyChars(expected);
      assertEquals(mutableValue, groupValue);
    } else {
      fail("unsupported value type: " + groupValue.getClass().getName());
    }
  }

  /** Asserts that {@code groupValue} is the representation of a missing value. */
  private void compareNull(Object groupValue) {
    if (groupValue == null) {
      return; // term based impl...
    }
    // DV based impls..
    if (groupValue instanceof BytesRef) {
      assertEquals("", ((BytesRef) groupValue).utf8ToString());
    } else if (groupValue instanceof Double) {
      assertEquals(0.0d, groupValue);
    } else if (groupValue instanceof Long) {
      assertEquals(0L, groupValue);
      // Function based impl
    } else if (groupValue instanceof MutableValue) {
      assertFalse(((MutableValue) groupValue).exists());
    } else {
      fail("unsupported value type: " + groupValue.getClass().getName());
    }
  }

  /** Adds {@code value} to {@code doc} as a sorted doc-values field named {@code field}. */
  private void addField(Document doc, String field, String value) {
    doc.add(new SortedDocValuesField(field, new BytesRef(value)));
  }

  /**
   * Creates the second-pass collector for the groups found by {@code firstPassGroupingCollector}.
   * The selector for {@code countField} is of the same kind (value-source or term based) as the
   * first pass's group selector.
   */
  @SuppressWarnings({"unchecked", "rawtypes"})
  private <T extends Comparable<Object>, R extends Comparable<Object>> DistinctValuesCollector<T, R> createDistinctCountCollector(
      FirstPassGroupingCollector<T> firstPassGroupingCollector, String countField) throws IOException {
    Collection<SearchGroup<T>> searchGroups = firstPassGroupingCollector.getTopGroups(0, false);
    GroupSelector<T> selector = firstPassGroupingCollector.getGroupSelector();
    // A reflective check is kept on purpose: it does not depend on how the selector's type
    // argument relates to T.
    if (ValueSourceGroupSelector.class.isAssignableFrom(selector.getClass())) {
      GroupSelector gs = new ValueSourceGroupSelector(new BytesRefFieldSource(countField), new HashMap<>());
      return new DistinctValuesCollector<>(selector, searchGroups, gs);
    } else {
      GroupSelector ts = new TermGroupSelector(countField);
      return new DistinctValuesCollector<>(selector, searchGroups, ts);
    }
  }

  /**
   * Creates a first-pass collector on {@code groupField}, choosing at random between a
   * value-source based and a term based group selector.
   */
  @SuppressWarnings({"unchecked", "rawtypes"})
  private <T> FirstPassGroupingCollector<T> createRandomFirstPassCollector(
      Sort groupSort, String groupField, int topNGroups) throws IOException {
    Random random = random();
    if (random.nextBoolean()) {
      return (FirstPassGroupingCollector<T>) new FirstPassGroupingCollector<>(
          new ValueSourceGroupSelector(new BytesRefFieldSource(groupField), new HashMap<>()), groupSort, topNGroups);
    } else {
      return (FirstPassGroupingCollector<T>) new FirstPassGroupingCollector<>(
          new TermGroupSelector(groupField), groupSort, topNGroups);
    }
  }

  /**
   * Derives the expected group counts for {@code term} from the model recorded while indexing:
   * the first {@code topN} groups in the model's iteration order, each with its unique values as
   * {@link BytesRef}s ({@code null} stays {@code null}).
   *
   * <p>{@code groupSort} is accepted for symmetry with the search side but is not consulted.
   */
  @SuppressWarnings({"unchecked", "rawtypes"})
  private List<DistinctValuesCollector.GroupCount<Comparable<Object>, Comparable<Object>>> createExpectedResult(
      IndexContext context, String term, Sort groupSort, int topN) {
    // Raw types are needed: BytesRef is stored where the declared type is Comparable<Object>.
    List result = new ArrayList();
    Map<String, Set<String>> groupCounts = context.searchTermToGroupCounts.get(term);
    for (Map.Entry<String, Set<String>> entry : groupCounts.entrySet()) {
      if (result.size() >= topN) {
        break;
      }
      Set<BytesRef> uniqueValues = new HashSet<>();
      for (String val : entry.getValue()) {
        uniqueValues.add(val != null ? new BytesRef(val) : null);
      }
      String group = entry.getKey();
      result.add(new DistinctValuesCollector.GroupCount(group != null ? new BytesRef(group) : null, uniqueValues));
    }
    return result;
  }

  /**
   * Builds a random index together with the model used to verify searches against it. Roughly
   * one document in 23 has no group value and one in 21 has no count value.
   */
  private IndexContext createIndexContext() throws Exception {
    Random random = random();
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(
        random,
        dir,
        newIndexWriterConfig(new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));

    int numDocs = 86 + random.nextInt(1087) * RANDOM_MULTIPLIER;
    String[] groupValues = new String[numDocs / 5];
    String[] countValues = new String[numDocs / 10];
    for (int i = 0; i < groupValues.length; i++) {
      groupValues[i] = generateRandomNonEmptyString();
    }
    for (int i = 0; i < countValues.length; i++) {
      countValues[i] = generateRandomNonEmptyString();
    }

    List<String> contentStrings = new ArrayList<>();
    Map<String, Map<String, Set<String>>> searchTermToGroupCounts = new HashMap<>();
    for (int i = 1; i <= numDocs; i++) {
      String groupValue = random.nextInt(23) == 14 ? null : groupValues[random.nextInt(groupValues.length)];
      String countValue = random.nextInt(21) == 13 ? null : countValues[random.nextInt(countValues.length)];
      String content = "random" + random.nextInt(numDocs / 20);

      Map<String, Set<String>> groupToCounts = searchTermToGroupCounts.get(content);
      if (groupToCounts == null) {
        // Groups sort always DOCID asc... An insertion-ordered map keeps groups in the order
        // their first document was added, matching the ascending, zero-padded "id" values.
        groupToCounts = new LinkedHashMap<>();
        searchTermToGroupCounts.put(content, groupToCounts);
        contentStrings.add(content);
      }
      // groupValue and countValue may both be null; the map and set used here accept null.
      groupToCounts.computeIfAbsent(groupValue, key -> new HashSet<>()).add(countValue);

      String id = String.format(Locale.ROOT, "%09d", i);
      Document doc = new Document();
      doc.add(new StringField("id", id, Field.Store.YES));
      doc.add(new SortedDocValuesField("id", new BytesRef(id)));
      if (groupValue != null) {
        addField(doc, GROUP_FIELD, groupValue);
      }
      if (countValue != null) {
        addField(doc, COUNT_FIELD, countValue);
      }
      doc.add(new TextField("content", content, Field.Store.YES));
      w.addDocument(doc);
    }

    DirectoryReader reader = w.getReader();
    if (VERBOSE) {
      // NOTE(review): author/publisher are only added as doc-values fields in this method, so
      // the stored-field lookups below presumably print null -- confirm before relying on them.
      for (int docID = 0; docID < reader.maxDoc(); docID++) {
        Document doc = reader.document(docID);
        System.out.println("docID=" + docID + " id=" + doc.get("id") + " content=" + doc.get("content")
            + " author=" + doc.get("author") + " publisher=" + doc.get("publisher"));
      }
    }

    w.close();
    return new IndexContext(dir, reader, searchTermToGroupCounts, contentStrings.toArray(new String[0]));
  }

  /** A random index plus the model ({@code term -> group -> unique count values}) built with it. */
  private static class IndexContext {

    final Directory directory;
    final DirectoryReader indexReader;
    final Map<String, Map<String, Set<String>>> searchTermToGroupCounts;
    final String[] contentStrings;

    IndexContext(Directory directory, DirectoryReader indexReader,
                 Map<String, Map<String, Set<String>>> searchTermToGroupCounts, String[] contentStrings) {
      this.directory = directory;
      this.indexReader = indexReader;
      this.searchTermToGroupCounts = searchTermToGroupCounts;
      this.contentStrings = contentStrings;
    }
  }

  /** Natural-order comparator that treats {@code null} as smaller than every other value. */
  private static class NullComparator implements Comparator<Comparable<?>> {

    @Override
    @SuppressWarnings({"unchecked", "rawtypes"})
    public int compare(Comparable a, Comparable b) {
      if (a == b) {
        return 0;
      } else if (a == null) {
        return -1;
      } else if (b == null) {
        return 1;
      } else {
        return a.compareTo(b);
      }
    }
  }
}