package com.pearson.entech.elasticsearch.search.facet.approx.date; import static java.lang.Math.abs; import static java.lang.Math.max; import static java.lang.Math.pow; import static java.lang.Math.random; import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; import static org.elasticsearch.index.query.QueryBuilders.matchAllQuery; import static org.elasticsearch.node.NodeBuilder.nodeBuilder; import static org.hamcrest.CoreMatchers.equalTo; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertThat; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import java.io.IOException; import java.util.List; import java.util.Random; import java.util.concurrent.atomic.AtomicInteger; import org.elasticsearch.ElasticSearchException; import org.elasticsearch.action.admin.indices.cache.clear.ClearIndicesCacheRequest; import org.elasticsearch.action.admin.indices.create.CreateIndexRequest; import org.elasticsearch.action.admin.indices.delete.DeleteIndexRequest; import org.elasticsearch.action.admin.indices.optimize.OptimizeRequest; import org.elasticsearch.action.admin.indices.refresh.RefreshRequestBuilder; import org.elasticsearch.action.bulk.BulkRequestBuilder; import org.elasticsearch.action.index.IndexRequest; import org.elasticsearch.action.search.SearchResponse; import org.elasticsearch.action.search.SearchType; import org.elasticsearch.client.Client; import org.elasticsearch.common.Priority; import org.elasticsearch.common.joda.time.DateTimeZone; import org.elasticsearch.common.joda.time.format.ISODateTimeFormat; import org.elasticsearch.common.settings.ImmutableSettings; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.common.xcontent.XContentFactory; import org.elasticsearch.index.query.FilterBuilder; import org.elasticsearch.index.query.FilterBuilders; import org.elasticsearch.node.Node; import org.elasticsearch.search.facet.FacetBuilder; import org.junit.AfterClass; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; import com.google.common.base.Joiner; import com.pearson.entech.elasticsearch.search.facet.approx.date.external.DateFacetBuilder; import com.pearson.entech.elasticsearch.search.facet.approx.date.external.DistinctTimePeriod; import com.pearson.entech.elasticsearch.search.facet.approx.date.external.NullEntry; import com.pearson.entech.elasticsearch.search.facet.approx.date.internal.InternalDistinctFacet; public class RandomizedApproxReadWriteTest { private static Node __node; private static final long[] __days = { 1325376000000L, 1325376000000L + 86400000, 1325376000000L + 86400000 * 2, 1325376000000L + 86400000 * 3, 1325376000000L + 86400000 * 4, 1325376000000L + 86400000 * 5, 1325376000000L + 86400000 * 6, 1325376000000L + 86400000 * 7 }; private static final String __index = "myindex"; private static final String __type1 = "testtype"; private static final String __type2 = "anothertesttype"; private static final String __type3 = "yetanothertesttype"; private static final String __tsField = "timestamp"; private static final String __txtField = "txt"; private static final String __userField = "user"; private static final String __facetName = "histogram"; private static final AtomicInteger __counter = new AtomicInteger(0); private final Random _random = new Random(0); @BeforeClass public static void setUpClass() throws InterruptedException { final Settings settings = ImmutableSettings.settingsBuilder() .put("node.http.enabled", false) .put("index.gateway.type", "none") // Reluctantly removed this to reduce overall memory: // .put("index.store.type", "memory") .put("index.number_of_shards", 3) .put("index.number_of_replicas", 0) .put("index.merge.policy.merge_factor", 100) .put("path.data", "target") .put("refresh_interval", -1) .build(); __node = nodeBuilder() .local(true) .settings(settings) .clusterName("RandomizedApproxReadWriteTest") .node(); __node.start(); } @AfterClass public static void tearDownClass() { __node.close(); } @Before public void setUp() throws IOException { client().admin().indices().delete(new DeleteIndexRequest("_all")).actionGet(); client().admin().indices().create(new CreateIndexRequest(__index)).actionGet(); client().admin().cluster().prepareHealth().setWaitForGreenStatus().execute().actionGet(); final String mapping = XContentFactory.jsonBuilder() .startObject() .startObject(__type1) .startObject("_all").field("enabled", false).endObject() .startObject("_source").field("enabled", false).endObject() .startObject("properties") .startObject(__tsField).field("type", "date").field("store", "no").endObject() .startObject(__txtField).field("type", "string").field("store", "no").endObject() .startObject(__userField).field("type", "integer").field("store", "no").endObject() .endObject() .endObject() .endObject().string(); client().admin().indices() .preparePutMapping(__index) .setType(__type1) .setSource(mapping) .execute().actionGet(); client().admin().indices().clearCache( new ClearIndicesCacheRequest("_all")); System.gc(); assertEquals(0L, countAll()); } @Test public void testDateHistoFacetsCollectorMode() throws Exception { testDateHistoFacets(FacetBuilder.Mode.COLLECTOR); } @Test public void testDateHistoFacetsPostMode() throws Exception { testDateHistoFacets(FacetBuilder.Mode.POST); } private void testDateHistoFacets(final FacetBuilder.Mode mode) throws Exception { // Tests pass whether or not fields are explicitly mapped // final String mapping = jsonBuilder().startObject().startObject(__type2).startObject("properties") // .startObject("num").field("type", "integer").endObject() // .startObject("date").field("type", "date").endObject() // .endObject().endObject().endObject().string(); // client().admin().indices().preparePutMapping(__index).setType(__type2).setSource(mapping).execute().actionGet(); client().admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).setWaitForGreenStatus().execute().actionGet(); client().prepareIndex(__index, __type2).setSource(jsonBuilder().startObject() .field("date", "2009-03-05T01:01:01") .field("num", 1) .endObject()).execute().actionGet(); client().admin().indices().prepareFlush().setRefresh(true).execute().actionGet(); client().prepareIndex(__index, __type2).setSource(jsonBuilder().startObject() .field("date", "2009-03-05T04:01:01") .field("num", 2) .endObject()).execute().actionGet(); client().admin().indices().prepareRefresh().execute().actionGet(); client().prepareIndex(__index, __type2).setSource(jsonBuilder().startObject() .field("date", "2009-03-06T01:01:01") .field("num", 3) .endObject()).execute().actionGet(); client().admin().indices().prepareRefresh().execute().actionGet(); final SearchResponse searchResponse = client() .prepareSearch() .setQuery(matchAllQuery()) .addFacet(new DateFacetBuilder("stats1").keyField("date").distinctField("num").interval("day").mode(mode)) .addFacet(new DateFacetBuilder("stats2").keyField("date").distinctField("num").interval("day").preZone("-02:00").mode(mode)) .addFacet(new DateFacetBuilder("stats3").keyField("date").distinctField("num").interval("day").preZone("-02:00").mode(mode)) // .addFacet( // new DateFacetBuilder("stats4").keyField("date").distinctScript("doc['num'].distinct * 2").interval("day").preZone("-02:00") // .mode(mode)) .addFacet(new DateFacetBuilder("stats5").keyField("date").distinctField("num").interval("24h").mode(mode)) .addFacet( new DateFacetBuilder("stats6").keyField("date").distinctField("num").interval("day").preZone("-02:00").postZone("-02:00") .mode(mode)) .addFacet(new DateFacetBuilder("stats7").keyField("date").distinctField("num").interval("quarter").mode(mode)) .execute().actionGet(); if(searchResponse.getFailedShards() > 0) { System.out.println(searchResponse); fail(Joiner.on(", ").join(searchResponse.getShardFailures())); } InternalDistinctFacet facet = searchResponse.getFacets().facet("stats1"); assertThat(facet.getName(), equalTo("stats1")); assertThat(facet.getEntries().size(), equalTo(2)); assertThat(facet.getEntries().get(0).getTime(), equalTo(utcTimeInMillis("2009-03-05"))); assertThat(facet.getEntries().get(0).getTotalCount(), equalTo(2l)); assertThat(facet.getEntries().get(0).getDistinctCount(), equalTo(2l)); assertThat(facet.getEntries().get(1).getTime(), equalTo(utcTimeInMillis("2009-03-06"))); assertThat(facet.getEntries().get(1).getTotalCount(), equalTo(1l)); assertThat(facet.getEntries().get(1).getDistinctCount(), equalTo(1l)); assertThat(facet.getTotalCount(), equalTo(3l)); assertThat(facet.getDistinctCount(), equalTo(3l)); // time zone causes the dates to shift by 2 facet = searchResponse.getFacets().facet("stats2"); assertThat(facet.getName(), equalTo("stats2")); assertThat(facet.getEntries().size(), equalTo(2)); assertThat(facet.getEntries().get(0).getTime(), equalTo(utcTimeInMillis("2009-03-04"))); assertThat(facet.getEntries().get(0).getTotalCount(), equalTo(1l)); assertThat(facet.getEntries().get(0).getDistinctCount(), equalTo(1l)); assertThat(facet.getEntries().get(1).getTime(), equalTo(utcTimeInMillis("2009-03-05"))); assertThat(facet.getEntries().get(1).getTotalCount(), equalTo(2l)); assertThat(facet.getEntries().get(1).getDistinctCount(), equalTo(2l)); assertThat(facet.getTotalCount(), equalTo(3l)); assertThat(facet.getDistinctCount(), equalTo(3l)); // time zone causes the dates to shift by 2 facet = searchResponse.getFacets().facet("stats3"); assertThat(facet.getName(), equalTo("stats3")); assertThat(facet.getEntries().size(), equalTo(2)); assertThat(facet.getEntries().get(0).getTime(), equalTo(utcTimeInMillis("2009-03-04"))); assertThat(facet.getEntries().get(0).getTotalCount(), equalTo(1l)); assertThat(facet.getEntries().get(0).getDistinctCount(), equalTo(1l)); assertThat(facet.getEntries().get(1).getTime(), equalTo(utcTimeInMillis("2009-03-05"))); assertThat(facet.getEntries().get(1).getTotalCount(), equalTo(2l)); assertThat(facet.getEntries().get(1).getDistinctCount(), equalTo(2l)); assertThat(facet.getTotalCount(), equalTo(3l)); assertThat(facet.getDistinctCount(), equalTo(3l)); // time zone causes the dates to shift by 2 // facet = searchResponse.getFacets().facet("stats4"); // assertThat(facet.getName(), equalTo("stats4")); // assertThat(facet.getEntries().size(), equalTo(2)); // assertThat(facet.getEntries().get(0).getTime(), equalTo(utcTimeInMillis("2009-03-04"))); // assertThat(facet.getEntries().get(0).getTotalCount(), equalTo(1l)); // assertThat(facet.getEntries().get(0).getDistinctCount(), equalTo(1l)); // assertThat(facet.getEntries().get(1).getTime(), equalTo(utcTimeInMillis("2009-03-05"))); // assertThat(facet.getEntries().get(1).getTotalCount(), equalTo(2l)); // assertThat(facet.getEntries().get(0).getDistinctCount(), equalTo(2l)); facet = searchResponse.getFacets().facet("stats5"); assertThat(facet.getName(), equalTo("stats5")); assertThat(facet.getEntries().size(), equalTo(2)); assertThat(facet.getEntries().get(0).getTime(), equalTo(utcTimeInMillis("2009-03-05"))); assertThat(facet.getEntries().get(0).getTotalCount(), equalTo(2l)); assertThat(facet.getEntries().get(0).getDistinctCount(), equalTo(2l)); assertThat(facet.getEntries().get(1).getTime(), equalTo(utcTimeInMillis("2009-03-06"))); assertThat(facet.getEntries().get(1).getTotalCount(), equalTo(1l)); assertThat(facet.getEntries().get(1).getDistinctCount(), equalTo(1l)); assertThat(facet.getTotalCount(), equalTo(3l)); assertThat(facet.getDistinctCount(), equalTo(3l)); facet = searchResponse.getFacets().facet("stats6"); assertThat(facet.getName(), equalTo("stats6")); assertThat(facet.getEntries().size(), equalTo(2)); assertThat(facet.getEntries().get(0).getTime(), equalTo(utcTimeInMillis("2009-03-04") - TimeValue.timeValueHours(2).millis())); assertThat(facet.getEntries().get(0).getTotalCount(), equalTo(1l)); assertThat(facet.getEntries().get(0).getDistinctCount(), equalTo(1l)); assertThat(facet.getEntries().get(1).getTime(), equalTo(utcTimeInMillis("2009-03-05") - TimeValue.timeValueHours(2).millis())); assertThat(facet.getEntries().get(1).getTotalCount(), equalTo(2l)); assertThat(facet.getEntries().get(1).getDistinctCount(), equalTo(2l)); assertThat(facet.getTotalCount(), equalTo(3l)); assertThat(facet.getDistinctCount(), equalTo(3l)); facet = searchResponse.getFacets().facet("stats7"); assertThat(facet.getName(), equalTo("stats7")); assertThat(facet.getEntries().size(), equalTo(1)); assertThat(facet.getEntries().get(0).getTime(), equalTo(utcTimeInMillis("2009-01-01"))); assertThat(facet.getTotalCount(), equalTo(3l)); assertThat(facet.getDistinctCount(), equalTo(3l)); } @Test // https://github.com/elasticsearch/elasticsearch/issues/2141 public void testDateHistoFacets_preZoneBug() throws Exception { // Tests pass whether or not fields are explicitly mapped // final String mapping = jsonBuilder().startObject().startObject(__type3).startObject("properties") // .startObject("num").field("type", "integer").endObject() // .startObject("date").field("type", "date").endObject() // .endObject().endObject().endObject().string(); // client().admin().indices().preparePutMapping(__index).setType(__type3).setSource(mapping).execute().actionGet(); client().admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).setWaitForGreenStatus().execute().actionGet(); client().prepareIndex(__index, __type3).setSource(jsonBuilder().startObject() .field("date", "2009-03-05T23:31:01") .field("num", 1) .endObject()).execute().actionGet(); client().admin().indices().prepareFlush().setRefresh(true).execute().actionGet(); client().prepareIndex(__index, __type3).setSource(jsonBuilder().startObject() .field("date", "2009-03-05T18:01:01") .field("num", 2) .endObject()).execute().actionGet(); client().admin().indices().prepareRefresh().execute().actionGet(); client().prepareIndex(__index, __type3).setSource(jsonBuilder().startObject() .field("date", "2009-03-05T22:01:01") .field("num", 3) .endObject()).execute().actionGet(); client().admin().indices().prepareRefresh().execute().actionGet(); final SearchResponse searchResponse = client().prepareSearch() .setQuery(matchAllQuery()) .addFacet(new DateFacetBuilder("stats1").keyField("date").distinctField("num").interval("day").preZone("+02:00")) .addFacet(new DateFacetBuilder("stats2").keyField("date").distinctField("num").interval("day").preZone("+01:30")) .execute().actionGet(); if(searchResponse.getFailedShards() > 0) { System.out.println(searchResponse); fail(Joiner.on(", ").join(searchResponse.getShardFailures())); } // time zone causes the dates to shift by 2:00 InternalDistinctFacet facet = searchResponse.getFacets().facet("stats1"); assertThat(facet.getName(), equalTo("stats1")); assertThat(facet.getEntries().size(), equalTo(2)); assertThat(facet.getEntries().get(0).getTime(), equalTo(utcTimeInMillis("2009-03-05"))); assertThat(facet.getEntries().get(0).getTotalCount(), equalTo(1l)); assertThat(facet.getEntries().get(0).getDistinctCount(), equalTo(1l)); assertThat(facet.getEntries().get(1).getTime(), equalTo(utcTimeInMillis("2009-03-06"))); assertThat(facet.getEntries().get(1).getTotalCount(), equalTo(2l)); assertThat(facet.getEntries().get(1).getDistinctCount(), equalTo(2l)); assertThat(facet.getTotalCount(), equalTo(3l)); assertThat(facet.getDistinctCount(), equalTo(3l)); // time zone causes the dates to shift by 1:30 facet = searchResponse.getFacets().facet("stats2"); assertThat(facet.getName(), equalTo("stats2")); assertThat(facet.getEntries().size(), equalTo(2)); assertThat(facet.getEntries().get(0).getTime(), equalTo(utcTimeInMillis("2009-03-05"))); assertThat(facet.getEntries().get(0).getTotalCount(), equalTo(2l)); assertThat(facet.getEntries().get(0).getDistinctCount(), equalTo(2l)); assertThat(facet.getEntries().get(1).getTime(), equalTo(utcTimeInMillis("2009-03-06"))); assertThat(facet.getEntries().get(1).getTotalCount(), equalTo(1l)); assertThat(facet.getEntries().get(1).getDistinctCount(), equalTo(1l)); assertThat(facet.getTotalCount(), equalTo(3l)); assertThat(facet.getDistinctCount(), equalTo(3l)); } @Test public void testWithMaxOneDocPerDayBucketOnAtomicField() throws Exception { putSync(newID(), 1, __days[0]); putSync(newID(), 1, __days[2]); putSync(newID(), 1, __days[4]); putSync(newID(), 1, __days[6]); assertEquals(4, countAll()); final SearchResponse response = getHistogram(__days[0], __days[7], "day", __userField); assertEquals(4, response.getHits().getTotalHits()); final InternalDistinctFacet facet = response.getFacets().facet(__facetName); final List<DistinctTimePeriod<NullEntry>> facetList = facet.entries(); // Expecting just one hit and one distinct hit per doc, for the username. assertEquals(4, facetList.size()); assertEquals(__days[0], facetList.get(0).getTime()); assertEquals(1, facetList.get(0).getTotalCount()); assertEquals(1, facetList.get(0).getDistinctCount()); assertEquals(__days[2], facetList.get(1).getTime()); assertEquals(1, facetList.get(1).getTotalCount()); assertEquals(1, facetList.get(1).getDistinctCount()); assertEquals(__days[4], facetList.get(2).getTime()); assertEquals(1, facetList.get(2).getTotalCount()); assertEquals(1, facetList.get(2).getDistinctCount()); assertEquals(__days[6], facetList.get(3).getTime()); assertEquals(1, facetList.get(3).getTotalCount()); assertEquals(1, facetList.get(3).getDistinctCount()); assertThat(facet.getTotalCount(), equalTo(4l)); assertThat(facet.getDistinctCount(), equalTo(1l)); // same user each time } @Test public void testWithMaxOneDocPerDayBucketOnAnalysedField() throws Exception { putSync(newID(), 1, __days[0]); putSync(newID(), 1, __days[2]); putSync(newID(), 1, __days[4]); putSync(newID(), 1, __days[6]); assertEquals(4, countAll()); final SearchResponse response = getHistogram(__days[0], __days[7], "day", __txtField); assertEquals(4, response.getHits().getTotalHits()); final InternalDistinctFacet facet = response.getFacets().facet(__facetName); final List<DistinctTimePeriod<NullEntry>> facetList = facet.entries(); // Expecting one hit for each token in the string "Document created [at] <TIMESTAMP>" // for each document, in this case these are unique per bucket too. The word "at" // is a stopword and is removed. assertEquals(4, facetList.size()); assertEquals(__days[0], facetList.get(0).getTime()); assertEquals(3, facetList.get(0).getTotalCount()); assertEquals(3, facetList.get(0).getDistinctCount()); assertEquals(__days[2], facetList.get(1).getTime()); assertEquals(3, facetList.get(1).getTotalCount()); assertEquals(3, facetList.get(1).getDistinctCount()); assertEquals(__days[4], facetList.get(2).getTime()); assertEquals(3, facetList.get(2).getTotalCount()); assertEquals(3, facetList.get(2).getDistinctCount()); assertEquals(__days[6], facetList.get(3).getTime()); assertEquals(3, facetList.get(3).getTotalCount()); assertEquals(3, facetList.get(3).getDistinctCount()); assertThat(facet.getTotalCount(), equalTo(12l)); assertThat(facet.getDistinctCount(), equalTo(6l)); // "document", "created", 4 usernames } @Test public void testWithMultipleDocsPerDayBucketOnAtomicField() throws Exception { putSync(newID(), 1, __days[0]); putSync(newID(), 2, __days[0] + 10); putSync(newID(), 1, __days[0] + 20); putSync(newID(), 1, __days[2]); putSync(newID(), 1, __days[4]); putSync(newID(), 1, __days[6]); putSync(newID(), 3, __days[6] + 10); putSync(newID(), 4, __days[6] + 20); assertEquals(8, countAll()); final SearchResponse response = getHistogram(__days[0], __days[7], "day", __userField); assertEquals(8, response.getHits().getTotalHits()); final InternalDistinctFacet facet = response.getFacets().facet(__facetName); final List<DistinctTimePeriod<NullEntry>> facetList = facet.entries(); // Hits and distinct hits can now vary in intervals where the same user posted more // than once (i.e. day 0 here). assertEquals(4, facetList.size()); assertEquals(__days[0], facetList.get(0).getTime()); assertEquals(3, facetList.get(0).getTotalCount()); assertEquals(2, facetList.get(0).getDistinctCount()); assertEquals(__days[2], facetList.get(1).getTime()); assertEquals(1, facetList.get(1).getTotalCount()); assertEquals(1, facetList.get(1).getDistinctCount()); assertEquals(__days[4], facetList.get(2).getTime()); assertEquals(1, facetList.get(2).getTotalCount()); assertEquals(1, facetList.get(2).getDistinctCount()); assertEquals(__days[6], facetList.get(3).getTime()); assertEquals(3, facetList.get(3).getTotalCount()); assertEquals(3, facetList.get(3).getDistinctCount()); assertThat(facet.getTotalCount(), equalTo(8l)); assertThat(facet.getDistinctCount(), equalTo(4l)); // 4 different users } @Test public void testWithMultipleDocsPerDayBucketOnAnalysedField() throws Exception { putSync(newID(), 1, __days[0]); putSync(newID(), 2, __days[0] + 10); putSync(newID(), 1, __days[0] + 20); putSync(newID(), 1, __days[2]); putSync(newID(), 1, __days[4]); putSync(newID(), 1, __days[6]); putSync(newID(), 3, __days[6] + 10); putSync(newID(), 4, __days[6] + 20); assertEquals(8, countAll()); final SearchResponse response = getHistogram(__days[0], __days[7], "day", __txtField); assertEquals(8, response.getHits().getTotalHits()); final InternalDistinctFacet facet = response.getFacets().facet(__facetName); final List<DistinctTimePeriod<NullEntry>> facetList = facet.entries(); // Now things get a bit more complex as all the posts are identically worded apart // from the timestamp at the end. 3 tokens indexed per each instance of the field. assertEquals(4, facetList.size()); assertEquals(__days[0], facetList.get(0).getTime()); assertEquals(3 * 3, facetList.get(0).getTotalCount()); assertEquals(2 + (1 * 3), facetList.get(0).getDistinctCount()); assertEquals(__days[2], facetList.get(1).getTime()); assertEquals(1 * 3, facetList.get(1).getTotalCount()); assertEquals(1 * 3, facetList.get(1).getDistinctCount()); assertEquals(__days[4], facetList.get(2).getTime()); assertEquals(1 * 3, facetList.get(2).getTotalCount()); assertEquals(1 * 3, facetList.get(2).getDistinctCount()); assertEquals(__days[6], facetList.get(3).getTime()); assertEquals(3 * 3, facetList.get(3).getTotalCount()); assertEquals(2 + (1 * 3), facetList.get(3).getDistinctCount()); assertThat(facet.getTotalCount(), equalTo(24l)); assertThat(facet.getDistinctCount(), equalTo(10l)); // "document", "created", 8 usernames } @Test public void testRandomizedWithManyItemsOnDayBucket() throws Exception { for(int t = 1; t <= 20; t++) { setUp(); final int minPerDay = (int) pow(2, t); System.out.println("Randomized testing: inserting minimum " + 7 * minPerDay + " items"); final int[] itemsPerDay = prepareRandomData(minPerDay); final int totalItems = add(itemsPerDay); assertEquals(totalItems, countAll()); System.out.println("Randomized testing: running facet"); final SearchResponse response = getHistogram(__days[0], __days[7], "day", __userField, 1000); final InternalDistinctFacet facet1 = response.getFacets().facet(__facetName); final List<DistinctTimePeriod<NullEntry>> facetList1 = facet1.entries(); assertEquals(7, facetList1.size()); assertEquals(totalItems, facet1.getTotalCount()); int tolerance = totalItems / 100; int totalDistinct = totalItems; assertTrue(String.format( "With %d total distinct items: Estimated overall distinct count %d is not within 1%% tolerance of %d", totalDistinct, facet1.getDistinctCount(), totalDistinct), abs(totalDistinct - facet1.getDistinctCount()) <= tolerance); for(int i = 0; i < 7; i++) { final int exactUsers = itemsPerDay[i]; assertEquals(exactUsers, facetList1.get(i).getTotalCount()); tolerance = exactUsers / 100; final long fuzzyUsers = facetList1.get(i).getDistinctCount(); //System.out.println("Exact user count = " + exactUsers); //System.out.println("Fuzzy user count = " + fuzzyUsers); assertTrue(String.format( "With > %d terms per day: Estimated count %d is not within 1%% tolerance of %d", minPerDay, fuzzyUsers, exactUsers), abs(fuzzyUsers - exactUsers) <= tolerance); } final SearchResponse response2 = getHistogram(__days[0], __days[7], "day", __txtField, 1000); final InternalDistinctFacet facet2 = response2.getFacets().facet(__facetName); final List<DistinctTimePeriod<NullEntry>> facetList2 = facet2.entries(); assertEquals(7, facetList2.size()); assertEquals(3 * totalItems, facet2.getTotalCount()); tolerance = totalItems / 100; totalDistinct = 2 + totalItems; assertTrue(String.format( "With %d total distinct items: Estimated overall distinct count %d is not within 1%% tolerance of %d", totalDistinct, facet2.getDistinctCount(), totalDistinct), abs(totalDistinct - facet2.getDistinctCount()) <= tolerance); for(int i = 0; i < 7; i++) { final int exactTokens = itemsPerDay[i] * 3; // "Document created [by] <ID>" final int exactDistinctTokens = itemsPerDay[i] + 2; assertEquals(exactTokens, facetList2.get(i).getTotalCount()); tolerance = exactDistinctTokens / 100; final long fuzzyDistinctTokens = facetList2.get(i).getDistinctCount(); //System.out.println("Exact distinct token count = " + exactDistinctTokens); //System.out.println("Fuzzy distinct token count = " + fuzzyDistinctTokens); assertTrue(String.format( "With > %d terms per day: Estimated count %d is not within 1%% tolerance of %d", minPerDay, fuzzyDistinctTokens, exactDistinctTokens), abs(fuzzyDistinctTokens - exactDistinctTokens) <= tolerance); } } } // Helper methods private static int newID() { return __counter.getAndIncrement(); } private SearchResponse getHistogram(final long start, final long end, final String interval, final String distinctField) { return getHistogram(start, end, interval, distinctField, 0); } private SearchResponse getHistogram(final long start, final long end, final String interval, final String distinctField, final int exactThreshold) { final FilterBuilder range = FilterBuilders.numericRangeFilter(__tsField) .from(start) .to(end); final DateFacetBuilder facet = new DateFacetBuilder(__facetName) .keyField(__tsField) .distinctField(distinctField) .facetFilter(range) .exactThreshold(exactThreshold) .interval(interval); return client().prepareSearch(__index) .setSearchType(SearchType.COUNT) .addFacet(facet) .execute() .actionGet(); } private void putSync(final int id, final int user, final long timestamp) throws ElasticSearchException, IOException { final String stringID = String.valueOf(id); client().prepareIndex(__index, __type1, String.valueOf(stringID)) .setRefresh(true) .setRouting(stringID) .setSource(XContentFactory.jsonBuilder() .startObject() .field(__txtField, "Document created at " + timestamp) .field(__userField, user) .field(__tsField, timestamp) .endObject()).execute().actionGet(); } private void putBulk(final String[] ids, final int[] users, final long[] timestamps) throws Exception { final int batchSize = 5000; for(int i = 0; i < ids.length; i += batchSize) { final BulkRequestBuilder bulk = client().prepareBulk(); for(int j = 0; j < batchSize; j++) { final int idx = i + j; if(idx >= ids.length) { bulk.setRefresh(true).execute().actionGet(); return; } bulk.add(new IndexRequest(__index, __type1, ids[idx]) .routing(ids[idx]) .source(XContentFactory.jsonBuilder() .startObject() .field(__txtField, "Document created by " + users[idx]) .field(__userField, users[idx]) .field(__tsField, timestamps[idx]) .endObject())); } bulk.execute().actionGet(); } new RefreshRequestBuilder(client().admin().indices()).execute(); } private int[] prepareRandomData(final int minPerDay) throws Exception { final int[] itemsPerDay = new int[7]; final int variationPerDay = max(1, minPerDay / 10); for(int i = 0; i < 7; i++) { itemsPerDay[i] = minPerDay + _random.nextInt(variationPerDay); final int[] ids = new int[itemsPerDay[i]]; final String[] stringIDs = new String[itemsPerDay[i]]; final long[] timestamps = new long[itemsPerDay[i]]; for(int j = 0; j < itemsPerDay[i]; j++) { timestamps[j] = __days[i] + (60 * 1000 * (int) (random() * 1440)); ids[j] = newID(); stringIDs[j] = String.valueOf(ids[j]); } putBulk(stringIDs, ids, timestamps); } client().admin().indices().optimize( new OptimizeRequest().waitForMerge(true)); return itemsPerDay; } private int add(final int[] ints) { int total = 0; for(final int i : ints) { total += i; } return total; } private long countAll() { return client() .prepareCount("_all") .execute() .actionGet() .getCount(); } private long utcTimeInMillis(final String time) { return timeInMillis(time, DateTimeZone.UTC); } private long timeInMillis(final String time, final DateTimeZone zone) { return ISODateTimeFormat.dateOptionalTimeParser().withZone(zone).parseMillis(time); } private Client client() { return __node.client(); } }