/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.util.bkd; import java.io.IOException; import java.math.BigInteger; import java.util.ArrayList; import java.util.Arrays; import java.util.BitSet; import java.util.List; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.MergeState; import org.apache.lucene.index.PointValues.IntersectVisitor; import org.apache.lucene.index.PointValues.Relation; import org.apache.lucene.index.PointValues; import org.apache.lucene.store.CorruptingIndexOutput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FilterDirectory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.MockDirectoryWrapper; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.NumericUtils; import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.TestUtil; public class TestBKD extends LuceneTestCase { public void testBasicInts1D() throws Exception { try (Directory dir = getDirectory(100)) { BKDWriter w = new BKDWriter(100, dir, "tmp", 1, 4, 2, 1.0f, 100, true); byte[] scratch = new byte[4]; for(int docID=0;docID<100;docID++) { NumericUtils.intToSortableBytes(docID, scratch, 0); w.add(scratch, docID); } long indexFP; try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) { indexFP = w.finish(out); } try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) { in.seek(indexFP); BKDReader r = new BKDReader(in); // Simple 1D range query: final int queryMin = 42; final int queryMax = 87; final BitSet hits = new BitSet(); r.intersect(new IntersectVisitor() { @Override public void visit(int docID) { hits.set(docID); if (VERBOSE) { System.out.println("visit docID=" + docID); } } @Override public void visit(int docID, byte[] packedValue) { int x = NumericUtils.sortableBytesToInt(packedValue, 0); if (VERBOSE) { System.out.println("visit docID=" + docID + " x=" + x); } if (x >= queryMin && x <= queryMax) { hits.set(docID); } } @Override public Relation compare(byte[] minPacked, byte[] maxPacked) { int min = NumericUtils.sortableBytesToInt(minPacked, 0); int max = NumericUtils.sortableBytesToInt(maxPacked, 0); assert max >= min; if (VERBOSE) { System.out.println("compare: min=" + min + " max=" + max + " vs queryMin=" + queryMin + " queryMax=" + queryMax); } if (max < queryMin || min > queryMax) { return Relation.CELL_OUTSIDE_QUERY; } else if (min >= queryMin && max <= queryMax) { return Relation.CELL_INSIDE_QUERY; } else { return Relation.CELL_CROSSES_QUERY; } } }); for(int docID=0;docID<100;docID++) { boolean expected = docID >= queryMin && docID <= queryMax; boolean actual = hits.get(docID); assertEquals("docID=" + docID, expected, actual); } } } } public void testRandomIntsNDims() throws Exception { int numDocs = atLeast(1000); try (Directory dir = getDirectory(numDocs)) { int numDims = TestUtil.nextInt(random(), 1, 5); int maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 100); float maxMB = (float) 3.0 + (3*random().nextFloat()); BKDWriter w = new BKDWriter(numDocs, dir, "tmp", numDims, 4, maxPointsInLeafNode, maxMB, numDocs, true); if (VERBOSE) { System.out.println("TEST: numDims=" + numDims + " numDocs=" + numDocs); } int[][] docs = new int[numDocs][]; byte[] scratch = new byte[4*numDims]; int[] minValue = new int[numDims]; int[] maxValue = new int[numDims]; Arrays.fill(minValue, Integer.MAX_VALUE); Arrays.fill(maxValue, Integer.MIN_VALUE); for(int docID=0;docID<numDocs;docID++) { int[] values = new int[numDims]; if (VERBOSE) { System.out.println(" docID=" + docID); } for(int dim=0;dim<numDims;dim++) { values[dim] = random().nextInt(); if (values[dim] < minValue[dim]) { minValue[dim] = values[dim]; } if (values[dim] > maxValue[dim]) { maxValue[dim] = values[dim]; } NumericUtils.intToSortableBytes(values[dim], scratch, dim * Integer.BYTES); if (VERBOSE) { System.out.println(" " + dim + " -> " + values[dim]); } } docs[docID] = values; w.add(scratch, docID); } long indexFP; try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) { indexFP = w.finish(out); } try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) { in.seek(indexFP); BKDReader r = new BKDReader(in); byte[] minPackedValue = r.getMinPackedValue(); byte[] maxPackedValue = r.getMaxPackedValue(); for(int dim=0;dim<numDims;dim++) { assertEquals(minValue[dim], NumericUtils.sortableBytesToInt(minPackedValue, dim * Integer.BYTES)); assertEquals(maxValue[dim], NumericUtils.sortableBytesToInt(maxPackedValue, dim * Integer.BYTES)); } int iters = atLeast(100); for(int iter=0;iter<iters;iter++) { if (VERBOSE) { System.out.println("\nTEST: iter=" + iter); } // Random N dims rect query: int[] queryMin = new int[numDims]; int[] queryMax = new int[numDims]; for(int dim=0;dim<numDims;dim++) { queryMin[dim] = random().nextInt(); queryMax[dim] = random().nextInt(); if (queryMin[dim] > queryMax[dim]) { int x = queryMin[dim]; queryMin[dim] = queryMax[dim]; queryMax[dim] = x; } } final BitSet hits = new BitSet(); r.intersect(new IntersectVisitor() { @Override public void visit(int docID) { hits.set(docID); //System.out.println("visit docID=" + docID); } @Override public void visit(int docID, byte[] packedValue) { //System.out.println("visit check docID=" + docID); for(int dim=0;dim<numDims;dim++) { int x = NumericUtils.sortableBytesToInt(packedValue, dim * Integer.BYTES); if (x < queryMin[dim] || x > queryMax[dim]) { //System.out.println(" no"); return; } } //System.out.println(" yes"); hits.set(docID); } @Override public Relation compare(byte[] minPacked, byte[] maxPacked) { boolean crosses = false; for(int dim=0;dim<numDims;dim++) { int min = NumericUtils.sortableBytesToInt(minPacked, dim * Integer.BYTES); int max = NumericUtils.sortableBytesToInt(maxPacked, dim * Integer.BYTES); assert max >= min; if (max < queryMin[dim] || min > queryMax[dim]) { return Relation.CELL_OUTSIDE_QUERY; } else if (min < queryMin[dim] || max > queryMax[dim]) { crosses = true; } } if (crosses) { return Relation.CELL_CROSSES_QUERY; } else { return Relation.CELL_INSIDE_QUERY; } } }); for(int docID=0;docID<numDocs;docID++) { int[] docValues = docs[docID]; boolean expected = true; for(int dim=0;dim<numDims;dim++) { int x = docValues[dim]; if (x < queryMin[dim] || x > queryMax[dim]) { expected = false; break; } } boolean actual = hits.get(docID); assertEquals("docID=" + docID, expected, actual); } } } } } // Tests on N-dimensional points where each dimension is a BigInteger public void testBigIntNDims() throws Exception { int numDocs = atLeast(1000); try (Directory dir = getDirectory(numDocs)) { int numBytesPerDim = TestUtil.nextInt(random(), 2, 30); int numDims = TestUtil.nextInt(random(), 1, 5); int maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 100); float maxMB = (float) 3.0 + (3*random().nextFloat()); BKDWriter w = new BKDWriter(numDocs, dir, "tmp", numDims, numBytesPerDim, maxPointsInLeafNode, maxMB, numDocs, true); BigInteger[][] docs = new BigInteger[numDocs][]; byte[] scratch = new byte[numBytesPerDim*numDims]; for(int docID=0;docID<numDocs;docID++) { BigInteger[] values = new BigInteger[numDims]; if (VERBOSE) { System.out.println(" docID=" + docID); } for(int dim=0;dim<numDims;dim++) { values[dim] = randomBigInt(numBytesPerDim); NumericUtils.bigIntToSortableBytes(values[dim], numBytesPerDim, scratch, dim * numBytesPerDim); if (VERBOSE) { System.out.println(" " + dim + " -> " + values[dim]); } } docs[docID] = values; w.add(scratch, docID); } long indexFP; try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) { indexFP = w.finish(out); } try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) { in.seek(indexFP); BKDReader r = new BKDReader(in); int iters = atLeast(100); for(int iter=0;iter<iters;iter++) { if (VERBOSE) { System.out.println("\nTEST: iter=" + iter); } // Random N dims rect query: BigInteger[] queryMin = new BigInteger[numDims]; BigInteger[] queryMax = new BigInteger[numDims]; for(int dim=0;dim<numDims;dim++) { queryMin[dim] = randomBigInt(numBytesPerDim); queryMax[dim] = randomBigInt(numBytesPerDim); if (queryMin[dim].compareTo(queryMax[dim]) > 0) { BigInteger x = queryMin[dim]; queryMin[dim] = queryMax[dim]; queryMax[dim] = x; } } final BitSet hits = new BitSet(); r.intersect(new IntersectVisitor() { @Override public void visit(int docID) { hits.set(docID); //System.out.println("visit docID=" + docID); } @Override public void visit(int docID, byte[] packedValue) { //System.out.println("visit check docID=" + docID); for(int dim=0;dim<numDims;dim++) { BigInteger x = NumericUtils.sortableBytesToBigInt(packedValue, dim * numBytesPerDim, numBytesPerDim); if (x.compareTo(queryMin[dim]) < 0 || x.compareTo(queryMax[dim]) > 0) { //System.out.println(" no"); return; } } //System.out.println(" yes"); hits.set(docID); } @Override public Relation compare(byte[] minPacked, byte[] maxPacked) { boolean crosses = false; for(int dim=0;dim<numDims;dim++) { BigInteger min = NumericUtils.sortableBytesToBigInt(minPacked, dim * numBytesPerDim, numBytesPerDim); BigInteger max = NumericUtils.sortableBytesToBigInt(maxPacked, dim * numBytesPerDim, numBytesPerDim); assert max.compareTo(min) >= 0; if (max.compareTo(queryMin[dim]) < 0 || min.compareTo(queryMax[dim]) > 0) { return Relation.CELL_OUTSIDE_QUERY; } else if (min.compareTo(queryMin[dim]) < 0 || max.compareTo(queryMax[dim]) > 0) { crosses = true; } } if (crosses) { return Relation.CELL_CROSSES_QUERY; } else { return Relation.CELL_INSIDE_QUERY; } } }); for(int docID=0;docID<numDocs;docID++) { BigInteger[] docValues = docs[docID]; boolean expected = true; for(int dim=0;dim<numDims;dim++) { BigInteger x = docValues[dim]; if (x.compareTo(queryMin[dim]) < 0 || x.compareTo(queryMax[dim]) > 0) { expected = false; break; } } boolean actual = hits.get(docID); assertEquals("docID=" + docID, expected, actual); } } } } } /** Make sure we close open files, delete temp files, etc., on exception */ public void testWithExceptions() throws Exception { int numDocs = atLeast(10000); int numBytesPerDim = TestUtil.nextInt(random(), 2, 30); int numDims = TestUtil.nextInt(random(), 1, 5); byte[][][] docValues = new byte[numDocs][][]; for(int docID=0;docID<numDocs;docID++) { byte[][] values = new byte[numDims][]; for(int dim=0;dim<numDims;dim++) { values[dim] = new byte[numBytesPerDim]; random().nextBytes(values[dim]); } docValues[docID] = values; } double maxMBHeap = 0.05; // Keep retrying until we 1) we allow a big enough heap, and 2) we hit a random IOExc from MDW: boolean done = false; while (done == false) { MockDirectoryWrapper dir = newMockFSDirectory(createTempDir()); try { dir.setRandomIOExceptionRate(0.05); dir.setRandomIOExceptionRateOnOpen(0.05); verify(dir, docValues, null, numDims, numBytesPerDim, 50, maxMBHeap); } catch (IllegalArgumentException iae) { // This just means we got a too-small maxMB for the maxPointsInLeafNode; just retry w/ more heap assertTrue(iae.getMessage().contains("either increase maxMBSortInHeap or decrease maxPointsInLeafNode")); maxMBHeap *= 1.25; } catch (IOException ioe) { if (ioe.getMessage().contains("a random IOException")) { // BKDWriter should fully clean up after itself: done = true; } else { throw ioe; } } String[] files = dir.listAll(); assertTrue("files=" + Arrays.toString(files), files.length == 0 || Arrays.equals(files, new String[] {"extra0"})); dir.close(); } } public void testRandomBinaryTiny() throws Exception { doTestRandomBinary(10); } public void testRandomBinaryMedium() throws Exception { doTestRandomBinary(10000); } @Nightly public void testRandomBinaryBig() throws Exception { doTestRandomBinary(200000); } public void testTooLittleHeap() throws Exception { try (Directory dir = getDirectory(0)) { IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> { new BKDWriter(1, dir, "bkd", 1, 16, 1000000, 0.001, 0, true); }); assertTrue(expected.getMessage().contains("either increase maxMBSortInHeap or decrease maxPointsInLeafNode")); } } private void doTestRandomBinary(int count) throws Exception { int numDocs = TestUtil.nextInt(random(), count, count*2); int numBytesPerDim = TestUtil.nextInt(random(), 2, 30); int numDims = TestUtil.nextInt(random(), 1, 5); byte[][][] docValues = new byte[numDocs][][]; for(int docID=0;docID<numDocs;docID++) { byte[][] values = new byte[numDims][]; for(int dim=0;dim<numDims;dim++) { values[dim] = new byte[numBytesPerDim]; random().nextBytes(values[dim]); } docValues[docID] = values; } verify(docValues, null, numDims, numBytesPerDim); } public void testAllEqual() throws Exception { int numBytesPerDim = TestUtil.nextInt(random(), 2, 30); int numDims = TestUtil.nextInt(random(), 1, 5); int numDocs = atLeast(1000); byte[][][] docValues = new byte[numDocs][][]; for(int docID=0;docID<numDocs;docID++) { if (docID == 0) { byte[][] values = new byte[numDims][]; for(int dim=0;dim<numDims;dim++) { values[dim] = new byte[numBytesPerDim]; random().nextBytes(values[dim]); } docValues[docID] = values; } else { docValues[docID] = docValues[0]; } } verify(docValues, null, numDims, numBytesPerDim); } public void testOneDimEqual() throws Exception { int numBytesPerDim = TestUtil.nextInt(random(), 2, 30); int numDims = TestUtil.nextInt(random(), 1, 5); int numDocs = atLeast(1000); int theEqualDim = random().nextInt(numDims); byte[][][] docValues = new byte[numDocs][][]; for(int docID=0;docID<numDocs;docID++) { byte[][] values = new byte[numDims][]; for(int dim=0;dim<numDims;dim++) { values[dim] = new byte[numBytesPerDim]; random().nextBytes(values[dim]); } docValues[docID] = values; if (docID > 0) { docValues[docID][theEqualDim] = docValues[0][theEqualDim]; } } // Use a small number of points in leaf blocks to trigger a lot of splitting verify(docValues, null, numDims, numBytesPerDim, TestUtil.nextInt(random(), 20, 50)); } // This triggers the logic that makes sure all dimensions get indexed // by looking at how many times each dim has been split public void testOneDimLowCard() throws Exception { int numBytesPerDim = TestUtil.nextInt(random(), 2, 30); int numDims = TestUtil.nextInt(random(), 2, 5); int numDocs = atLeast(10000); int theLowCardDim = random().nextInt(numDims); byte[] value1 = new byte[numBytesPerDim]; random().nextBytes(value1); byte[] value2 = value1.clone(); if (value2[numBytesPerDim-1] == 0 || random().nextBoolean()) { value2[numBytesPerDim-1]++; } else { value2[numBytesPerDim-1]--; } byte[][][] docValues = new byte[numDocs][][]; for(int docID=0;docID<numDocs;docID++) { byte[][] values = new byte[numDims][]; for(int dim=0;dim<numDims;dim++) { if (dim == theLowCardDim) { values[dim] = random().nextBoolean() ? value1 : value2; } else { values[dim] = new byte[numBytesPerDim]; random().nextBytes(values[dim]); } } docValues[docID] = values; } // Use a small number of points in leaf blocks to trigger a lot of splitting verify(docValues, null, numDims, numBytesPerDim, TestUtil.nextInt(random(), 20, 50)); } // this should trigger run-length compression with lengths that are greater than 255 public void testOneDimTwoValues() throws Exception { int numBytesPerDim = TestUtil.nextInt(random(), 2, 30); int numDims = TestUtil.nextInt(random(), 1, 5); int numDocs = atLeast(1000); int theDim = random().nextInt(numDims); byte[] value1 = new byte[numBytesPerDim]; random().nextBytes(value1); byte[] value2 = new byte[numBytesPerDim]; random().nextBytes(value2); byte[][][] docValues = new byte[numDocs][][]; for(int docID=0;docID<numDocs;docID++) { byte[][] values = new byte[numDims][]; for(int dim=0;dim<numDims;dim++) { if (dim == theDim) { values[dim] = random().nextBoolean() ? value1 : value2; } else { values[dim] = new byte[numBytesPerDim]; random().nextBytes(values[dim]); } } docValues[docID] = values; } verify(docValues, null, numDims, numBytesPerDim); } public void testMultiValued() throws Exception { int numBytesPerDim = TestUtil.nextInt(random(), 2, 30); int numDims = TestUtil.nextInt(random(), 1, 5); int numDocs = atLeast(1000); List<byte[][]> docValues = new ArrayList<>(); List<Integer> docIDs = new ArrayList<>(); for(int docID=0;docID<numDocs;docID++) { int numValuesInDoc = TestUtil.nextInt(random(), 1, 5); for(int ord=0;ord<numValuesInDoc;ord++) { docIDs.add(docID); byte[][] values = new byte[numDims][]; for(int dim=0;dim<numDims;dim++) { values[dim] = new byte[numBytesPerDim]; random().nextBytes(values[dim]); } docValues.add(values); } } byte[][][] docValuesArray = docValues.toArray(new byte[docValues.size()][][]); int[] docIDsArray = new int[docIDs.size()]; for(int i=0;i<docIDsArray.length;i++) { docIDsArray[i] = docIDs.get(i); } verify(docValuesArray, docIDsArray, numDims, numBytesPerDim); } /** docIDs can be null, for the single valued case, else it maps value to docID */ private void verify(byte[][][] docValues, int[] docIDs, int numDims, int numBytesPerDim) throws Exception { verify(docValues, docIDs, numDims, numBytesPerDim, TestUtil.nextInt(random(), 50, 1000)); } private void verify(byte[][][] docValues, int[] docIDs, int numDims, int numBytesPerDim, int maxPointsInLeafNode) throws Exception { try (Directory dir = getDirectory(docValues.length)) { double maxMB = (float) 3.0 + (3*random().nextDouble()); verify(dir, docValues, docIDs, numDims, numBytesPerDim, maxPointsInLeafNode, maxMB); } } private void verify(Directory dir, byte[][][] docValues, int[] docIDs, int numDims, int numBytesPerDim, int maxPointsInLeafNode, double maxMB) throws Exception { int numValues = docValues.length; if (VERBOSE) { System.out.println("TEST: numValues=" + numValues + " numDims=" + numDims + " numBytesPerDim=" + numBytesPerDim + " maxPointsInLeafNode=" + maxPointsInLeafNode + " maxMB=" + maxMB); } List<Long> toMerge = null; List<MergeState.DocMap> docMaps = null; int seg = 0; BKDWriter w = new BKDWriter(numValues, dir, "_" + seg, numDims, numBytesPerDim, maxPointsInLeafNode, maxMB, docValues.length, false); IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT); IndexInput in = null; boolean success = false; try { byte[] scratch = new byte[numBytesPerDim*numDims]; int lastDocIDBase = 0; boolean useMerge = numDims == 1 && numValues >= 10 && random().nextBoolean(); int valuesInThisSeg; if (useMerge) { // Sometimes we will call merge with a single segment: valuesInThisSeg = TestUtil.nextInt(random(), numValues/10, numValues); } else { valuesInThisSeg = 0; } int segCount = 0; for(int ord=0;ord<numValues;ord++) { int docID; if (docIDs == null) { docID = ord; } else { docID = docIDs[ord]; } if (VERBOSE) { System.out.println(" ord=" + ord + " docID=" + docID + " lastDocIDBase=" + lastDocIDBase); } for(int dim=0;dim<numDims;dim++) { if (VERBOSE) { System.out.println(" " + dim + " -> " + new BytesRef(docValues[ord][dim])); } System.arraycopy(docValues[ord][dim], 0, scratch, dim*numBytesPerDim, numBytesPerDim); } w.add(scratch, docID-lastDocIDBase); segCount++; if (useMerge && segCount == valuesInThisSeg) { if (toMerge == null) { toMerge = new ArrayList<>(); docMaps = new ArrayList<>(); } final int curDocIDBase = lastDocIDBase; docMaps.add(new MergeState.DocMap() { @Override public int get(int docID) { return curDocIDBase + docID; } }); toMerge.add(w.finish(out)); valuesInThisSeg = TestUtil.nextInt(random(), numValues/10, numValues/2); segCount = 0; seg++; maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 1000); maxMB = (float) 3.0 + (3*random().nextDouble()); w = new BKDWriter(numValues, dir, "_" + seg, numDims, numBytesPerDim, maxPointsInLeafNode, maxMB, docValues.length, false); lastDocIDBase = docID; } } long indexFP; if (toMerge != null) { if (segCount > 0) { toMerge.add(w.finish(out)); final int curDocIDBase = lastDocIDBase; docMaps.add(new MergeState.DocMap() { @Override public int get(int docID) { return curDocIDBase + docID; } }); } out.close(); in = dir.openInput("bkd", IOContext.DEFAULT); seg++; w = new BKDWriter(numValues, dir, "_" + seg, numDims, numBytesPerDim, maxPointsInLeafNode, maxMB, docValues.length, false); List<BKDReader> readers = new ArrayList<>(); for(long fp : toMerge) { in.seek(fp); readers.add(new BKDReader(in)); } out = dir.createOutput("bkd2", IOContext.DEFAULT); indexFP = w.merge(out, docMaps, readers); out.close(); in.close(); in = dir.openInput("bkd2", IOContext.DEFAULT); } else { indexFP = w.finish(out); out.close(); in = dir.openInput("bkd", IOContext.DEFAULT); } in.seek(indexFP); BKDReader r = new BKDReader(in); int iters = atLeast(100); for(int iter=0;iter<iters;iter++) { if (VERBOSE) { System.out.println("\nTEST: iter=" + iter); } // Random N dims rect query: byte[][] queryMin = new byte[numDims][]; byte[][] queryMax = new byte[numDims][]; for(int dim=0;dim<numDims;dim++) { queryMin[dim] = new byte[numBytesPerDim]; random().nextBytes(queryMin[dim]); queryMax[dim] = new byte[numBytesPerDim]; random().nextBytes(queryMax[dim]); if (StringHelper.compare(numBytesPerDim, queryMin[dim], 0, queryMax[dim], 0) > 0) { byte[] x = queryMin[dim]; queryMin[dim] = queryMax[dim]; queryMax[dim] = x; } } final BitSet hits = new BitSet(); r.intersect(new IntersectVisitor() { @Override public void visit(int docID) { hits.set(docID); //System.out.println("visit docID=" + docID); } @Override public void visit(int docID, byte[] packedValue) { //System.out.println("visit check docID=" + docID); for(int dim=0;dim<numDims;dim++) { if (StringHelper.compare(numBytesPerDim, packedValue, dim*numBytesPerDim, queryMin[dim], 0) < 0 || StringHelper.compare(numBytesPerDim, packedValue, dim*numBytesPerDim, queryMax[dim], 0) > 0) { //System.out.println(" no"); return; } } //System.out.println(" yes"); hits.set(docID); } @Override public Relation compare(byte[] minPacked, byte[] maxPacked) { boolean crosses = false; for(int dim=0;dim<numDims;dim++) { if (StringHelper.compare(numBytesPerDim, maxPacked, dim*numBytesPerDim, queryMin[dim], 0) < 0 || StringHelper.compare(numBytesPerDim, minPacked, dim*numBytesPerDim, queryMax[dim], 0) > 0) { return Relation.CELL_OUTSIDE_QUERY; } else if (StringHelper.compare(numBytesPerDim, minPacked, dim*numBytesPerDim, queryMin[dim], 0) < 0 || StringHelper.compare(numBytesPerDim, maxPacked, dim*numBytesPerDim, queryMax[dim], 0) > 0) { crosses = true; } } if (crosses) { return Relation.CELL_CROSSES_QUERY; } else { return Relation.CELL_INSIDE_QUERY; } } }); BitSet expected = new BitSet(); for(int ord=0;ord<numValues;ord++) { boolean matches = true; for(int dim=0;dim<numDims;dim++) { byte[] x = docValues[ord][dim]; if (StringHelper.compare(numBytesPerDim, x, 0, queryMin[dim], 0) < 0 || StringHelper.compare(numBytesPerDim, x, 0, queryMax[dim], 0) > 0) { matches = false; break; } } if (matches) { int docID; if (docIDs == null) { docID = ord; } else { docID = docIDs[ord]; } expected.set(docID); } } int limit = Math.max(expected.length(), hits.length()); for(int docID=0;docID<limit;docID++) { assertEquals("docID=" + docID, expected.get(docID), hits.get(docID)); } } in.close(); dir.deleteFile("bkd"); if (toMerge != null) { dir.deleteFile("bkd2"); } success = true; } finally { if (success == false) { IOUtils.closeWhileHandlingException(w, in, out); IOUtils.deleteFilesIgnoringExceptions(dir, "bkd", "bkd2"); } } } private BigInteger randomBigInt(int numBytes) { BigInteger x = new BigInteger(numBytes*8-1, random()); if (random().nextBoolean()) { x = x.negate(); } return x; } private Directory getDirectory(int numPoints) { Directory dir; if (numPoints > 100000) { dir = newFSDirectory(createTempDir("TestBKDTree")); } else { dir = newDirectory(); } return dir; } /** Make sure corruption on an input sort file is caught, even if BKDWriter doesn't get angry */ public void testBitFlippedOnPartition1() throws Exception { // Generate fixed data set: int numDocs = atLeast(10000); int numBytesPerDim = 4; int numDims = 3; byte[][][] docValues = new byte[numDocs][][]; byte counter = 0; for(int docID=0;docID<numDocs;docID++) { byte[][] values = new byte[numDims][]; for(int dim=0;dim<numDims;dim++) { values[dim] = new byte[numBytesPerDim]; for(int i=0;i<values[dim].length;i++) { values[dim][i] = counter; counter++; } } docValues[docID] = values; } try (Directory dir0 = newMockDirectory()) { Directory dir = new FilterDirectory(dir0) { boolean corrupted; @Override public IndexOutput createTempOutput(String prefix, String suffix, IOContext context) throws IOException { IndexOutput out = in.createTempOutput(prefix, suffix, context); if (corrupted == false && prefix.equals("_0_bkd1") && suffix.equals("sort")) { corrupted = true; return new CorruptingIndexOutput(dir0, 22, out); } else { return out; } } }; CorruptIndexException e = expectThrows(CorruptIndexException.class, () -> { verify(dir, docValues, null, numDims, numBytesPerDim, 50, 0.1); }); assertTrue(e.getMessage().contains("checksum failed (hardware problem?)")); } } /** Make sure corruption on a recursed partition is caught, when BKDWriter does get angry */ public void testBitFlippedOnPartition2() throws Exception { // Generate fixed data set: int numDocs = atLeast(10000); int numBytesPerDim = 4; int numDims = 3; byte[][][] docValues = new byte[numDocs][][]; byte counter = 0; for(int docID=0;docID<numDocs;docID++) { byte[][] values = new byte[numDims][]; for(int dim=0;dim<numDims;dim++) { values[dim] = new byte[numBytesPerDim]; for(int i=0;i<values[dim].length;i++) { values[dim][i] = counter; counter++; } } docValues[docID] = values; } try (Directory dir0 = newMockDirectory()) { Directory dir = new FilterDirectory(dir0) { boolean corrupted; @Override public IndexOutput createTempOutput(String prefix, String suffix, IOContext context) throws IOException { IndexOutput out = in.createTempOutput(prefix, suffix, context); //System.out.println("prefix=" + prefix + " suffix=" + suffix); if (corrupted == false && suffix.equals("bkd_left1")) { //System.out.println("now corrupt byte=" + x + " prefix=" + prefix + " suffix=" + suffix); corrupted = true; return new CorruptingIndexOutput(dir0, 22072, out); } else { return out; } } }; Throwable t = expectThrows(CorruptIndexException.class, () -> { verify(dir, docValues, null, numDims, numBytesPerDim, 50, 0.1); }); assertCorruptionDetected(t); } } private void assertCorruptionDetected(Throwable t) { if (t instanceof CorruptIndexException) { if (t.getMessage().contains("checksum failed (hardware problem?)")) { return; } } for(Throwable suppressed : t.getSuppressed()) { if (suppressed instanceof CorruptIndexException) { if (suppressed.getMessage().contains("checksum failed (hardware problem?)")) { return; } } } fail("did not see a suppressed CorruptIndexException"); } public void testTieBreakOrder() throws Exception { try (Directory dir = newDirectory()) { int numDocs = 10000; BKDWriter w = new BKDWriter(numDocs+1, dir, "tmp", 1, Integer.BYTES, 2, 0.01f, numDocs, true); for(int i=0;i<numDocs;i++) { w.add(new byte[Integer.BYTES], i); } IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT); long fp = w.finish(out); out.close(); IndexInput in = dir.openInput("bkd", IOContext.DEFAULT); in.seek(fp); BKDReader r = new BKDReader(in); r.intersect(new IntersectVisitor() { int lastDocID = -1; @Override public void visit(int docID) { assertTrue("lastDocID=" + lastDocID + " docID=" + docID, docID > lastDocID); lastDocID = docID; } @Override public void visit(int docID, byte[] packedValue) { visit(docID); } @Override public Relation compare(byte[] minPacked, byte[] maxPacked) { return Relation.CELL_CROSSES_QUERY; } }); in.close(); } } public void test2DLongOrdsOffline() throws Exception { try (Directory dir = newDirectory()) { int numDocs = 100000; boolean singleValuePerDoc = false; boolean longOrds = true; int offlineSorterMaxTempFiles = TestUtil.nextInt(random(), 2, 20); BKDWriter w = new BKDWriter(numDocs+1, dir, "tmp", 2, Integer.BYTES, 2, 0.01f, numDocs, singleValuePerDoc, longOrds, 1, offlineSorterMaxTempFiles); byte[] buffer = new byte[2*Integer.BYTES]; for(int i=0;i<numDocs;i++) { random().nextBytes(buffer); w.add(buffer, i); } IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT); long fp = w.finish(out); out.close(); IndexInput in = dir.openInput("bkd", IOContext.DEFAULT); in.seek(fp); BKDReader r = new BKDReader(in); int[] count = new int[1]; r.intersect(new IntersectVisitor() { @Override public void visit(int docID) { count[0]++; } @Override public void visit(int docID, byte[] packedValue) { visit(docID); } @Override public Relation compare(byte[] minPacked, byte[] maxPacked) { if (random().nextInt(7) == 1) { return Relation.CELL_CROSSES_QUERY; } else { return Relation.CELL_INSIDE_QUERY; } } }); assertEquals(numDocs, count[0]); in.close(); } } // Claims 16 bytes per dim, but only use the bottom N 1-3 bytes; this would happen e.g. if a user indexes what are actually just short // values as a LongPoint: public void testWastedLeadingBytes() throws Exception { int numDims = TestUtil.nextInt(random(), 1, PointValues.MAX_DIMENSIONS); int bytesPerDim = PointValues.MAX_NUM_BYTES; int bytesUsed = TestUtil.nextInt(random(), 1, 3); Directory dir = newFSDirectory(createTempDir()); int numDocs = 100000; BKDWriter w = new BKDWriter(numDocs+1, dir, "tmp", numDims, bytesPerDim, 32, 1f, numDocs, true); byte[] tmp = new byte[bytesUsed]; byte[] buffer = new byte[numDims * bytesPerDim]; for(int i=0;i<numDocs;i++) { for(int dim=0;dim<numDims;dim++) { random().nextBytes(tmp); System.arraycopy(tmp, 0, buffer, dim*bytesPerDim+(bytesPerDim-bytesUsed), tmp.length); } w.add(buffer, i); } IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT); long fp = w.finish(out); out.close(); IndexInput in = dir.openInput("bkd", IOContext.DEFAULT); in.seek(fp); BKDReader r = new BKDReader(in); int[] count = new int[1]; r.intersect(new IntersectVisitor() { @Override public void visit(int docID) { count[0]++; } @Override public void visit(int docID, byte[] packedValue) { visit(docID); } @Override public Relation compare(byte[] minPacked, byte[] maxPacked) { if (random().nextInt(7) == 1) { return Relation.CELL_CROSSES_QUERY; } else { return Relation.CELL_INSIDE_QUERY; } } }); assertEquals(numDocs, count[0]); in.close(); dir.close(); } public void testEstimatePointCount() throws IOException { Directory dir = newDirectory(); final int numValues = atLeast(10000); // make sure to have multiple leaves final int maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 500); final int numBytesPerDim = TestUtil.nextInt(random(), 1, 4); final byte[] pointValue = new byte[numBytesPerDim]; final byte[] uniquePointValue = new byte[numBytesPerDim]; random().nextBytes(uniquePointValue); BKDWriter w = new BKDWriter(numValues, dir, "_temp", 1, numBytesPerDim, maxPointsInLeafNode, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP, numValues, true); for (int i = 0; i < numValues; ++i) { if (i == numValues / 2) { w.add(uniquePointValue, i); } else { do { random().nextBytes(pointValue); } while (Arrays.equals(pointValue, uniquePointValue)); w.add(pointValue, i); } } final long indexFP; try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) { indexFP = w.finish(out); w.close(); } IndexInput pointsIn = dir.openInput("bkd", IOContext.DEFAULT); pointsIn.seek(indexFP); BKDReader points = new BKDReader(pointsIn); int actualMaxPointsInLeafNode = numValues; while (actualMaxPointsInLeafNode > maxPointsInLeafNode) { actualMaxPointsInLeafNode = (actualMaxPointsInLeafNode + 1) / 2; } // If all points match, then the point count is numLeaves * maxPointsInLeafNode final int numLeaves = Integer.highestOneBit((numValues - 1) / actualMaxPointsInLeafNode) << 1; assertEquals(numLeaves * actualMaxPointsInLeafNode, points.estimatePointCount(new IntersectVisitor() { @Override public void visit(int docID, byte[] packedValue) throws IOException {} @Override public void visit(int docID) throws IOException {} @Override public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { return Relation.CELL_INSIDE_QUERY; } })); // Return 0 if no points match assertEquals(0, points.estimatePointCount(new IntersectVisitor() { @Override public void visit(int docID, byte[] packedValue) throws IOException {} @Override public void visit(int docID) throws IOException {} @Override public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { return Relation.CELL_OUTSIDE_QUERY; } })); // If only one point matches, then the point count is (actualMaxPointsInLeafNode + 1) / 2 // in general, or maybe 2x that if the point is a split value final long pointCount = points.estimatePointCount(new IntersectVisitor() { @Override public void visit(int docID, byte[] packedValue) throws IOException {} @Override public void visit(int docID) throws IOException {} @Override public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { if (StringHelper.compare(numBytesPerDim, uniquePointValue, 0, maxPackedValue, 0) > 0 || StringHelper.compare(numBytesPerDim, uniquePointValue, 0, minPackedValue, 0) < 0) { return Relation.CELL_OUTSIDE_QUERY; } return Relation.CELL_CROSSES_QUERY; } }); assertTrue(""+pointCount, pointCount == (actualMaxPointsInLeafNode + 1) / 2 || // common case pointCount == 2*((actualMaxPointsInLeafNode + 1) / 2)); // if the point is a split value pointsIn.close(); dir.close(); } }