package org.apache.lucene.index;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;

import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;
import java.util.NoSuchElementException;

import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.packed.AppendingDeltaPackedLongBuffer;
import org.apache.lucene.util.packed.AppendingPackedLongBuffer;
import org.apache.lucene.util.packed.PackedInts;

/** Buffers up pending byte[] values per doc, deduplicating and sorting them
 *  via int ords, then flushes them when the segment flushes. */
class SortedSetDocValuesWriter extends DocValuesWriter {
  final BytesRefHash hash;
  private AppendingPackedLongBuffer pending;            // stream of all termIDs
  private AppendingDeltaPackedLongBuffer pendingCounts; // termIDs per doc
  private final Counter iwBytesUsed;
  private long bytesUsed; // this only tracks differences in 'pending' and 'pendingCounts'
  private final FieldInfo fieldInfo;
  private int currentDoc;
  private int[] currentValues = new int[8];
  private int currentUpto = 0;
  private int maxCount = 0;

  public SortedSetDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
    this.fieldInfo = fieldInfo;
    this.iwBytesUsed = iwBytesUsed;
    hash = new BytesRefHash(
        new ByteBlockPool(new ByteBlockPool.DirectTrackingAllocator(iwBytesUsed)),
        BytesRefHash.DEFAULT_CAPACITY,
        new DirectBytesStartArray(BytesRefHash.DEFAULT_CAPACITY, iwBytesUsed));
    pending = new AppendingPackedLongBuffer(PackedInts.COMPACT);
    pendingCounts = new AppendingDeltaPackedLongBuffer(PackedInts.COMPACT);
    bytesUsed = pending.ramBytesUsed() + pendingCounts.ramBytesUsed();
    iwBytesUsed.addAndGet(bytesUsed);
  }

  public void addValue(int docID, BytesRef value) {
    if (value == null) {
      throw new IllegalArgumentException("field \"" + fieldInfo.name + "\": null value not allowed");
    }
    if (value.length > (BYTE_BLOCK_SIZE - 2)) {
      throw new IllegalArgumentException("DocValuesField \"" + fieldInfo.name + "\" is too large, must be <= " + (BYTE_BLOCK_SIZE - 2));
    }

    if (docID != currentDoc) {
      finishCurrentDoc();
    }

    // Fill in any holes:
    while (currentDoc < docID) {
      pendingCounts.add(0); // no values
      currentDoc++;
    }

    addOneValue(value);
    updateBytesUsed();
  }
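  // Usage sketch (illustrative only; the real call sites live in the indexing
  // chain, not in this file): values arrive once per (doc, value) pair, in
  // non-decreasing docID order, e.g. for a hypothetical writer instance:
  //
  //   writer.addValue(0, new BytesRef("apple"));
  //   writer.addValue(0, new BytesRef("banana"));
  //   writer.addValue(2, new BytesRef("apple"));  // doc 1 gets a zero count
  //
  // finish(maxDoc) then pads any trailing value-less docs, and flush(...)
  // hands the sorted, deduplicated ords to the codec's DocValuesConsumer.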
  // finalize currentDoc: deduplicate the term ids collected for it
  private void finishCurrentDoc() {
    Arrays.sort(currentValues, 0, currentUpto);
    int lastValue = -1;
    int count = 0;
    for (int i = 0; i < currentUpto; i++) {
      int termID = currentValues[i];
      // if it's not a duplicate
      if (termID != lastValue) {
        pending.add(termID); // record the term id
        count++;
      }
      lastValue = termID;
    }
    // record the number of unique term ids for this doc
    pendingCounts.add(count);
    maxCount = Math.max(maxCount, count);
    currentUpto = 0;
    currentDoc++;
  }

  @Override
  public void finish(int maxDoc) {
    finishCurrentDoc();

    // fill in any holes
    for (int i = currentDoc; i < maxDoc; i++) {
      pendingCounts.add(0); // no values
    }
  }

  private void addOneValue(BytesRef value) {
    int termID = hash.add(value);
    if (termID < 0) {
      termID = -termID - 1;
    } else {
      // reserve additional space for each unique value:
      // 1. when indexing, when hash is 50% full, rehash() suddenly needs 2*size ints.
      //    TODO: can this same OOM happen in THPF?
      // 2. when flushing, we need 1 int per value (slot in the ordMap).
      iwBytesUsed.addAndGet(2 * RamUsageEstimator.NUM_BYTES_INT);
    }

    if (currentUpto == currentValues.length) {
      currentValues = ArrayUtil.grow(currentValues, currentValues.length + 1);
      // reserve additional space for max # values per-doc
      // when flushing, we need an int[] to sort the mapped-ords within the doc
      iwBytesUsed.addAndGet((currentValues.length - currentUpto) * 2 * RamUsageEstimator.NUM_BYTES_INT);
    }

    currentValues[currentUpto] = termID;
    currentUpto++;
  }

  private void updateBytesUsed() {
    final long newBytesUsed = pending.ramBytesUsed() + pendingCounts.ramBytesUsed();
    iwBytesUsed.addAndGet(newBytesUsed - bytesUsed);
    bytesUsed = newBytesUsed;
  }

  @Override
  public void flush(SegmentWriteState state, DocValuesConsumer dvConsumer) throws IOException {
    final int maxDoc = state.segmentInfo.getDocCount();
    final int maxCountPerDoc = maxCount;
    assert pendingCounts.size() == maxDoc;
    final int valueCount = hash.size();

    final int[] sortedValues = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
    final int[] ordMap = new int[valueCount];

    for (int ord = 0; ord < valueCount; ord++) {
      ordMap[sortedValues[ord]] = ord;
    }

    dvConsumer.addSortedSetField(fieldInfo,

                                 // ord -> value
                                 new Iterable<BytesRef>() {
                                   @Override
                                   public Iterator<BytesRef> iterator() {
                                     return new ValuesIterator(sortedValues, valueCount);
                                   }
                                 },

                                 // doc -> ordCount
                                 new Iterable<Number>() {
                                   @Override
                                   public Iterator<Number> iterator() {
                                     return new OrdCountIterator(maxDoc);
                                   }
                                 },

                                 // ords
                                 new Iterable<Number>() {
                                   @Override
                                   public Iterator<Number> iterator() {
                                     return new OrdsIterator(ordMap, maxCountPerDoc);
                                   }
                                 });
  }

  @Override
  public void abort() {
  }

  // iterates over the unique values we have in ram
  private class ValuesIterator implements Iterator<BytesRef> {
    final int[] sortedValues;
    final BytesRef scratch = new BytesRef();
    final int valueCount;
    int ordUpto;

    ValuesIterator(int[] sortedValues, int valueCount) {
      this.sortedValues = sortedValues;
      this.valueCount = valueCount;
    }

    @Override
    public boolean hasNext() {
      return ordUpto < valueCount;
    }

    @Override
    public BytesRef next() {
      if (!hasNext()) {
        throw new NoSuchElementException();
      }
      hash.get(sortedValues[ordUpto], scratch);
      ordUpto++;
      return scratch;
    }

    @Override
    public void remove() {
      throw new UnsupportedOperationException();
    }
  }
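  // Worked example (illustrative) of the ordMap remapping built in flush() and
  // consumed by OrdsIterator below: suppose the hash assigned termIDs in
  // arrival order as
  //   termID 0 -> "zebra", termID 1 -> "apple", termID 2 -> "mango"
  // Sorting in unicode order gives sortedValues = [1, 2, 0] (apple, mango,
  // zebra), so ordMap = [2, 0, 1], i.e. ordMap[termID] is the term's sorted
  // ord. OrdsIterator replays each doc's termIDs through ordMap and re-sorts
  // the mapped ords within the doc, because per-doc sorted termIDs are not
  // necessarily sorted ords.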
  // iterates over the ords for each doc we have in ram
  private class OrdsIterator implements Iterator<Number> {
    final AppendingPackedLongBuffer.Iterator iter = pending.iterator();
    final AppendingDeltaPackedLongBuffer.Iterator counts = pendingCounts.iterator();
    final int[] ordMap;
    final long numOrds;
    long ordUpto;

    final int[] currentDoc;
    int currentUpto;
    int currentLength;

    OrdsIterator(int[] ordMap, int maxCount) {
      this.currentDoc = new int[maxCount];
      this.ordMap = ordMap;
      this.numOrds = pending.size();
    }

    @Override
    public boolean hasNext() {
      return ordUpto < numOrds;
    }

    @Override
    public Number next() {
      if (!hasNext()) {
        throw new NoSuchElementException();
      }
      while (currentUpto == currentLength) {
        // refill next doc, and sort remapped ords within the doc.
        currentUpto = 0;
        currentLength = (int) counts.next();
        for (int i = 0; i < currentLength; i++) {
          currentDoc[i] = ordMap[(int) iter.next()];
        }
        Arrays.sort(currentDoc, 0, currentLength);
      }
      int ord = currentDoc[currentUpto];
      currentUpto++;
      ordUpto++;
      // TODO: make reusable Number
      return ord;
    }

    @Override
    public void remove() {
      throw new UnsupportedOperationException();
    }
  }

  private class OrdCountIterator implements Iterator<Number> {
    final AppendingDeltaPackedLongBuffer.Iterator iter = pendingCounts.iterator();
    final int maxDoc;
    int docUpto;

    OrdCountIterator(int maxDoc) {
      this.maxDoc = maxDoc;
      assert pendingCounts.size() == maxDoc;
    }

    @Override
    public boolean hasNext() {
      return docUpto < maxDoc;
    }

    @Override
    public Number next() {
      if (!hasNext()) {
        throw new NoSuchElementException();
      }
      docUpto++;
      // TODO: make reusable Number
      return iter.next();
    }

    @Override
    public void remove() {
      throw new UnsupportedOperationException();
    }
  }
}