/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.util; import java.util.Arrays; /** Radix sorter for variable-length strings. This class sorts based on the most * significant byte first and falls back to {@link IntroSorter} when the size * of the buckets to sort becomes small. It is <b>NOT</b> stable. * Worst-case memory usage is about {@code 2.3 KB}. * @lucene.internal */ public abstract class MSBRadixSorter extends Sorter { // after that many levels of recursion we fall back to introsort anyway // this is used as a protection against the fact that radix sort performs // worse when there are long common prefixes (probably because of cache // locality) private static final int LEVEL_THRESHOLD = 8; // size of histograms: 256 + 1 to indicate that the string is finished private static final int HISTOGRAM_SIZE = 257; // buckets below this size will be sorted with introsort private static final int LENGTH_THRESHOLD = 100; // we store one histogram per recursion level private final int[][] histograms = new int[LEVEL_THRESHOLD][]; private final int[] endOffsets = new int[HISTOGRAM_SIZE]; private final int[] commonPrefix; private final int maxLength; /** * Sole constructor. * @param maxLength the maximum length of keys, pass {@link Integer#MAX_VALUE} if unknown. */ protected MSBRadixSorter(int maxLength) { this.maxLength = maxLength; this.commonPrefix = new int[Math.min(24, maxLength)]; } /** Return the k-th byte of the entry at index {@code i}, or {@code -1} if * its length is less than or equal to {@code k}. This may only be called * with a value of {@code i} between {@code 0} included and * {@code maxLength} excluded. */ protected abstract int byteAt(int i, int k); /** Get a fall-back sorter which may assume that the first k bytes of all compared strings are equal. */ protected Sorter getFallbackSorter(int k) { return new IntroSorter() { @Override protected void swap(int i, int j) { MSBRadixSorter.this.swap(i, j); } @Override protected int compare(int i, int j) { for (int o = k; o < maxLength; ++o) { final int b1 = byteAt(i, o); final int b2 = byteAt(j, o); if (b1 != b2) { return b1 - b2; } else if (b1 == -1) { break; } } return 0; } @Override protected void setPivot(int i) { pivot.setLength(0); for (int o = k; o < maxLength; ++o) { final int b = byteAt(i, o); if (b == -1) { break; } pivot.append((byte) b); } } @Override protected int comparePivot(int j) { for (int o = 0; o < pivot.length(); ++o) { final int b1 = pivot.byteAt(o) & 0xff; final int b2 = byteAt(j, k + o); if (b1 != b2) { return b1 - b2; } } if (k + pivot.length() == maxLength) { return 0; } return -1 - byteAt(j, k + pivot.length()); } private final BytesRefBuilder pivot = new BytesRefBuilder(); }; } @Override protected final int compare(int i, int j) { throw new UnsupportedOperationException("unused: not a comparison-based sort"); } @Override public void sort(int from, int to) { checkRange(from, to); sort(from, to, 0, 0); } private void sort(int from, int to, int k, int l) { if (to - from <= LENGTH_THRESHOLD || l >= LEVEL_THRESHOLD) { introSort(from, to, k); } else { radixSort(from, to, k, l); } } private void introSort(int from, int to, int k) { getFallbackSorter(k).sort(from, to); } /** * @param k the character number to compare * @param l the level of recursion */ private void radixSort(int from, int to, int k, int l) { int[] histogram = histograms[l]; if (histogram == null) { histogram = histograms[l] = new int[HISTOGRAM_SIZE]; } else { Arrays.fill(histogram, 0); } final int commonPrefixLength = computeCommonPrefixLengthAndBuildHistogram(from, to, k, histogram); if (commonPrefixLength > 0) { // if there are no more chars to compare or if all entries fell into the // first bucket (which means strings are shorter than k) then we are done // otherwise recurse if (k + commonPrefixLength < maxLength && histogram[0] < to - from) { radixSort(from, to, k + commonPrefixLength, l); } return; } assert assertHistogram(commonPrefixLength, histogram); int[] startOffsets = histogram; int[] endOffsets = this.endOffsets; sumHistogram(histogram, endOffsets); reorder(from, to, startOffsets, endOffsets, k); endOffsets = startOffsets; if (k + 1 < maxLength) { // recurse on all but the first bucket since all keys are equals in this // bucket (we already compared all bytes) for (int prev = endOffsets[0], i = 1; i < HISTOGRAM_SIZE; ++i) { int h = endOffsets[i]; final int bucketLen = h - prev; if (bucketLen > 1) { sort(from + prev, from + h, k + 1, l + 1); } prev = h; } } } // only used from assert private boolean assertHistogram(int commonPrefixLength, int[] histogram) { int numberOfUniqueBytes = 0; for (int freq : histogram) { if (freq > 0) { numberOfUniqueBytes++; } } if (numberOfUniqueBytes == 1) { assert commonPrefixLength >= 1; } else { assert commonPrefixLength == 0 : commonPrefixLength; } return true; } /** Return a number for the k-th character between 0 and {@link #HISTOGRAM_SIZE}. */ private int getBucket(int i, int k) { return byteAt(i, k) + 1; } /** Build a histogram of the number of values per {@link #getBucket(int, int) bucket} * and return a common prefix length for all visited values. * @see #buildHistogram */ private int computeCommonPrefixLengthAndBuildHistogram(int from, int to, int k, int[] histogram) { final int[] commonPrefix = this.commonPrefix; int commonPrefixLength = Math.min(commonPrefix.length, maxLength - k); for (int j = 0; j < commonPrefixLength; ++j) { final int b = byteAt(from, k + j); commonPrefix[j] = b; if (b == -1) { commonPrefixLength = j + 1; break; } } int i; outer: for (i = from + 1; i < to; ++i) { for (int j = 0; j < commonPrefixLength; ++j) { final int b = byteAt(i, k + j); if (b != commonPrefix[j]) { commonPrefixLength = j; if (commonPrefixLength == 0) { // we have no common prefix histogram[commonPrefix[0] + 1] = i - from; histogram[b + 1] = 1; break outer; } break; } } } if (i < to) { // the loop got broken because there is no common prefix assert commonPrefixLength == 0; buildHistogram(i + 1, to, k, histogram); } else { assert commonPrefixLength > 0; histogram[commonPrefix[0] + 1] = to - from; } return commonPrefixLength; } /** Build an histogram of the k-th characters of values occurring between * offsets {@code from} and {@code to}, using {@link #getBucket}. */ private void buildHistogram(int from, int to, int k, int[] histogram) { for (int i = from; i < to; ++i) { histogram[getBucket(i, k)]++; } } /** Accumulate values of the histogram so that it does not store counts but * start offsets. {@code endOffsets} will store the end offsets. */ private static void sumHistogram(int[] histogram, int[] endOffsets) { int accum = 0; for (int i = 0; i < HISTOGRAM_SIZE; ++i) { final int count = histogram[i]; histogram[i] = accum; accum += count; endOffsets[i] = accum; } } /** * Reorder based on start/end offsets for each bucket. When this method * returns, startOffsets and endOffsets are equal. * @param startOffsets start offsets per bucket * @param endOffsets end offsets per bucket */ private void reorder(int from, int to, int[] startOffsets, int[] endOffsets, int k) { // reorder in place, like the dutch flag problem for (int i = 0; i < HISTOGRAM_SIZE; ++i) { final int limit = endOffsets[i]; for (int h1 = startOffsets[i]; h1 < limit; h1 = startOffsets[i]) { final int b = getBucket(from + h1, k); final int h2 = startOffsets[b]++; swap(from + h1, from + h2); } } } }