KeyHistogram.java example

Explorer
persistit-master
/**
 * Copyright 2011-2012 Akiban Technologies, Inc.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.persistit;

import java.util.ArrayList;
import java.util.List;

/**
 * <p>
 * Accumulate and hold information about the distribution of {@link Key} objects
 * in a Persistit {@link Tree}. This class is used by the
 * {@link Exchange#computeHistogram(Key, Key, int, int, KeyFilter, int)} method
 * to accumulate and return the result of scanning all the keys at a fixed depth
 * within a Tree.
 * <p>
 * </p>
 * The result is represented by a List of {@link KeyCount} objects, each
 * representing a key and a count. The count represents the number of smaller
 * keys in the tree level. With this information client applications can
 * estimate the number of elements between any two keys in the Tree.
 * <p>
 * </p>
 * Application code can specify a <code>keyDepth</code> at which sibling keys
 * are grouped together. For example, suppose a Tree contains keys such as
 * 
 * <pre>
 *   {"BLUE",1}
 *   {"BLUE",2}
 *   {"BLUE",3}
 *   {"RED",1}
 *   {"RED",2}
 * </pre>
 * 
 * If <code>keyDepth=2</code> the resulting histogram will have 5 buckets, each
 * with a <code>count</code> value of 1.  But if <code>keyDepth=1</code> the
 * result will have two buckets, one each for "BLUE" and "RED". Specifying
 * <code>keyDepth=0</code> turns off aggregation by partial key depth.
 * <p>
 * </p>
 * During the aggregation process the {@link Exchange#computeHistogram} method
 * invokes the {@link #addKeyCopy} method for each Key it traverses. Each key is
 * analyzed to determine whether the first <code>keyDepth</code> segments are
 * the same as the previously added Key. If so then the previous count is
 * incremented; otherwise a new KeyCount entry is added to the sample list.
 * <p>
 * </p>
 * 
 * 
 * @author peter
 * 
 */
public class KeyHistogram {

    /**
     * The sample list is allowed to grow to this multiple of the requested
     * sample size before {@link #addKeyCopy} thins it.
     */
    private static final int OVERSAMPLE_FACTOR = 16;

    private final Tree _tree;

    private final Key _startKey;

    private final Key _endKey;

    private final int _requestedSampleSize;

    private final int _treeDepth;

    private final int _keyDepth;

    private final List<KeyCount> _keys = new ArrayList<KeyCount>();

    /**
     * Only every <code>_modulus</code>-th key passed to {@link #addKeyCopy} is
     * considered for the sample list. Doubles each time the list is thinned.
     */
    private int _modulus = 1;

    private int _keyCount = 0;

    private int _pageCount = 0;

    private long _pageBytesTotal = 0;

    private long _pageBytesInUse = 0;

    /**
     * Element in a <code>KeyHistogram</code> that denotes the estimated number
     * of keys in a histogram bucket. This class has two fields representing a
     * key and a count. The count is the value of the enclosing histogram's
     * running key counter at the time the sample was last recorded.
     */
    public static class KeyCount {

        final byte[] _bytes;

        int _count;

        private KeyCount(final byte[] bytes, final int count) {
            _bytes = bytes;
            _count = count;
        }

        /**
         * Get the key bytes
         * 
         * @return the bytes of the key. This is the internal array, not a
         *         copy.
         */
        public byte[] getBytes() {
            return _bytes;
        }

        /**
         * Get the count
         * 
         * @return the count
         */
        public int getCount() {
            return _count;
        }

        private void setCount(final int count) {
            this._count = count;
        }

        /**
         * Formats this sample as the count (right-aligned, with grouping
         * separators) followed by the string form of a <code>Key</code> loaded
         * with the sampled bytes.
         */
        @Override
        public String toString() {
            final Key key = new Key((Persistit) null);
            System.arraycopy(_bytes, 0, key.getEncodedBytes(), 0, _bytes.length);
            key.setEncodedSize(_bytes.length);
            return String.format("%,10d %s", _count, key);
        }
    }

    /**
     * Construct an empty histogram.
     * 
     * @param tree
     *            value returned by {@link #getTree()}
     * @param start
     *            value returned by {@link #getStartKey()}
     * @param end
     *            value returned by {@link #getEndKey()}
     * @param sampleSize
     *            number of samples {@link #cull()} reduces the sample list to.
     *            Values less than 1 are accepted; while accumulating they are
     *            treated as 1, and <code>cull</code> then removes all samples.
     * @param keyDepth
     *            depth passed to <code>Key#indexTo</code> to find the length
     *            of the key prefix that identifies a bucket; 0 means the
     *            entire encoded key is used
     * @param treeDepth
     *            value returned by {@link #getTreeDepth()}
     */
    public KeyHistogram(final Tree tree, final Key start, final Key end, final int sampleSize, final int keyDepth,
            final int treeDepth) {
        _tree = tree;
        _startKey = start;
        _endKey = end;
        _requestedSampleSize = sampleSize;
        _keyDepth = keyDepth;
        _treeDepth = treeDepth;
    }

    /**
     * @return the Tree supplied to the constructor
     */
    public Tree getTree() {
        return _tree;
    }

    /**
     * @return the start Key supplied to the constructor
     */
    public Key getStartKey() {
        return _startKey;
    }

    /**
     * @return the end Key supplied to the constructor
     */
    public Key getEndKey() {
        return _endKey;
    }

    /**
     * @return the number of times {@link #addKeyCopy} has been called
     */
    public int getKeyCount() {
        return _keyCount;
    }

    /**
     * @return the sample size supplied to the constructor
     */
    public int getRequestedSampleSize() {
        return _requestedSampleSize;
    }

    /**
     * @return the number of samples currently held
     */
    public int getSampleSize() {
        return _keys.size();
    }

    /**
     * @return the live, mutable list of samples held by this histogram (not a
     *         copy)
     */
    public List<KeyCount> getSamples() {
        return _keys;
    }

    /**
     * @return the tree depth supplied to the constructor
     */
    public int getTreeDepth() {
        return _treeDepth;
    }

    /**
     * @return the key depth supplied to the constructor
     */
    public int getKeyDepth() {
        return _keyDepth;
    }

    /**
     * @return the number of times {@link #addPage} has been called
     */
    public int getPageCount() {
        return _pageCount;
    }

    /**
     * @return the sum of the <code>size</code> values passed to
     *         {@link #addPage}
     */
    public long getPageBytesTotal() {
        return _pageBytesTotal;
    }

    /**
     * @return the sum of the <code>used</code> values passed to
     *         {@link #addPage}
     */
    public long getPageBytesInUse() {
        return _pageBytesInUse;
    }

    /**
     * Add a key. Keys must be added in key-sort order. If the supplied key is
     * the same as the previously added key up to the segment specified by the
     * keyDepth property, then the existing KeyCount bucket is updated with the
     * current running key count. Otherwise a new KeyCount bucket holding a copy
     * of the key prefix is added. When the sample list becomes too long, this
     * method removes every other sample and doubles the sampling interval. By
     * so doing, this method keeps the number of retained samples relatively
     * small.
     * 
     * @param key
     *            The Key to add to the sample set
     */
    void addKeyCopy(final Key key) {
        _keyCount++;
        if (_keyCount % _modulus == 0) {
            final int length = _keyDepth == 0 ? key.getEncodedSize() : key.indexTo(_keyDepth).getIndex();
            final byte[] encoded = key.getEncodedBytes();
            final int end = _keys.size() - 1;
            boolean same = false;
            if (end >= 0) {
                final byte[] last = _keys.get(end).getBytes();
                same = last.length == length;
                for (int index = 0; same && index < length; index++) {
                    same = last[index] == encoded[index];
                }
            }
            if (same) {
                _keys.get(end).setCount(_keyCount);
            } else {
                final byte[] bytes = new byte[length];
                System.arraycopy(encoded, 0, bytes, 0, length);
                _keys.add(new KeyCount(bytes, _keyCount));
            }
        }
        // Compute the limit in long arithmetic and never let it drop below
        // OVERSAMPLE_FACTOR. With an int product, a huge or non-positive
        // requested size yields a limit <= 0, the list is thinned on every
        // call, and _modulus doubles until it wraps to zero, at which point the
        // remainder operation above throws ArithmeticException.
        final long limit = (long) OVERSAMPLE_FACTOR * Math.max(1, _requestedSampleSize);
        if (_keys.size() >= limit) {
            thinSamples();
            // Stop doubling before the int would overflow.
            if (_modulus <= Integer.MAX_VALUE / 2) {
                _modulus *= 2;
            }
        }
    }

    /**
     * Removes the samples at even indexes, except that the final sample is
     * always retained (when the list size is odd the last sample sits at an
     * even index and is skipped by the loop's starting point).
     */
    private void thinSamples() {
        for (int index = (_keys.size() & 0x7FFFFFFE); (index -= 2) >= 0;) {
            _keys.remove(index);
        }
    }

    /**
     * Accumulates total number of pages, bytes and bytes-in-use traversed.
     * 
     * @param size
     *            Size of the page
     * @param used
     *            Number of bytes in use in the page.
     */
    void addPage(final int size, final int used) {
        _pageCount++;
        _pageBytesTotal += size;
        _pageBytesInUse += used;
    }

    /**
     * Culls the List of keys down to the requested sample size. Samples are
     * dropped at evenly spaced positions, walking from the last sample (which
     * is always kept when at least one sample is wanted) toward the first. A
     * requested size less than 1 removes every sample.
     */
    void cull() {
        final int have = _keys.size();
        // A negative request is treated as zero so the final adjustment below
        // can never call remove(0) on an empty list.
        final int want = Math.max(0, _requestedSampleSize);
        if (want >= have) {
            // Nothing to remove; the loop below would keep every element.
            return;
        }
        // long accumulator: have + want can exceed Integer.MAX_VALUE when a
        // very large sample size is requested.
        long counter = have;
        for (int index = have; --index >= 0;) {
            counter += want;
            if (counter <= have) {
                _keys.remove(index);
            } else {
                counter -= have;
            }
        }
        // Rounding in the spacing above can leave one sample too many.
        if (_keys.size() > want) {
            _keys.remove(0);
        }
    }
}