/*
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
/*
 * Created on Mar 22, 2009
 */

package com.bigdata.counters.store;

import java.util.Arrays;
import java.util.Iterator;
import java.util.UUID;
import java.util.Vector;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;

import com.bigdata.btree.BTree;
import com.bigdata.btree.Checkpoint;
import com.bigdata.btree.DefaultTupleSerializer;
import com.bigdata.btree.IIndex;
import com.bigdata.btree.IRangeQuery;
import com.bigdata.btree.ITuple;
import com.bigdata.btree.ITupleIterator;
import com.bigdata.btree.IndexMetadata;
import com.bigdata.btree.keys.ASCIIKeyBuilderFactory;
import com.bigdata.btree.keys.IKeyBuilder;
import com.bigdata.btree.keys.IKeyBuilderFactory;
import com.bigdata.btree.keys.KVO;
import com.bigdata.btree.keys.KeyBuilder;
import com.bigdata.counters.CounterSet;
import com.bigdata.counters.DefaultInstrumentFactory;
import com.bigdata.counters.History;
import com.bigdata.counters.HistoryInstrument;
import com.bigdata.counters.ICounter;
import com.bigdata.counters.ICounterNode;
import com.bigdata.counters.ICounterSet;
import com.bigdata.counters.IHistoryEntry;
import com.bigdata.counters.IInstrument;
import com.bigdata.counters.PeriodEnum;
import com.bigdata.counters.History.SampleIterator;
import com.bigdata.counters.ICounterSet.IInstrumentFactory;
import com.bigdata.io.SerializerUtil;
import com.bigdata.rawstore.IRawStore;
import com.bigdata.sparse.SparseRowStore;
import com.bigdata.util.Bytes;

/**
 * An API for writing and querying counter sets. The data are written onto an
 * {@link IIndex}. The {@link IIndex} may be local or remote.
 * <p>
 * A multipart key is used. The first component is the milliseconds of the
 * associated timestamp value rounded down to an even number of minutes and
 * represented as a long. The second component is the fully qualified path of
 * the counter. The last component is the exact timestamp (in milliseconds) of
 * the sampled counter value, represented as a long. These are formatted into
 * an unsigned byte[] following the standard practice.
 * <p>
 * The value stored under the key is the counter value. Normally counter values
 * are doubles or longs, but you can store any of the counter value types which
 * are supported by the {@link SparseRowStore}.
 * <p>
 * Using this approach, writes of the same counter value with different
 * timestamps will be recorded as different tuples in the {@link IIndex} and
 * you can store counter values sampled at rates of once per second while
 * retaining good compression for the keys in the index.
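 * <p>
 * For example (illustrative values only; the actual unsigned byte[] encoding
 * is produced by the tuple serializer defined below): a sample for the counter
 * path <code>/myhost/CPU/%idle</code> taken at timestamp <code>t</code>
 * milliseconds is keyed roughly as
 *
 * <pre>
 * [ t/60000 (minutes, long) | "/myhost/CPU/%idle" (ASCII) | t (milliseconds, long) ]
 * </pre>
 *
 * and the value stored under that key is the serialized sample value.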
 * 
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * @version $Id$
 * 
 * FIXME Reading through per-minute counters from a CounterSetBTree grows slow
 * very quickly.
 * 
 * <pre>
 * There are 21750988 counter values covering Fri Apr 03 15:51:57 EDT 2009 to
 * Sat Apr 04 08:45:05 EDT 2009. Took 60 seconds to record each hour of data on
 * the disk. 1.2G of XML data expanded to 2.6G on the journal
 * </pre>
 * 
 * In order to improve performance, put the counter paths in a separate
 * dictionary and apply the regex there. Once we have the set of matched paths
 * we can scatter range queries against the BTree and drag back the data for
 * those counters (this would also make Unicode counter names viable). If the
 * key was then [pathId,timestamp] we could do ordered reads of just the
 * necessary key range for each desired counter. Prefix compression would
 * still be efficient for this representation. While the data arrive in
 * history blocks, we would still need to buffer them for ordered writes since
 * otherwise the writes would be scattered by the first key component
 * (pathId).
 * <p>
 * I would have to encapsulate the counters as a relation for this to work,
 * much like the RDF DB. There would be two relations: the dictionary and the
 * timestamped values.
 * <p>
 * Space efficient encoding of the counter values would also help quite a bit
 * - it is Java default serialization, but we only store Long, Double or
 * String. All values for a given counter should have the same data type (it
 * is required by how we allocate the History) so the data type can be part of
 * the dictionary and that can be used to decode the value. (If values tend to
 * be close then a delta encoding would help.)
 */
public class CounterSetBTree extends BTree {

    protected static transient final Logger log = Logger
            .getLogger(CounterSetBTree.class);

    /**
     * @param store
     * @param checkpoint
     * @param metadata
     * @param readOnly
     */
    public CounterSetBTree(IRawStore store, Checkpoint checkpoint,
            IndexMetadata metadata, boolean readOnly) {

        super(store, checkpoint, metadata, readOnly);

    }

    static private final transient int INITIAL_CAPACITY = Bytes.kilobyte32;

    /**
     * Create a new instance.
     * 
     * @param store
     *            The backing store.
     * 
     * @return The new instance.
     */
    static public CounterSetBTree create(final IRawStore store) {

        final IndexMetadata metadata = new IndexMetadata(UUID.randomUUID());

        metadata.setBTreeClassName(CounterSetBTree.class.getName());

        metadata.setTupleSerializer(new CounterSetBTreeTupleSerializer(
                new ASCIIKeyBuilderFactory(INITIAL_CAPACITY)));

        return (CounterSetBTree) BTree.create(store, metadata);

    }

    static public CounterSetBTree createTransient() {

        final IndexMetadata metadata = new IndexMetadata(UUID.randomUUID());

        metadata.setBTreeClassName(CounterSetBTree.class.getName());

        metadata.setTupleSerializer(new CounterSetBTreeTupleSerializer(
                new ASCIIKeyBuilderFactory(INITIAL_CAPACITY)));

        return (CounterSetBTree) BTree.createTransient(metadata);

    }

    /**
     * A representation of a timestamped performance counter value as stored in
     * the {@link CounterSetBTree}. The minutes, path, and timestamp fields are
     * recovered from the key. The counter value is recovered from the value.
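     * <p>
     * For example (illustrative values): an {@link Entry} with path
     * <code>/myhost/CPU/%idle</code>, timestamp <code>1238789517000L</code>,
     * and value <code>97.5d</code> corresponds to one tuple in the index; its
     * {@link Entry#getDepth()} is 3 since the path contains three '/'
     * characters.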
     * 
     * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
     * @version $Id$
     */
    static public class Entry {

        // key
        public final String path;
        public final long timestamp;

        // value
        public final Object value;

        public Entry(final long timestamp, final String path,
                final Object value) {

            this.timestamp = timestamp;

            this.path = path;

            this.value = value;

        }

        public String toString() {

            return getClass().getName() + //
                    "{ path=" + path + //
                    ", value=" + value + //
                    ", timestamp=" + timestamp + //
                    "}";

        }

        /**
         * Return the depth of the path in the performance counter hierarchy
         * (counts the #of '/' characters in the path).
         * 
         * @return The depth.
         */
        public int getDepth() {

            int depth = 0;

            final int len = path.length();

            for (int i = 0; i < len; i++) {

                if (path.charAt(i) == '/') {

                    depth++;

                }

            }

            return depth;

        }

    }

    /**
     * Encapsulates key and value formation. The key is formed from the
     * minutes, the path, and the timestamp. The value is the performance
     * counter value for a specific timestamp.
     * 
     * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
     * @version $Id$
     */
    static protected class CounterSetBTreeTupleSerializer extends
            DefaultTupleSerializer<Object, Entry> {

        /**
         * 
         */
        private static final long serialVersionUID = -887369151228567134L;

        /**
         * De-serialization ctor.
         */
        public CounterSetBTreeTupleSerializer() {

            super();

        }

        /**
         * Ctor when creating a new instance.
         * 
         * @param keyBuilderFactory
         */
        public CounterSetBTreeTupleSerializer(
                final IKeyBuilderFactory keyBuilderFactory) {

            super(keyBuilderFactory);

        }

        /**
         * Return the unsigned byte[] key.
         * 
         * @param obj
         *            An {@link ICounter} or {@link Entry}.
         */
        @Override
        public byte[] serializeKey(final Object obj) {

            if (obj == null)
                throw new IllegalArgumentException();

            if (obj instanceof ICounter) {

                return serializeKey((ICounter) obj);

            } else if (obj instanceof Entry) {

                return serializeKey((Entry) obj);

            } else {

                throw new UnsupportedOperationException(obj.getClass()
                        .getName());

            }

        }

        public byte[] serializeKey(final ICounter c) {

            final long timestamp = c.lastModified();

            return getKeyBuilder().reset()//
                    .append(TimeUnit.MILLISECONDS.toMinutes(timestamp))//
                    .appendASCII(c.getPath())//
                    .append(timestamp)//
                    .getKey();

        }

        public byte[] serializeKey(final Entry e) {

            return getKeyBuilder().reset()//
                    .append(TimeUnit.MILLISECONDS.toMinutes(e.timestamp))//
                    .appendASCII(e.path)//
                    .append(e.timestamp)//
                    .getKey();

        }

        /**
         * Overridden to serialize just {@link Entry#value} as the value
         * component of the B+Tree tuple.
         */
        @Override
        public byte[] serializeVal(final Entry value) {

            return SerializerUtil.serialize(value.value);

        }

        public Entry deserialize(final ITuple tuple) {

            final byte[] key = tuple.getKey();

            // final long minutes = KeyBuilder.decodeLong(key, 0/* off */);

            final String path = KeyBuilder.decodeASCII(key,
                    Bytes.SIZEOF_LONG/* off */, key.length
                            - (2 * Bytes.SIZEOF_LONG)/* len */);

            final long timestamp = KeyBuilder.decodeLong(key, key.length
                    - Bytes.SIZEOF_LONG/* off */);

            // @todo tuple.getValueStream()
            final Object value = SerializerUtil.deserialize(tuple.getValue());

            return new Entry(timestamp, path, value);

        }

    }

    /**
     * Handles efficient writes of counters with {@link History} data. The
     * shape of the data is changed so that the resulting writes on the BTree
     * will be ordered. This is both faster and also results in a smaller size
     * on the disk (since leaves are not updated once they are written to the
     * store). For a counter without history, the current value of the counter
     * will be written on the BTree.
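     * <p>
     * A minimal usage sketch (illustrative only; <code>root</code> is assumed
     * to be an existing {@link CounterSet} whose counters carry
     * {@link HistoryInstrument} data, and <code>getCounters(Pattern)</code> is
     * assumed from {@link ICounterSet}):
     * 
     * <pre>
     * final CounterSetBTree btree = CounterSetBTree.createTransient();
     * 
     * // buffer, order, and write the timestamped samples.
     * btree.writeHistory(root.getCounters(null));
     * 
     * // read them back as a CounterSet, aggregated per minute (toTime is one
     * // unit beyond the last timestamp of interest).
     * final CounterSet read = btree.rangeIterator(btree.getFirstTimestamp(),
     *         btree.getLastTimestamp() + TimeUnit.MINUTES.toMillis(1),
     *         TimeUnit.MINUTES, null, 0);
     * </pre>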
     */
    public void writeHistory(final Iterator<ICounter> src) {

        final long begin = System.currentTimeMillis();

        final Vector<KVO<Entry>> v = new Vector<KVO<Entry>>();

        final CounterSetBTreeTupleSerializer tupleSer = (CounterSetBTreeTupleSerializer) getIndexMetadata()
                .getTupleSerializer();

        while (src.hasNext()) {

            final ICounter c = src.next();

            final String path = c.getPath();

            if (c.getInstrument() instanceof HistoryInstrument) {

                final History h = ((HistoryInstrument) (c.getInstrument()))
                        .getHistory();

                final SampleIterator sitr = h.iterator();

                while (sitr.hasNext()) {

                    final IHistoryEntry e = sitr.next();

                    final Entry entry = new Entry(e.lastModified(), path, e
                            .getValue());

                    final byte[] key = tupleSer.serializeKey(entry);

                    final byte[] val = tupleSer.serializeVal(entry);

                    v.add(new KVO<Entry>(key, val, entry));

                }

            } else {

                final Entry entry = new Entry(c.lastModified(), path, c
                        .getValue());

                final byte[] key = tupleSer.serializeKey(entry);

                final byte[] val = tupleSer.serializeVal(entry);

                v.add(new KVO<Entry>(key, val, entry));

            }

        }

        // to array
        final KVO[] a = v.toArray(new KVO[v.size()]);

        // order by the key.
        Arrays.sort(a);

        long nwritten = 0;

        // ordered write on the BTree.
        for (KVO t : a) {

            /*
             * Note: Don't overwrite if we already have the timestamped counter
             * value in the store.
             */
            if (!super.contains(t.key)) {

                super.insert(t.key, t.val);

                nwritten++;

            }

        }

        final long elapsed = System.currentTimeMillis() - begin;

        if (log.isInfoEnabled()) {

            log.info("Wrote " + nwritten + " of " + a.length + " tuples in "
                    + elapsed + "ms");

        }

    }

    /**
     * Writes the <strong>current</strong> value of each visited
     * {@link ICounter} on the store.
     * <p>
     * Note: This presumes that the counters are associated with scalar values
     * (rather than {@link History}s).
     * <p>
     * Note that the counter set iterator will normally be in alpha order
     * already and all samples should be close to the same minute, so this is
     * already efficient for local operations.
     * 
     * @todo More efficient storage for Double, Long, Integer and String
     *       values (this is using Java default serialization)?
     */
    public void writeCurrent(final Iterator<ICounter> src) {

        while (src.hasNext()) {

            final ICounter c = src.next();

            if (log.isDebugEnabled()) {

                log.debug(c);

            }

//            if (c.getInstrument() instanceof HistoryInstrument) {
//
//                /*
//                 * This handles a history counter. However, loading a set of
//                 * history counters will cause writes to be scattered across
//                 * the index since the counters are processed in alpha (path)
//                 * order but each counter has a history (ascending minutes).
//                 * In order to be efficient, the histories need to be
//                 * converted into a KVO[] and then sorted before doing a bulk
//                 * insert.
//                 */
//
//                final String path = c.getPath();
//
//                final History h = ((HistoryInstrument) c.getInstrument())
//                        .getHistory();
//
//                final SampleIterator sitr = h.iterator();
//
//                while (sitr.hasNext()) {
//
//                    final IHistoryEntry hentry = sitr.next();
//
//                    // entry reporting the average value for the history slot.
//                    final Entry entry = new Entry(hentry.lastModified(), path,
//                            hentry.getValue());
//
//                    insert(entry, entry.value);
//
//                }
//
//            } else {

            // just the current value of the counter.
            insert(c, c.getValue());

//            }

        }

    }

    /**
     * <strong>The toTime needs to be ONE (1) <i>unit</i> beyond the time of
     * interest since the minutes come first in the key. If you do not follow
     * this rule then you can miss out on the last <i>unit</i> worth of
     * data.</strong>
     * 
     * @param fromTime
     *            The first time whose counters will be visited (in
     *            milliseconds).
     * @param toTime
     *            The first time whose counters WILL NOT be visited (in
     *            milliseconds).
     * @param unit
     *            The unit of aggregation for the reported counters.
     * @param filter
     *            Only paths matched by the filter will be accepted (optional).
     * @param depth
     *            When non-zero, only counters whose depth is LTE to the
     *            specified <i>depth</i> will be returned.
     * 
     * @return A collection of the selected performance counters together with
     *         their ordered timestamped values for the specified time period.
     * 
     * @todo In an act of cowardice, this assumes that the counter paths are
     *       ASCII and encodes them as such. This allows us to decode the
     *       counter path since it is not a compressed sort key. If we don't
     *       take this "hack" then we need a 2nd index to resolve the Unicode
     *       path from the sort key (once we hack off the leading minutes
     *       component).
     *       <p>
     *       The other problem is that tacking the milliseconds onto the end
     *       of the key might break the natural order of the counter paths in
     *       the index.
     *       <p>
     *       The two index approach is not so bad. The main drawback is that
     *       it can't be encapsulated as easily.
     */
    public CounterSet rangeIterator(long fromTime, long toTime,
            final TimeUnit unit, final Pattern filter, final int depth) {

        if (fromTime < 0)
            throw new IllegalArgumentException();

        if (toTime < 0)
            throw new IllegalArgumentException();

        if (unit == null)
            throw new IllegalArgumentException();

        if (fromTime == 0L) {

            /*
             * Default is the first available timestamp.
             */
            fromTime = getFirstTimestamp();

        }

        if (toTime == 0L || toTime == Long.MAX_VALUE) {

            /*
             * Default is the last available timestamp.
             */
            toTime = getLastTimestamp();

        }

        /*
         * Convert the covered time span into the caller's unit of
         * aggregation.
         * 
         * Note: The +1 is required to allocate enough slots in the History.
         * Without it the History class can overwrite the first slot, which
         * will cause the data to be underreported for the first time period.
         */
        final long nslots = unit.convert(toTime, TimeUnit.MILLISECONDS)
                - unit.convert(fromTime, TimeUnit.MILLISECONDS) + 1;

        if (nslots > Integer.MAX_VALUE)
            throw new IllegalArgumentException("too many samples");

        final CounterSetBTreeTupleSerializer tupleSer = (CounterSetBTreeTupleSerializer) getIndexMetadata()
                .getTupleSerializer();

        final IKeyBuilder keyBuilder = getIndexMetadata().getTupleSerializer()
                .getKeyBuilder();

        /*
         * Note: The first field in the key is the counter timestamp converted
         * to minutes since the epoch. Therefore we need to take the fromTime
         * milliseconds and convert it to minutes. Since that conversion
         * truncates the value, we will always have a fromKey that is EQ to
         * the minute in which the counters with a [fromTime] timestamp would
         * be found.
         */
        final long fromMinutes = TimeUnit.MILLISECONDS.toMinutes(fromTime);

        final byte[] fromKey = keyBuilder.reset().append(fromMinutes).getKey();

        /*
         * Note: The [toKey] needs to be strictly GT the minute in which the
         * [toTime] would be found. This may overscan, but that is better than
         * failing to scan enough. Any overscan is filtered out below.
         */
        final long toMinutes = TimeUnit.MILLISECONDS.toMinutes(toTime
                + TimeUnit.MINUTES.toMillis(1));

        final byte[] toKey = keyBuilder.reset().append(toMinutes).getKey();

        if (log.isInfoEnabled()) {

            log.info("fromTime=" + fromTime + "ms (" + fromMinutes
                    + "m), toTime=" + toTime + "ms (" + toMinutes
                    + "m), units=" + unit + ", nslots=" + nslots);

        }

        // iterator scanning the counters.
        final ITupleIterator itr = rangeIterator(fromKey, toKey);

        // #of distinct counter paths selected by the query.
        int nselected = 0;

        // #of timestamp counter values accepted.
        long nvalues = 0;

        // #of tuples (aka timestamped counter values) visited.
        long nvisited = 0;

        // counters are inserted into this collection.
        final CounterSet counters = new CounterSet();

        // factory for history counters.
        final IInstrumentFactory instrumentFactory = new DefaultInstrumentFactory(
                (int) nslots, PeriodEnum.getValue(unit), false/* overwrite */);

        while (itr.hasNext()) {

            final ITuple tuple = itr.next();

            nvisited++;

            final Entry entry = tupleSer.deserialize(tuple);

            if (entry.timestamp < fromTime || entry.timestamp >= toTime) {

                /*
                 * Due to the leading [minutes] field in the key there can be
                 * some underscan and overscan of the index. Therefore we
                 * filter to ensure that only timestamps which are strictly
                 * within the specified milliseconds are extracted.
                 */

                if (log.isTraceEnabled()) {

                    log.trace("Rejected: minutes="
                            + TimeUnit.MILLISECONDS.toMinutes(entry.timestamp)
                            + " : " + entry.path);

                }

                continue;

            }

            if (depth != 0 && entry.getDepth() > depth) {

                if (log.isTraceEnabled()) {

                    log.trace("Rejected: minutes="
                            + TimeUnit.MILLISECONDS.toMinutes(entry.timestamp)
                            + " : " + entry.path);

                }

                continue;

            }

            if (filter != null && !filter.matcher(entry.path).matches()) {

                if (log.isTraceEnabled()) {

                    log.trace("Rejected: minutes="
                            + TimeUnit.MILLISECONDS.toMinutes(entry.timestamp)
                            + " : " + entry.path);

                }

                continue;

            }

            ICounterNode c = counters.getPath(entry.path);

            final IInstrument inst;

            if (c == null) {

                // log first time matched for each path.
                if (log.isDebugEnabled()) {

                    log.debug("Matched: ndistinct=" + nselected + ", "
                            + entry.path);

                }

                nselected++;

                inst = instrumentFactory.newInstance(entry.value.getClass());

                c = counters.addCounter(entry.path, inst);

            } else if (c instanceof ICounterSet) {

                log.error("CounterSet exists for counter path: " + entry.path);

                continue;

            } else {

                inst = ((ICounter) c).getInstrument();

            }

            inst.setValue(entry.value, entry.timestamp);

            nvalues++;

        }

        if (log.isInfoEnabled())
            log.info("nselected=" + nselected + ", nvalues=" + nvalues
                    + ", nvisited=" + nvisited);

        return counters;

    }

    /**
     * Return the timestamp associated with the first performance counter
     * value.
     * 
     * @return The timestamp -or- 0L if there are no performance counter
     *         values.
     */
    public long getFirstTimestamp() {

        if (getEntryCount() == 0)
            return 0L;

        return ((Entry) rangeIterator(null, null, 1/* capacity */,
                IRangeQuery.DEFAULT, null/* filter */).next().getObject()).timestamp;

    }

    /**
     * Return the timestamp associated with the last performance counter
     * value.
     * 
     * @return The timestamp -or- 0L if there are no performance counter
     *         values.
     */
    public long getLastTimestamp() {

        if (getEntryCount() == 0)
            return 0L;

        return ((Entry) rangeIterator(null, null, 1/* capacity */,
                IRangeQuery.DEFAULT | IRangeQuery.REVERSE, null/* filter */)
                .next().getObject()).timestamp;

    }

}