/*
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Mar 22, 2009
*/
package com.bigdata.counters.store;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.UUID;
import java.util.Vector;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;

import com.bigdata.btree.BTree;
import com.bigdata.btree.Checkpoint;
import com.bigdata.btree.DefaultTupleSerializer;
import com.bigdata.btree.IIndex;
import com.bigdata.btree.IRangeQuery;
import com.bigdata.btree.ITuple;
import com.bigdata.btree.ITupleIterator;
import com.bigdata.btree.IndexMetadata;
import com.bigdata.btree.keys.ASCIIKeyBuilderFactory;
import com.bigdata.btree.keys.IKeyBuilder;
import com.bigdata.btree.keys.IKeyBuilderFactory;
import com.bigdata.btree.keys.KVO;
import com.bigdata.btree.keys.KeyBuilder;
import com.bigdata.counters.CounterSet;
import com.bigdata.counters.DefaultInstrumentFactory;
import com.bigdata.counters.History;
import com.bigdata.counters.History.SampleIterator;
import com.bigdata.counters.HistoryInstrument;
import com.bigdata.counters.ICounter;
import com.bigdata.counters.ICounterNode;
import com.bigdata.counters.ICounterSet;
import com.bigdata.counters.ICounterSet.IInstrumentFactory;
import com.bigdata.counters.IHistoryEntry;
import com.bigdata.counters.IInstrument;
import com.bigdata.counters.PeriodEnum;
import com.bigdata.io.SerializerUtil;
import com.bigdata.rawstore.IRawStore;
import com.bigdata.sparse.SparseRowStore;
import com.bigdata.util.Bytes;
/**
* An API for writing and querying counter sets. The data are
* written onto an {@link IIndex}. The {@link IIndex} may be local or remote.
* <p>
* The multipart key is used. The first component is the milliseconds of the
* associated timestamp value rounded down to an even number of minutes and
represented as a long. The second component is the fully qualified path of the
* counter. The last component is the exact timestamp (in milliseconds) of the
* sampled counter value, represented as a long. These are formatted into an
* unsigned byte[] following the standard practice.
* <p>
* The value stored under the key is the counter value. Normally counter values
* are doubles or longs, but you can store any of the counter value types which
* are supported by the {@link SparseRowStore}.
* <p>
* Using this approach, writes of the same counter value with different
* timestamps will be recorded as different tuples in the {@link IIndex} and you
* can store counter values sampled at rates of once per second while retaining
* good compression for the keys in the index.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
* @version $Id$
*
* FIXME Reading through per-minute counters from a CounterSetBTree grows slow
* very quickly.
*
* <pre>
* There are 21750988 counter values covering Fri Apr 03 15:51:57 EDT 2009 to
* Sat Apr 04 08:45:05 EDT 2009. Took 60 seconds to record each hour of data on
* the disk. 1.2G of XML data expanded to 2.6G on the journal
* </pre>
*
* In order to improve performance, put the counter paths in a separate
* dictionary and apply the regex there. Once we have the set of matched paths
* we can scatter range queries against the BTree and drag back the data for
* those counters (this would also make Unicode counter names viable). If the
* key was then [pathId,timestamp] we could do ordered reads of just the
* necessary key range for each desired counter. Prefix compression would still
be efficient for this representation. While the data arrive in history blocks,
* we would still need to buffer them for ordered writes since otherwise the
* writes would be scattered by the first key component (pathId).
* <p>
* I would have to encapsulate the counters as a counter for this to work, much
* like the RDF DB. There would be two relations: the dictionary and the
* timestamped values.
* <p>
* Space efficient encoding of the counter values would also help quite a bit -
* it is Java default serialization, but we only store Long, Double or String.
* All values for a given counter should have the same data type (it is required
* by how we allocate the History) so the data type can be part of the
* dictionary and that can be used to decode the value. (If values tend to be
* close then a delta encoding would help.)
*/
public class CounterSetBTree extends BTree {
protected static transient final Logger log = Logger
.getLogger(CounterSetBTree.class);
/**
* @param store
* @param checkpoint
* @param metadata
*/
public CounterSetBTree(IRawStore store, Checkpoint checkpoint,
IndexMetadata metadata, boolean readOnly) {
super(store, checkpoint, metadata, readOnly);
}
static private final transient int INITIAL_CAPACITY = Bytes.kilobyte32;
/**
* Create a new instance.
*
* @param store
* The backing store.
*
* @return The new instance.
*/
static public CounterSetBTree create(final IRawStore store) {
final IndexMetadata metadata = new IndexMetadata(UUID.randomUUID());
metadata.setBTreeClassName(CounterSetBTree.class.getName());
metadata.setTupleSerializer(new CounterSetBTreeTupleSerializer(
new ASCIIKeyBuilderFactory(INITIAL_CAPACITY)));
return (CounterSetBTree) BTree.create(store, metadata);
}
static public CounterSetBTree createTransient() {
final IndexMetadata metadata = new IndexMetadata(UUID.randomUUID());
metadata.setBTreeClassName(CounterSetBTree.class.getName());
metadata.setTupleSerializer(new CounterSetBTreeTupleSerializer(
new ASCIIKeyBuilderFactory(INITIAL_CAPACITY)));
return (CounterSetBTree) BTree.createTransient(metadata);
}
/**
* A representation of a timestamped performance counter value as stored in
* the {@link CounterSetBTree}. The minutes, path, and timestamp fields are
* recovered from the key. The counter value is recovered from the value.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
* @version $Id$
*/
static public class Entry {
// key
public final String path;
public final long timestamp;
// value
public final Object value;
public Entry(final long timestamp,
final String path, final Object value) {
this.timestamp = timestamp;
this.path = path;
this.value = value;
}
public String toString() {
return getClass().getName()+//
"{ path="+path+//
", value="+value+//
", timestamp="+timestamp+//
"}";
}
/**
* Return the depth of the path in the performance counter hierarchy
* (counts the #of '/' characters in the path).
*
* @return The depth.
*/
public int getDepth() {
int depth = 0;
final int len = path.length();
for (int i = 0; i < len; i++) {
if (path.charAt(i) == '/') {
depth++;
}
}
return depth;
}
}
/**
* Encapsulates key and value formation. The key is formed from the minutes,
* the path, and the timestamp. The value is the performance counter value
* for a specific timestamp.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
* @version $Id$
*/
static protected class CounterSetBTreeTupleSerializer extends
DefaultTupleSerializer<Object, Entry> {
/**
*
*/
private static final long serialVersionUID = -887369151228567134L;
/**
* De-serialization ctor.
*/
public CounterSetBTreeTupleSerializer() {
super();
}
/**
* Ctor when creating a new instance.
*
* @param keyBuilderFactory
*/
public CounterSetBTreeTupleSerializer(
final IKeyBuilderFactory keyBuilderFactory) {
super(keyBuilderFactory);
}
/**
* Return the unsigned byte[] key.
*
* @param obj
* An {@link ICounter} or {@link Entry}.
*/
@Override
public byte[] serializeKey(final Object obj) {
if (obj == null)
throw new IllegalArgumentException();
if(obj instanceof ICounter) {
return serializeKey((ICounter)obj);
} else if(obj instanceof Entry) {
return serializeKey((Entry)obj);
} else {
throw new UnsupportedOperationException(obj.getClass().getName());
}
}
public byte[] serializeKey(final ICounter c) {
final long timestamp = c.lastModified();
return getKeyBuilder().reset()//
.append(TimeUnit.MILLISECONDS.toMinutes(timestamp))//
.appendASCII(c.getPath())//
.append(timestamp)//
.getKey();
}
public byte[] serializeKey(final Entry e) {
return getKeyBuilder().reset()//
.append(TimeUnit.MILLISECONDS.toMinutes(e.timestamp))//
.appendASCII(e.path)//
.append(e.timestamp)//
.getKey();
}
/**
* Overridden to serialize just {@link Entry#value} as the value
* component of the B+Tree tuple.
*/
@Override
public byte[] serializeVal(final Entry value) {
return SerializerUtil.serialize(value.value);
}
public Entry deserialize(final ITuple tuple) {
final byte[] key = tuple.getKey();
// final long minutes = KeyBuilder.decodeLong(key, 0/* off */);
final String path = KeyBuilder.decodeASCII(key,
Bytes.SIZEOF_LONG/* off */, key.length
- (2 * Bytes.SIZEOF_LONG)/* len */);
final long timestamp = KeyBuilder.decodeLong(key, key.length
- Bytes.SIZEOF_LONG/* off */);
// @todo tuple.getValueStream()
final Object value = SerializerUtil.deserialize(tuple.getValue());
return new Entry(timestamp, path, value);
}
}
/**
* Handles efficient writes of counters with {@link History} data. The shape
* of the data is changed so that the resulting writes on the BTree will be
* ordered. This is both faster and also results in a smaller size on the
* size (since leaves are not updated once they are written to the store).
* For a counter without history, the current value of the counter will be
* written on the BTree.
*/
public void writeHistory(final Iterator<ICounter> src) {
final long begin = System.currentTimeMillis();
final Vector<KVO<Entry>> v = new Vector<KVO<Entry>>();
final CounterSetBTreeTupleSerializer tupleSer = (CounterSetBTreeTupleSerializer) getIndexMetadata()
.getTupleSerializer();
while (src.hasNext()) {
final ICounter c = src.next();
final String path = c.getPath();
if (c.getInstrument() instanceof HistoryInstrument) {
final History h = ((HistoryInstrument) (c.getInstrument()))
.getHistory();
final SampleIterator sitr = h.iterator();
while (sitr.hasNext()) {
final IHistoryEntry e = sitr.next();
final Entry entry = new Entry(e.lastModified(), path, e
.getValue());
final byte[] key = tupleSer.serializeKey(entry);
final byte[] val = tupleSer.serializeVal(entry);//.value);
v.add(new KVO<Entry>(key, val, entry));
}
} else {
final Entry entry = new Entry(c.lastModified(), path, c
.getValue());
final byte[] key = tupleSer.serializeKey(entry);
final byte[] val = tupleSer.serializeVal(entry);
v.add(new KVO<Entry>(key, val, entry));
}
}
// to array
final KVO[] a = v.toArray(new KVO[v.size()]);
// order by the key.
Arrays.sort(a);
long nwritten = 0;
// ordered write on the BTree.
for (KVO t : a) {
/*
* Note: Don't overwrite if we already have the timestamped counter
* value in the store.
*/
if (!super.contains(t.key)) {
super.insert(t.key, t.val);
nwritten++;
}
}
final long elapsed = System.currentTimeMillis()-begin;
if(log.isInfoEnabled()) {
log.info("Wrote " + nwritten + " of " + a.length + " tuples in "
+ elapsed + "ms");
}
}
/**
* Writes the <strong>current</strong> value of each visited
* {@link ICounter} on the store.
* <p>
* Note: This presumes that the counters are associated with scalar values
* (rather than {@link History}s).
* <p>
* Note that the counter set iterator will normally be in alpha order
* already and all samples should be close to the same minute, so this is
* already efficient for local operations.
*
* @todo More efficient storage for Double, Long, Integer and String values
* (this is using Java default serialization)?
*/
public void writeCurrent(final Iterator<ICounter> src) {
while (src.hasNext()) {
final ICounter c = src.next();
if(log.isDebugEnabled()) {
log.debug(c);
}
// if (c.getInstrument() instanceof HistoryInstrument) {
//
// /*
// * This handles a history counter. However, loading a set
// * history counters will cause writes to be scattered across the
// * index since the counters are processed in alpha (path) order
// * but each counter has a history (ascending minutes). In order
// * to be efficient, the histories need to be converted into a
// * KVO[] and then sorted before doing a bulk insert.
// */
//
// final String path = c.getPath();
//
// final History h = ((HistoryInstrument)c.getInstrument()).getHistory();
//
// final SampleIterator sitr = h.iterator();
//
// while(sitr.hasNext()) {
//
// final IHistoryEntry hentry = sitr.next();
//
// // entry reporting the average value for the history slot.
// final Entry entry = new Entry(hentry.lastModified(), path,
// hentry.getValue());
//
// insert(entry, entry.value);
//
// }
//
// } else {
// just the current value of the counter.
insert(c, c.getValue());
// }
}
}
/**
* <strong>The toTime needs to be ONE (1) <i>unit</i> beyond the time of
* interest since the minutes come first in the key. If you do not follow
* this rule then you can miss out on the last <i>unit</i> worth of data.</strong>
*
* @param fromTime
* The first time whose counters will be visited (in
* milliseconds).
* @param toTime
* The first time whose counters WILL NOT be visited (in
* milliseconds).
* @param unit
* The unit of aggregation for the reported counters.
* @param filter
* Only paths matched by the filter will be accepted (optional).
* @param depth
* When non-zero, only counters whose depth is LTE to the
* specified <i>depth</i> will be returned.
*
* @return A collection of the selected performance counters together with
* their ordered timestamped values for the specified time period.
*
* @todo In an act of cowardice, this assumes that the counter paths are
* ASCII and encodes them as such. This allows us to decode the
* counter path since it is not a compressed sort key. If we don't
* take this "hack" then we need a 2nd index to resolve the Unicode
* path from the sort key (once we hack off the leading minutes
* component).
* <p>
* The other problem is that tacking the milliseconds onto the end of
* the key might break the natural order of the counter paths in the
* index.
* <p>
* The two index approach is not so bad. The main drawback is that it
* can't be encapsulated as easily.
*/
public CounterSet rangeIterator(long fromTime, long toTime,
final TimeUnit unit, final Pattern filter, final int depth) {
if (fromTime < 0)
throw new IllegalArgumentException();
if (toTime < 0)
throw new IllegalArgumentException();
if (unit == null)
throw new IllegalArgumentException();
if (fromTime == 0L) {
/*
* Default is the first available timestamp.
*/
fromTime = getFirstTimestamp();
}
if (toTime == 0L || toTime == Long.MAX_VALUE) {
/*
* Default is the last available timestamp.
*/
toTime = getLastTimestamp();
}
/*
* Convert the covered time span into the caller's unit of aggregation.
*
* Note: The +1 is required to allocate enough slots in the History.
* Without it the History class can overwrite the first slot, which will
* cause the data to be underreported for the first time period.
*/
final long nslots = unit.convert(toTime, TimeUnit.MILLISECONDS)
- unit.convert(fromTime, TimeUnit.MILLISECONDS) + 1;
if (nslots > Integer.MAX_VALUE)
throw new IllegalArgumentException("too many samples");
final CounterSetBTreeTupleSerializer tupleSer = (CounterSetBTreeTupleSerializer) getIndexMetadata()
.getTupleSerializer();
final IKeyBuilder keyBuilder = getIndexMetadata().getTupleSerializer()
.getKeyBuilder();
/*
* Note: The first field in the key is the counter timestamp converted
* to minutes since the epoch. Therefore we need to take the fromTime
* milliseconds and convert it to minutes. Since that conversion
* truncates the value, we will always have a fromKey that is EQ to the
* minute in which the counters with a [fromTime] timestamp would be
* found.
*/
final long fromMinutes = TimeUnit.MILLISECONDS.toMinutes(fromTime);
final byte[] fromKey = keyBuilder.reset().append(fromMinutes).getKey();
/*
* Note: The [toKey] needs to be strictly GT the minute in which the
* [toTime] would be found. This may overscan, but that is better than
* failing to scan enough. Any overscan is filtered out below.
*/
final long toMinutes = TimeUnit.MILLISECONDS.toMinutes(toTime
+ TimeUnit.MINUTES.toMillis(1));
final byte[] toKey = keyBuilder.reset().append(toMinutes).getKey();
if(log.isInfoEnabled()) {
log.info("fromTime=" + fromTime + "ms (" + fromMinutes
+ "m), toTime=" + toTime + "ms (" + toMinutes
+ "m), units=" + unit + ", nslots=" + nslots);
}
// iterator scanning the counters.
final ITupleIterator itr = rangeIterator(fromKey, toKey);
// #of distinct counter paths selected by the query.
int nselected = 0;
// #of timestamp counter values accepted.
long nvalues = 0;
// #of tuples (aka timestamped counter values) visited.
long nvisited = 0;
// counters are inserted into this collection.
final CounterSet counters = new CounterSet();
// factory for history counters.
final IInstrumentFactory instrumentFactory = new DefaultInstrumentFactory(
(int) nslots, PeriodEnum.getValue(unit), false/* overwrite */);
while (itr.hasNext()) {
final ITuple tuple = itr.next();
nvisited++;
final Entry entry = tupleSer.deserialize(tuple);
if (fromTime < entry.timestamp || toTime >= entry.timestamp) {
/*
* Due to the leading [minutes] field in the key there can be
* some underscan and overscan of the index. Therefore we filter
* to ensure that only timestamps which are strictly within the
* specified milliseconds are extracted.
*/
if (log.isTraceEnabled()) {
log.trace("Rejected: minutes="
+ TimeUnit.MILLISECONDS.toMinutes(entry.timestamp)
+ " : " + entry.path);
}
}
if (depth != 0 && depth > entry.getDepth()) {
if (log.isTraceEnabled()) {
log.trace("Rejected: minutes="
+ TimeUnit.MILLISECONDS.toMinutes(entry.timestamp)
+ " : " + entry.path);
}
}
if (filter != null && !filter.matcher(entry.path).matches()) {
if (log.isTraceEnabled()) {
log.trace("Rejected: minutes="
+ TimeUnit.MILLISECONDS.toMinutes(entry.timestamp)
+ " : " + entry.path);
}
continue;
}
ICounterNode c = counters.getPath(entry.path);
final IInstrument inst;
if (c == null) {
// log first time matched for each path.
if (log.isDebugEnabled()) {
log.debug("Matched: ndistinct=" + nselected + ", "
+ entry.path);
}
nselected++;
inst = instrumentFactory.newInstance(entry.value.getClass());
c = counters.addCounter(entry.path, inst);
} else if (c instanceof ICounterSet) {
log.error("CounterSet exists for counter path: " + entry.path);
continue;
} else {
inst = ((ICounter) c).getInstrument();
}
inst.setValue(entry.value, entry.timestamp);
nvalues++;
}
if (log.isInfoEnabled())
log.info("nselected=" + nselected + ", nvalues=" + nvalues
+ ", nvisited=" + nvisited);
return counters;
}
/**
* Return the timestamp associated with the first performance counter value.
*
* @return The timestamp -or- 0L if there are no performance counter values.
*/
public long getFirstTimestamp() {
if (getEntryCount() == 0)
return 0L;
return ((Entry) rangeIterator(null, null, 1/* capacity */,
IRangeQuery.DEFAULT, null/* filter */).next().getObject()).timestamp;
}
/**
* Return the timestamp associated with the last performance counter value.
*
* @return The timestamp -or- 0L if there are no performance counter values.
*/
public long getLastTimestamp() {
if (getEntryCount() == 0)
return 0L;
return ((Entry) rangeIterator(null, null, 1/* capacity */,
IRangeQuery.DEFAULT | IRangeQuery.REVERSE, null/* filter */)
.next().getObject()).timestamp;
}
}