/*
* Copyright © 2014 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.api.dataset.lib;
import co.cask.cdap.api.data.batch.BatchReadable;
import co.cask.cdap.api.data.batch.BatchWritable;
import co.cask.cdap.api.data.batch.IteratorBasedSplitReader;
import co.cask.cdap.api.data.batch.Split;
import co.cask.cdap.api.data.batch.SplitReader;
import co.cask.cdap.api.dataset.DatasetSpecification;
import co.cask.cdap.api.dataset.table.Table;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
/**
 * Defines a Dataset implementation for managing time series data. This class provides simple support for
 * reading entries over time ranges.
*
* <p>
 * This Dataset works by partitioning time into bins representing time intervals. Entries added to the Dataset
 * are assigned to a bin based on their timestamp and row key. Hence, every row in the underlying table contains
 * entries that share the same time interval and row key, with the data for each entry stored in a separate column.
* </p>
*
* <p>
* A user can set the time interval length for partitioning data into rows (as defined by
* <code>timeIntervalToStorePerRow</code> in the {@link co.cask.cdap.api.dataset.DatasetSpecification} properties).
 * This interval should be chosen according to the use case at hand. In general, a larger time interval means
 * faster reads of small-to-medium time ranges (up to several time intervals in size), but slower reads of very
 * small time ranges (a small fraction of one time interval). A larger time interval also speeds up batched
 * writes of entries.
* </p>
*
 * <p>Conversely, a smaller time interval provides faster reads of very small time ranges, but slower batched
 * writes of entries.
* </p>
*
* <p>
 * As expected, a larger time interval means that more data is stored per row. A user should generally avoid
 * storing more than 50 megabytes of data per row, since larger rows degrade performance.
* </p>
* <p>
 * The default time interval length is one hour; a value between one minute and several hours is generally
 * recommended. In cases where the volume of written entries is small, the rule of thumb is:
 * <br/><br/>
 * <code>row partition interval size = 5 * (average size of the time range to be read)</code>
 * <br/><br/>
 * For example, if reads typically cover a ten-minute range, a row partition interval of about fifty minutes is a
 * reasonable starting point.
 * </p>
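 * <p>
 * For illustration, a sketch of setting this property when creating the dataset in an application's
 * <code>configure()</code> method. It assumes the property value is the interval length in milliseconds
 * (matching the one-hour default) and uses the <code>DatasetProperties</code> builder; the dataset name
 * and chosen interval are arbitrary:
 * <pre>
 * createDataset("activity", TimeseriesTable.class, DatasetProperties.builder()
 *   .add("timeIntervalToStorePerRow", String.valueOf(TimeUnit.MINUTES.toMillis(30)))
 *   .build());
 * </pre>
 * </p>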
*
* <p>
 * TimeseriesTable supports tagging, where each entry is (optionally) labeled with a set of tags used to filter
 * entries during reads. For an entry to be retrievable by a given tag, that tag must have been provided when the
 * entry was written. If multiple tags are provided during a read, an entry must contain every one of them in
 * order to be returned.
* </p>
*
* <p>
 * Due to the storage format, filtering by tags during reads is done on the client side (not on the cluster),
 * while filtering by entry key happens on the server side and is therefore much more efficient. Depending on the
 * use case, you may want to fold some of your tags into the entry key for faster reads.
* </p>
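 * <p>
 * A brief usage sketch of writing and reading tagged entries (the key, tag, and time-range values are
 * illustrative only; <code>Bytes</code> refers to the CDAP byte-array utility):
 * <pre>
 * byte[] key = Bytes.toBytes("user-1");
 * byte[] loginTag = Bytes.toBytes("login");
 * table.write(new Entry(key, Bytes.toBytes("payload"), System.currentTimeMillis(), loginTag));
 * // returns only entries under this key that were written with the "login" tag
 * Iterator&lt;Entry&gt; entries = table.read(key, startTime, endTime, loginTag);
 * </pre>
 * </p>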
*
* <p>
* Notes on implementation:
* <ol>
* <li>
* This implementation does NOT address the RegionServer hot-spotting issue that appears when writing rows with
* monotonically increasing/decreasing keys into HBase. This point is relevant for HBase-backed data stores.
 * To avoid this problem, a user should not write all data under the same metric key. In general, writes are
 * spread across as many regions as there are distinct metric keys. With a single metric key, all writes hit a
 * single RegionServer at any given point in time, which is generally undesirable.
* </li>
* <li>
* The current implementation (including the format of the stored data) is heavily affected by the
* {@link co.cask.cdap.api.dataset.table.Table} API which is used "under-the-hood". In particular the
* implementation is constrained by the absence of a <code>readHigherOrEq()</code> method in the
 * {@link co.cask.cdap.api.dataset.table.Table} API, which would return the next row whose key is greater than
 * or equal to the given key.
* </li>
* <li>
 * Client code should not rely on these implementation details, as they may change without notice.
* </li>
* </ol>
* </p>
*
* @see CounterTimeseriesTable
*/
public class TimeseriesTable extends TimeseriesDataset
implements BatchReadable<byte[], TimeseriesTable.Entry>, BatchWritable<byte[], TimeseriesTable.Entry> {
/**
* Creates an instance of the table.
*/
public TimeseriesTable(DatasetSpecification spec, Table table) {
super(spec, table);
}
/**
* Writes an entry to the Dataset.
*
* @param entry entry to write
*/
public final void write(Entry entry) {
write(entry.getKey(), entry.getValue(), entry.getTimestamp(), entry.getTags());
}
/**
   * Reads entries for a given time range and returns an {@code Iterator<Entry>}.
* Provides the same functionality as {@link #read(byte[], long, long, byte[][]) read(byte[], long, long, byte[]...)}
* but accepts additional parameters for pagination purposes.
* NOTE: A limit is placed on the max number of time intervals to be scanned during a read, as defined by
* {@link #MAX_ROWS_TO_SCAN_PER_READ}.
*
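   * <p>
   * For example, a sketch of reading one "page" of results at a time (<code>pageNumber</code> and
   * <code>pageSize</code> are illustrative variables):
   * <pre>
   * int pageSize = 100;
   * Iterator&lt;Entry&gt; page = table.read(key, startTime, endTime, pageNumber * pageSize, pageSize);
   * </pre>
   * </p>
   *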
* @param key key of the entries to read
* @param startTime defines start of the time range to read, inclusive
* @param endTime defines end of the time range to read, inclusive
   * @param offset the number of initial qualifying entries to skip and exclude from the results
   * @param limit the maximum number of results to return; if more entries qualify, only the first
   * <code>limit</code> results are returned
   * @param tags a set of tags that returned entries must contain. Tags are defined at write time; an entry is
   * returned only if it contains all of these tags.
*
* @return an iterator over entries that satisfy provided conditions
   * @throws IllegalArgumentException if the provided parameters are invalid
*/
public final Iterator<Entry> read(byte[] key, long startTime, long endTime,
int offset, final int limit, byte[]... tags) {
final Iterator<Entry> iterator = read(key, startTime, endTime, tags);
int advance = offset;
while (advance > 0 && iterator.hasNext()) {
iterator.next();
advance--;
}
return new Iterator<Entry>() {
int count = 0;
@Override
public boolean hasNext() {
return count < limit && iterator.hasNext();
}
@Override
public Entry next() {
if (hasNext()) {
count++;
return iterator.next();
}
throw new NoSuchElementException();
}
@Override
public void remove() {
iterator.remove();
}
};
}
/**
   * Reads entries for a given time range and returns an {@code Iterator<Entry>}.
* NOTE: A limit is placed on the max number of time intervals to be scanned during a read, as defined by
* {@link #MAX_ROWS_TO_SCAN_PER_READ}.
*
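   * <p>
   * For example, a sketch of iterating over the returned entries (the time-range variables are illustrative,
   * and <code>process</code> is a hypothetical consumer):
   * <pre>
   * Iterator&lt;Entry&gt; entries = table.read(key, startTime, endTime);
   * while (entries.hasNext()) {
   *   Entry entry = entries.next();
   *   process(entry.getTimestamp(), entry.getValue());
   * }
   * </pre>
   * </p>
   *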
* @param key key of the entries to read
* @param startTime defines start of the time range to read, inclusive
* @param endTime defines end of the time range to read, inclusive
   * @param tags a set of tags that returned entries must contain. Tags are defined at write time; an entry is
   * returned only if it contains all of these tags.
*
* @return an iterator over entries that satisfy provided conditions
*/
public Iterator<Entry> read(byte[] key, long startTime, long endTime, byte[]... tags) {
final Iterator<TimeseriesDataset.Entry> internalIterator = readInternal(key, startTime, endTime, tags);
return new Iterator<Entry>() {
@Override
public boolean hasNext() {
return internalIterator.hasNext();
}
@Override
public Entry next() {
TimeseriesDataset.Entry entry = internalIterator.next();
return new Entry(entry.getKey(), entry.getValue(), entry.getTimestamp(), entry.getTags());
}
@Override
public void remove() {
internalIterator.remove();
}
};
}
/**
   * Defines a split of the Dataset, used when this Dataset serves as input for a MapReduce job.
*/
private static final class InputSplit extends Split {
private byte[] key;
private long startTime;
private long endTime;
private byte[][] tags;
private InputSplit(byte[] key, long startTime, long endTime, byte[][] tags) {
this.key = key;
this.startTime = startTime;
this.endTime = endTime;
this.tags = tags;
}
}
/**
* Defines input selection for batch jobs.
*
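   * <p>
   * For example, a sketch of wiring the splits into a MapReduce job from
   * <code>beforeSubmit(MapReduceContext context)</code>, following the pattern described in the
   * {@link #getSplits()} error message (the dataset name, split count, and time range are illustrative):
   * <pre>
   * TimeseriesTable tsTable = context.getDataset("activity");
   * List&lt;Split&gt; splits = tsTable.getInputSplits(16, key, startTime, endTime);
   * context.setInput(tsTable, splits);
   * </pre>
   * </p>
   *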
* @param splitsCount number of parts to split the data selection into
* @param key key of the entries to read
* @param startTime defines start of the time range to read, inclusive
* @param endTime defines end of the time range to read, inclusive
   * @param tags a set of tags that returned entries must contain. Tags are defined at write time; an entry is
   * returned only if it contains all of these tags.
* @return the list of splits
*/
public List<Split> getInputSplits(int splitsCount, byte[] key, long startTime, long endTime, byte[]... tags) {
long timeIntervalPerSplit = (endTime - startTime) / splitsCount;
// we don't want splits to be empty
timeIntervalPerSplit = timeIntervalPerSplit > 0 ? timeIntervalPerSplit : 1;
List<Split> splits = new ArrayList<>();
long start;
for (start = startTime; start + timeIntervalPerSplit <= endTime; start += timeIntervalPerSplit) {
splits.add(new InputSplit(key, start, start + timeIntervalPerSplit, tags));
}
    // after the loop, start points at the first timestamp not yet covered by a split;
    // if anything remains before endTime, add a final split covering the remainder
    if (start < endTime) {
      splits.add(new InputSplit(key, start, endTime, tags));
    }
return splits;
}
@Override
public List<Split> getSplits() {
    throw new UnsupportedOperationException("Cannot use TimeseriesTable as input for Batch directly. " +
                                              "Use getInputSplits(...) and call " +
                                              "MapReduceContext.setInput(tsTable, splits) in the " +
                                              "beforeSubmit(MapReduceContext context) method of the MapReduce app.");
}
@Override
public SplitReader<byte[], Entry> createSplitReader(final Split split) {
return new TimeseriesTableRecordsReader();
}
/**
   * Writes an entry to the Dataset. This method overrides {@code write(key, value)} in {@link BatchWritable}.
   * The key argument is ignored; the key provided in the <code>Entry</code> object is used instead.
   *
   * @param key row key to write to; this value is ignored
   * @param value entry to write; the key used to write to the table is taken from this object
*/
@Override
public void write(final byte[] key, final Entry value) {
write(value);
}
/**
* A record reader for time series.
*/
public final class TimeseriesTableRecordsReader extends IteratorBasedSplitReader<byte[], Entry> {
@Override
public Iterator<Entry> createIterator(final Split split) {
InputSplit s = (InputSplit) split;
return read(s.key, s.startTime, s.endTime, s.tags);
}
@Override
protected byte[] getKey(Entry entry) {
return entry.getKey();
}
}
/**
* Time series table entry.
*/
public static final class Entry extends TimeseriesDataset.Entry {
/**
     * Creates an instance of a time series entry.
*
* @param key key of the entry
* @param value value to store
* @param timestamp timestamp of the entry
* @param tags optional list of tags associated with the entry. See class description for more details.
*/
public Entry(byte[] key, byte[] value, long timestamp, byte[]... tags) {
super(key, value, timestamp, tags);
}
}
}