/*
 * Copyright © 2014 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.data2.dataset2.lib.timeseries;

import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.dataset.lib.cube.DimensionValue;
import co.cask.cdap.api.dataset.lib.cube.MeasureType;
import co.cask.cdap.api.dataset.lib.cube.Measurement;
import co.cask.cdap.api.dataset.table.Row;
import co.cask.cdap.api.dataset.table.Scanner;
import co.cask.cdap.api.metrics.MetricsCollector;
import co.cask.cdap.common.utils.ImmutablePair;
import co.cask.cdap.data2.dataset2.lib.table.FuzzyRowFilter;
import co.cask.cdap.data2.dataset2.lib.table.MetricsTable;
import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.Closeable;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.Set;
import javax.annotation.Nullable;

/**
 * Table for storing {@link Fact}s.
 *
 * Thread-safe as long as the datasets passed into the constructor are thread-safe (which is usually not the case).
 */
public final class FactTable implements Closeable {
  private static final Logger LOG = LoggerFactory.getLogger(FactTable.class);
  private static final int MAX_ROLL_TIME = 0xfffe;

  // hard limits on some ops to stay on the safe side
  private static final int MAX_RECORDS_TO_SCAN_DURING_SEARCH = 10 * 1000 * 1000;
  private static final int MAX_SCANS_DURING_SEARCH = 10 * 1000;

  private static final Function<byte[], Long> BYTES_TO_LONG = new Function<byte[], Long>() {
    @Override
    public Long apply(byte[] input) {
      return Bytes.toLong(input);
    }
  };

  private static final Function<NavigableMap<byte[], byte[]>, NavigableMap<byte[], Long>>
    TRANSFORM_MAP_BYTE_ARRAY_TO_LONG = new Function<NavigableMap<byte[], byte[]>, NavigableMap<byte[], Long>>() {
      @Override
      public NavigableMap<byte[], Long> apply(NavigableMap<byte[], byte[]> input) {
        return Maps.transformValues(input, BYTES_TO_LONG);
      }
    };

  private final MetricsTable timeSeriesTable;
  private final EntityTable entityTable;
  private final FactCodec codec;
  private final int resolution;
  // todo: should not be used outside of codec
  private final int rollTime;
  private final String putCountMetric;
  private final String incrementCountMetric;

  @Nullable
  private MetricsCollector metrics;

  /**
   * Creates an instance of {@link FactTable}.
   *
   * @param timeSeriesTable A table for storing facts information.
   * @param entityTable The table for storing dimension encoding mappings.
   * @param resolution Resolution in seconds.
   * @param rollTime Number of resolutions to write into a row before rolling over to a new row with a new
   *                 timebase; i.e., the difference between the timebases of two consecutive rows, divided by the
   *                 resolution in seconds. It essentially defines how many columns are in each row of the table.
   *                 This value must be <= 65534 (0xfffe).
   */
  public FactTable(MetricsTable timeSeriesTable,
                   EntityTable entityTable, int resolution, int rollTime) {
    // two bytes for the column name, which is a delta timestamp
    Preconditions.checkArgument(rollTime <= MAX_ROLL_TIME, "Rolltime should be <= " + MAX_ROLL_TIME);

    this.entityTable = entityTable;
    this.timeSeriesTable = timeSeriesTable;
    this.codec = new FactCodec(entityTable, resolution, rollTime);
    this.resolution = resolution;
    this.rollTime = rollTime;
    this.putCountMetric = "factTable." + resolution + ".put.count";
    this.incrementCountMetric = "factTable." + resolution + ".increment.count";
  }
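
  // Illustrative arithmetic (values chosen for this sketch, not taken from the original source): with
  // resolution = 1 and rollTime = 3600, each row covers 3600 seconds. A fact at ts = 7265 lands in the row
  // with timebase 7265 / 3600 * 3600 = 7200, and its column encodes the delta (7265 - 7200) / 1 = 65 as a
  // two-byte short, which is why rollTime is capped at 0xfffe.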

  public void setMetricsCollector(MetricsCollector metrics) {
    this.metrics = metrics;
  }

  public void add(List<Fact> facts) {
    // Simply collect all rows/cols/values that need to be put to the underlying table.
    NavigableMap<byte[], NavigableMap<byte[], byte[]>> gaugesTable = Maps.newTreeMap(Bytes.BYTES_COMPARATOR);
    NavigableMap<byte[], NavigableMap<byte[], byte[]>> incrementsTable = Maps.newTreeMap(Bytes.BYTES_COMPARATOR);
    for (Fact fact : facts) {
      for (Measurement measurement : fact.getMeasurements()) {
        byte[] rowKey = codec.createRowKey(fact.getDimensionValues(), measurement.getName(), fact.getTimestamp());
        byte[] column = codec.createColumn(fact.getTimestamp());

        if (MeasureType.COUNTER == measurement.getType()) {
          inc(incrementsTable, rowKey, column, measurement.getValue());
        } else {
          set(gaugesTable, rowKey, column, Bytes.toBytes(measurement.getValue()));
        }
      }
    }

    NavigableMap<byte[], NavigableMap<byte[], Long>> convertedIncrementsTable =
      Maps.transformValues(incrementsTable, TRANSFORM_MAP_BYTE_ARRAY_TO_LONG);
    NavigableMap<byte[], NavigableMap<byte[], Long>> convertedGaugesTable =
      Maps.transformValues(gaugesTable, TRANSFORM_MAP_BYTE_ARRAY_TO_LONG);

    // todo: replace with a single call, to be able to optimize rpcs in the underlying table
    timeSeriesTable.put(convertedGaugesTable);
    timeSeriesTable.increment(convertedIncrementsTable);
    if (metrics != null) {
      metrics.increment(putCountMetric, convertedGaugesTable.size());
      metrics.increment(incrementCountMetric, convertedIncrementsTable.size());
    }
  }

  public FactScanner scan(FactScan scan) {
    return new FactScanner(getScanner(scan), codec, scan.getStartTs(), scan.getEndTs(), scan.getMeasureNames());
  }

  private Scanner getScanner(FactScan scan) {
    // use null if no measure name or more than one measure name is provided in the scan
    String measureName = scan.getMeasureNames().size() == 1 ? scan.getMeasureNames().iterator().next() : null;

    byte[] startRow = codec.createStartRowKey(scan.getDimensionValues(), measureName, scan.getStartTs(), false);
    byte[] endRow = codec.createEndRowKey(scan.getDimensionValues(), measureName, scan.getEndTs(), false);
    byte[][] columns;
    if (Arrays.equals(startRow, endRow)) {
      // If on the same timebase, we only need a subset of columns
      long timeBase = scan.getStartTs() / rollTime * rollTime;
      int startCol = (int) (scan.getStartTs() - timeBase) / resolution;
      int endCol = (int) (scan.getEndTs() - timeBase) / resolution;
      columns = new byte[endCol - startCol + 1][];
      for (int i = 0; i < columns.length; i++) {
        columns[i] = Bytes.toBytes((short) (startCol + i));
      }
    }
    endRow = Bytes.stopKeyForPrefix(endRow);

    FuzzyRowFilter fuzzyRowFilter = createFuzzyRowFilter(scan, startRow);

    if (LOG.isTraceEnabled()) {
      LOG.trace("Scanning fact table {} with scan: {}; constructed startRow: {}, endRow: {}, fuzzyRowFilter: {}",
                timeSeriesTable, scan, toPrettyLog(startRow), toPrettyLog(endRow), fuzzyRowFilter);
    }

    return timeSeriesTable.scan(startRow, endRow, fuzzyRowFilter);
  }
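
  // A minimal usage sketch for add() and scan(). The FactScan constructor signature matches its use elsewhere
  // in this class; the Fact and Measurement constructors shown are assumptions inferred from the getters used
  // in add(), not verified against their sources:
  //
  //   List<DimensionValue> dims = ImmutableList.of(new DimensionValue("app", "purchase"));
  //   factTable.add(ImmutableList.of(
  //     new Fact(ts, dims, ImmutableList.of(new Measurement("reads", MeasureType.COUNTER, 1L)))));
  //
  //   FactScanner results = factTable.scan(new FactScan(ts - 3600, ts, ImmutableList.of("reads"), dims));
  //   // iterate the returned FactScanner to consume the matching facts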

  /**
   * Delete entries in the fact table.
   *
   * @param scan specifies deletion criteria
   */
  public void delete(FactScan scan) {
    Scanner scanner = getScanner(scan);
    try {
      Row row;
      while ((row = scanner.next()) != null) {
        List<byte[]> columns = Lists.newArrayList();
        boolean exhausted = false;
        for (byte[] column : row.getColumns().keySet()) {
          long ts = codec.getTimestamp(row.getRow(), column);
          if (ts < scan.getStartTs()) {
            continue;
          }
          if (ts > scan.getEndTs()) {
            exhausted = true;
            break;
          }
          columns.add(column);
        }
        // todo: do deletes efficiently, in batches, not one-by-one
        timeSeriesTable.delete(row.getRow(), columns.toArray(new byte[columns.size()][]));
        if (exhausted) {
          break;
        }
      }
    } finally {
      scanner.close();
    }
  }

  /**
   * Searches for the first non-null valued dimension in records that contain the given list of dimensions and
   * match the given dimension values in the given time range. Returned dimension values are those that are not
   * defined in the given dimension values.
   *
   * @param allDimensionNames list of all dimension names to be present in the record
   * @param dimensionSlice dimension values to filter by, {@code null} means any non-null value.
   * @param startTs start of the time range, in seconds
   * @param endTs end of the time range, in seconds
   * @return {@link Set} of {@link DimensionValue}s
   */
  // todo: pass a limit on the number of dimensionValues returned
  // todo: kinda not cool API when we expect null values in a map...
  public Set<DimensionValue> findSingleDimensionValue(List<String> allDimensionNames,
                                                      Map<String, String> dimensionSlice,
                                                      long startTs, long endTs) {
    // Algorithm, briefly:
    // We scan the records that have the given allDimensionNames, using dimensionSlice as the scan criteria.
    // If a record from the scan has non-null values in the dimensions that are not specified in dimensionSlice,
    // we use the first such dimension as a value to return.
    // When we find a value to return, since we only fill a single dimension, we are not interested in drilling
    // down further and instead attempt to fast-forward (jump) to a record that has a different value in that
    // dimension. Thus we find all results.

    List<DimensionValue> allDimensions = Lists.newArrayList();
    List<DimensionValue> filledDimension = Lists.newArrayList();
    List<Integer> dimToFillIndexes = Lists.newArrayList();
    for (int i = 0; i < allDimensionNames.size(); i++) {
      String dimensionName = allDimensionNames.get(i);
      if (!dimensionSlice.containsKey(dimensionName)) {
        dimToFillIndexes.add(i);
        allDimensions.add(new DimensionValue(dimensionName, null));
      } else {
        DimensionValue dimensionValue = new DimensionValue(dimensionName, dimensionSlice.get(dimensionName));
        filledDimension.add(dimensionValue);
        allDimensions.add(dimensionValue);
      }
    }

    // If the provided dimensions have all values filled in, there's nothing to look for
    if (dimToFillIndexes.isEmpty()) {
      return Collections.emptySet();
    }

    Set<DimensionValue> result = Sets.newHashSet();
    int scans = 0;
    int scannedRecords = 0;

    // build a scan
    byte[] startRow = codec.createStartRowKey(allDimensions, null, startTs, false);
    byte[] endRow = codec.createEndRowKey(allDimensions, null, endTs, false);
    endRow = Bytes.stopKeyForPrefix(endRow);
    FuzzyRowFilter fuzzyRowFilter =
      createFuzzyRowFilter(new FactScan(startTs, endTs, ImmutableList.<String>of(), allDimensions), startRow);
    Scanner scanner = timeSeriesTable.scan(startRow, endRow, fuzzyRowFilter);
    scans++;
    try {
      Row rowResult;
      while ((rowResult = scanner.next()) != null) {
        scannedRecords++;
        // todo: make configurable
        if (scannedRecords > MAX_RECORDS_TO_SCAN_DURING_SEARCH) {
          break;
        }
        byte[] rowKey = rowResult.getRow();
        // filter out columns by time range (scan configuration only filters whole rows)
        if (codec.getTimestamp(rowKey, codec.createColumn(startTs)) < startTs) {
          continue;
        }
        if (codec.getTimestamp(rowKey, codec.createColumn(endTs)) > endTs) {
          // we're done with the scanner
          break;
        }

        List<DimensionValue> dimensionValues = codec.getDimensionValues(rowResult.getRow());
        // At this point, we know that the record is in the right time range and its dimensions match the given
        // ones. We try to find the first non-null valued dimension in the record that was not in the given
        // dimensions: we use it to form the next drill-down suggestion.
        int filledIndex = -1;
        for (int index : dimToFillIndexes) {
          // todo: may be inefficient if dimensionValues is not an array-backed list, i.e. if access by index
          //       is not fast
          DimensionValue dimensionValue = dimensionValues.get(index);
          if (dimensionValue.getValue() != null) {
            result.add(dimensionValue);
            filledIndex = index;
            break;
          }
        }

        // As soon as we find a dimension to fill, we are not interested in drilling down further (by contract,
        // we fill a single dimension value). Thus, we try to jump to the record that has a greater value in
        // that dimension.
        // todo: fast-forwarding (jumping) should be done on the server side (CDAP-1421)
        if (filledIndex >= 0) {
          scanner.close();
          scanner = null;
          scans++;
          if (scans > MAX_SCANS_DURING_SEARCH) {
            break;
          }
          startRow = codec.getNextRowKey(rowResult.getRow(), filledIndex);
          scanner = timeSeriesTable.scan(startRow, endRow, fuzzyRowFilter);
        }
      }
    } finally {
      if (scanner != null) {
        scanner.close();
      }
    }

    LOG.trace("search for dimensions completed, scans performed: {}, scanned records: {}", scans, scannedRecords);

    return result;
  }
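
  // A sketch of a drill-down call (dimension names and values are hypothetical). With "app" fixed and "flow"
  // and "run" absent from the slice, the method suggests values for the first unfilled dimension, "flow":
  //
  //   Map<String, String> slice = Maps.newHashMap();
  //   slice.put("app", "purchase");
  //   Set<DimensionValue> suggestions =
  //     factTable.findSingleDimensionValue(ImmutableList.of("app", "flow", "run"), slice, startTs, endTs);
  //   // e.g. {flow=PurchaseFlow, flow=AnalyticsFlow}: one DimensionValue per distinct "flow" value found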

  /**
   * Finds all measure names of the facts that match the given {@link DimensionValue}s and time range.
   *
   * @param allDimensionNames list of all dimension names to be present in the fact record
   * @param dimensionSlice dimension values to filter by, {@code null} means any non-null value.
   * @param startTs start timestamp, in seconds
   * @param endTs end timestamp, in seconds
   * @return {@link Set} of measure names
   */
  // todo: pass a limit on the number of measures returned
  public Set<String> findMeasureNames(List<String> allDimensionNames, Map<String, String> dimensionSlice,
                                      long startTs, long endTs) {

    List<DimensionValue> allDimensions = Lists.newArrayList();
    for (String dimensionName : allDimensionNames) {
      allDimensions.add(new DimensionValue(dimensionName, dimensionSlice.get(dimensionName)));
    }

    byte[] startRow = codec.createStartRowKey(allDimensions, null, startTs, false);
    byte[] endRow = codec.createEndRowKey(allDimensions, null, endTs, false);
    endRow = Bytes.stopKeyForPrefix(endRow);
    FuzzyRowFilter fuzzyRowFilter =
      createFuzzyRowFilter(new FactScan(startTs, endTs, ImmutableList.<String>of(), allDimensions), startRow);

    Set<String> measureNames = Sets.newHashSet();
    int scannedRecords = 0;
    // todo: make configurable
    Scanner scanner = timeSeriesTable.scan(startRow, endRow, fuzzyRowFilter);
    try {
      Row rowResult;
      while ((rowResult = scanner.next()) != null) {
        scannedRecords++;
        if (scannedRecords > MAX_RECORDS_TO_SCAN_DURING_SEARCH) {
          break;
        }
        byte[] rowKey = rowResult.getRow();
        // filter out columns by time range (scan configuration only filters whole rows)
        if (codec.getTimestamp(rowKey, codec.createColumn(startTs)) < startTs) {
          continue;
        }
        if (codec.getTimestamp(rowKey, codec.createColumn(endTs)) > endTs) {
          // we're done with the scanner
          break;
        }
        measureNames.add(codec.getMeasureName(rowResult.getRow()));
      }
    } finally {
      scanner.close();
    }

    LOG.trace("search for metrics completed, scanned records: {}", scannedRecords);

    return measureNames;
  }
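
  // A sketch of listing measures for a slice (names are hypothetical). Dimensions absent from the map, or
  // mapped to null, match any non-null value:
  //
  //   Set<String> measures = factTable.findMeasureNames(
  //     ImmutableList.of("app", "flow"), ImmutableMap.of("app", "purchase"), startTs, endTs);
  //   // e.g. {"reads", "writes"}: every measure recorded under app=purchase in the time range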

  @Override
  public void close() throws IOException {
    timeSeriesTable.close();
    entityTable.close();
  }

  public static byte[][] getSplits(int aggGroupsCount) {
    return FactCodec.getSplits(aggGroupsCount);
  }

  @Nullable
  private FuzzyRowFilter createFuzzyRowFilter(FactScan scan, byte[] startRow) {
    // We always need to use a fuzzy row filter, as it is the only one that does the matching of values.
    // If we are querying only one measure, we use its fixed name for the filter; if there are no measures or
    // more than one measure to query, we use an "any" fuzzy filter.
    String measureName = (scan.getMeasureNames().size() == 1) ? scan.getMeasureNames().iterator().next() : null;
    byte[] fuzzyRowMask = codec.createFuzzyRowMask(scan.getDimensionValues(), measureName);
    // note: we can use startRow, as it will contain all "fixed" parts of the key needed
    return new FuzzyRowFilter(ImmutableList.of(new ImmutablePair<>(startRow, fuzzyRowMask)));
  }
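
  // Sketch of the intended matching, assuming the mask convention of HBase's FuzzyRowFilter (0 = byte must
  // match the startRow byte, 1 = any byte passes; row key layout simplified for illustration): scanning a
  // slice like {app=purchase, run=<any>} fixes the encoded bytes of "purchase" in the mask while leaving the
  // encoded "run" bytes fuzzy, so rows for every run match a single scan.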

  // todo: shouldn't we aggregate "before" writing to FactTable? We could do it really efficiently outside;
  //       also, the underlying datasets will do the aggregation in memory anyway
  private static void inc(NavigableMap<byte[], NavigableMap<byte[], byte[]>> incrementsTable,
                          byte[] rowKey, byte[] column, long value) {
    byte[] oldValue = get(incrementsTable, rowKey, column);
    long newValue = value;
    if (oldValue != null) {
      if (Bytes.SIZEOF_LONG == oldValue.length) {
        newValue = Bytes.toLong(oldValue) + value;
      } else if (Bytes.SIZEOF_INT == oldValue.length) {
        // In 2.4 and older versions we stored it as an int
        newValue = Bytes.toInt(oldValue) + value;
      } else {
        // should NEVER happen, unless the table is screwed up manually
        throw new IllegalStateException(
          String.format("Could not parse measure @row %s @column %s value %s as int or long",
                        Bytes.toStringBinary(rowKey), Bytes.toStringBinary(column),
                        Bytes.toStringBinary(oldValue)));
      }
    }
    set(incrementsTable, rowKey, column, Bytes.toBytes(newValue));
  }

  private static byte[] get(NavigableMap<byte[], NavigableMap<byte[], byte[]>> table, byte[] row, byte[] column) {
    NavigableMap<byte[], byte[]> rowMap = table.get(row);
    return rowMap == null ? null : rowMap.get(column);
  }

  private static void set(NavigableMap<byte[], NavigableMap<byte[], byte[]>> table,
                          byte[] row, byte[] column, byte[] value) {
    NavigableMap<byte[], byte[]> rowMap = table.get(row);
    if (rowMap == null) {
      rowMap = Maps.newTreeMap(Bytes.BYTES_COMPARATOR);
      table.put(row, rowMap);
    }

    rowMap.put(column, value);
  }

  private String toPrettyLog(byte[] key) {
    StringBuilder sb = new StringBuilder("{");
    for (byte b : key) {
      // pad each byte's decimal value to a fixed width of 5 characters; the padding must be at least
      // 4 spaces, otherwise substring(0, 5) would throw for single-digit values
      String enc = String.valueOf((int) b) + "    ";
      sb.append(enc.substring(0, 5));
    }
    sb.append("}");
    return sb.toString();
  }
}