/*
* Copyright © 2015 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.data2.metadata.lineage;
import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.dataset.lib.AbstractDataset;
import co.cask.cdap.api.dataset.table.Row;
import co.cask.cdap.api.dataset.table.Scanner;
import co.cask.cdap.api.dataset.table.Table;
import co.cask.cdap.common.app.RunIds;
import co.cask.cdap.data2.dataset2.lib.table.MDSKey;
import co.cask.cdap.proto.Id;
import co.cask.cdap.proto.ProgramType;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Predicate;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import org.apache.twill.api.RunId;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import javax.annotation.Nullable;
/**
* Dataset to store/retrieve Dataset accesses of a Program.
*/
public class LineageDataset extends AbstractDataset {
// Storage format for row keys
// ---------------------------
//
// <id.program> is namespace, application, program type category and program name.
// <component> is either the flowlet marker 'f' followed by the flowlet id, or the none marker '0'.
//
// Dataset access from program:
// ---------------------------------------------------------------------------------------------------------
// | d | <id.dataset> | <inverted-start-time> | p | <id.program> | <run-id> | <access-type> | <component> |
// ---------------------------------------------------------------------------------------------------------
// | p | <id.program> | <inverted-start-time> | d | <id.dataset> | <run-id> | <access-type> | <component> |
// ---------------------------------------------------------------------------------------------------------
//
// Stream access from program:
// ---------------------------------------------------------------------------------------------------------
// | s | <id.stream> | <inverted-start-time> | p | <id.program> | <run-id> | <access-type> | <component> |
// ---------------------------------------------------------------------------------------------------------
// | p | <id.program> | <inverted-start-time> | s | <id.stream> | <run-id> | <access-type> | <component> |
// ---------------------------------------------------------------------------------------------------------
private static final Logger LOG = LoggerFactory.getLogger(LineageDataset.class);

// Column used to store the access time (a long, in milliseconds) of every access row.
private static final byte[] ACCESS_TIME_COLS_BYTE = {'t'};

// Single-character markers written into row keys to tag the kind of id that follows.
private static final char DATASET_MARKER = 'd';
private static final char PROGRAM_MARKER = 'p';
private static final char FLOWLET_MARKER = 'f';
private static final char STREAM_MARKER = 's';
// Written in place of a component when the access is not attributed to a flowlet.
private static final char NONE_MARKER = '0';

// Table holding all access rows; assigned once in the constructor and never replaced.
private final Table accessRegistryTable;
/**
 * Creates a lineage dataset on top of the given table.
 *
 * @param instanceName name of this dataset instance, passed on to {@link AbstractDataset}
 * @param accessRegistryTable table in which access rows are written and scanned
 */
public LineageDataset(String instanceName, Table accessRegistryTable) {
  super(instanceName, accessRegistryTable);
  this.accessRegistryTable = accessRegistryTable;
}
/**
 * Records that a program run accessed a dataset, without attributing the access to any
 * program component.
 *
 * @param run program run information
 * @param datasetInstance dataset accessed by the program
 * @param accessType access type
 * @param accessTimeMillis time of access
 */
public void addAccess(Id.Run run, Id.DatasetInstance datasetInstance,
                      AccessType accessType, long accessTimeMillis) {
  // No component: the five-argument overload records this with the "none" marker.
  final Id.NamespacedId noComponent = null;
  addAccess(run, datasetInstance, accessType, accessTimeMillis, noComponent);
}
/**
 * Records that a program run accessed a dataset.
 *
 * <p>Two rows are written with the access time as value: one keyed by the dataset and one keyed
 * by the program, so the access can be found from either side.
 *
 * @param run program run information
 * @param datasetInstance dataset accessed by the program
 * @param accessType access type
 * @param accessTimeMillis time of access
 * @param component program component such as flowlet id, etc.
 */
public void addAccess(Id.Run run, Id.DatasetInstance datasetInstance,
                      AccessType accessType, long accessTimeMillis, @Nullable Id.NamespacedId component) {
  LOG.trace("Recording access run={}, dataset={}, accessType={}, accessTime={}, component={}",
            run, datasetInstance, accessType, accessTimeMillis, component);

  // Row keyed by the dataset.
  byte[] datasetRowKey = getDatasetKey(datasetInstance, run, accessType, component);
  accessRegistryTable.put(datasetRowKey, ACCESS_TIME_COLS_BYTE, Bytes.toBytes(accessTimeMillis));

  // Row keyed by the program.
  byte[] programRowKey = getProgramKey(run, datasetInstance, accessType, component);
  accessRegistryTable.put(programRowKey, ACCESS_TIME_COLS_BYTE, Bytes.toBytes(accessTimeMillis));
}
/**
 * Records that a program run accessed a stream, without attributing the access to any
 * program component.
 *
 * @param run program run information
 * @param stream stream accessed by the program
 * @param accessType access type
 * @param accessTimeMillis time of access
 */
public void addAccess(Id.Run run, Id.Stream stream,
                      AccessType accessType, long accessTimeMillis) {
  // No component: the five-argument overload records this with the "none" marker.
  final Id.NamespacedId noComponent = null;
  addAccess(run, stream, accessType, accessTimeMillis, noComponent);
}
/**
 * Records that a program run accessed a stream.
 *
 * <p>Two rows are written with the access time as value: one keyed by the stream and one keyed
 * by the program, so the access can be found from either side.
 *
 * @param run program run information
 * @param stream stream accessed by the program
 * @param accessType access type
 * @param accessTimeMillis time of access
 * @param component program component such as flowlet id, etc.
 */
public void addAccess(Id.Run run, Id.Stream stream,
                      AccessType accessType, long accessTimeMillis, @Nullable Id.NamespacedId component) {
  LOG.trace("Recording access run={}, stream={}, accessType={}, accessTime={}, component={}",
            run, stream, accessType, accessTimeMillis, component);

  // Row keyed by the stream.
  byte[] streamRowKey = getStreamKey(stream, run, accessType, component);
  accessRegistryTable.put(streamRowKey, ACCESS_TIME_COLS_BYTE, Bytes.toBytes(accessTimeMillis));

  // Row keyed by the program.
  byte[] programRowKey = getProgramKey(run, stream, accessType, component);
  accessRegistryTable.put(programRowKey, ACCESS_TIME_COLS_BYTE, Bytes.toBytes(accessTimeMillis));
}
/**
 * Collects the entities recorded for a single program run.
 *
 * <p>Scans every row whose key starts with the run's program and inverted start time, and keeps
 * only the rows whose run id equals the id of the requested run.
 *
 * @param run program run to look up
 * @return a set of entities (program and data it accesses) associated with a program run.
 */
public Set<Id.NamespacedId> getEntitiesForRun(Id.Run run) {
  byte[] prefix = getRunScanStartKey(run);
  ImmutableSet.Builder<Id.NamespacedId> entities = ImmutableSet.builder();
  Scanner scanner = accessRegistryTable.scan(prefix, Bytes.stopKeyForPrefix(prefix));
  try {
    for (Row row = scanner.next(); row != null; row = scanner.next()) {
      if (LOG.isTraceEnabled()) {
        LOG.trace("Got row key = {}", Bytes.toString(row.getRow()));
      }
      RowKey parsed = parseRow(row);
      if (!run.getId().equals(parsed.getRunId().getId())) {
        // Same program and start time, but the row belongs to a different run.
        continue;
      }
      entities.add(parsed.getProgram());
      entities.add(parsed.getData());
    }
  } finally {
    scanner.close();
  }
  return entities.build();
}
/**
 * Fetch program-dataset access information for a dataset for a given period.
 *
 * <p>The period is matched against the run start time encoded in the row keys. Because that
 * time is stored inverted, the scan starts at the key derived from {@code end} and stops at the
 * key derived from {@code start}.
 *
 * @param datasetInstance dataset for which to fetch access information
 * @param start start time period
 * @param end end time period
 * @param filter filter to be applied on result set
 * @return program-dataset access information
 */
public Set<Relation> getRelations(Id.DatasetInstance datasetInstance, long start, long end,
                                  Predicate<Relation> filter) {
  byte[] scanFrom = getDatasetScanStartKey(datasetInstance, end);
  byte[] scanTo = getDatasetScanEndKey(datasetInstance, start);
  return scanRelations(scanFrom, scanTo, filter);
}
/**
 * Fetch program-stream access information for a stream for a given period.
 *
 * <p>The period is matched against the run start time encoded in the row keys. Because that
 * time is stored inverted, the scan starts at the key derived from {@code end} and stops at the
 * key derived from {@code start}.
 *
 * @param stream stream for which to fetch access information
 * @param start start time period
 * @param end end time period
 * @param filter filter to be applied on result set
 * @return program-stream access information
 */
public Set<Relation> getRelations(Id.Stream stream, long start, long end, Predicate<Relation> filter) {
  byte[] scanFrom = getStreamScanStartKey(stream, end);
  byte[] scanTo = getStreamScanEndKey(stream, start);
  return scanRelations(scanFrom, scanTo, filter);
}
/**
 * Fetch the data access information of a program for a given period.
 *
 * <p>The period is matched against the run start time encoded in the row keys. Because that
 * time is stored inverted, the scan starts at the key derived from {@code end} and stops at the
 * key derived from {@code start}.
 *
 * @param program program for which to fetch access information
 * @param start start time period
 * @param end end time period
 * @param filter filter to be applied on result set
 * @return access information of the program
 */
public Set<Relation> getRelations(Id.Program program, long start, long end, Predicate<Relation> filter) {
  byte[] scanFrom = getProgramScanStartKey(program, end);
  byte[] scanTo = getProgramScanEndKey(program, start);
  return scanRelations(scanFrom, scanTo, filter);
}
/**
 * Collects the access times stored for a single program run.
 *
 * <p>Scans every row whose key starts with the run's program and inverted start time, and reads
 * the access-time column of the rows whose run id equals the id of the requested run.
 *
 * @param run program run to look up
 * @return a list of access times (for program and data it accesses) associated with a program run.
 */
@VisibleForTesting
public List<Long> getAccessTimesForRun(Id.Run run) {
  byte[] prefix = getRunScanStartKey(run);
  ImmutableList.Builder<Long> accessTimes = ImmutableList.builder();
  Scanner scanner = accessRegistryTable.scan(prefix, Bytes.stopKeyForPrefix(prefix));
  try {
    for (Row row = scanner.next(); row != null; row = scanner.next()) {
      if (LOG.isTraceEnabled()) {
        LOG.trace("Got row key = {}", Bytes.toString(row.getRow()));
      }
      RowKey parsed = parseRow(row);
      if (!run.getId().equals(parsed.getRunId().getId())) {
        // Same program and start time, but the row belongs to a different run.
        continue;
      }
      accessTimes.add(Bytes.toLong(row.get(ACCESS_TIME_COLS_BYTE)));
    }
  } finally {
    scanner.close();
  }
  return accessTimes.build();
}
/**
 * Scans the table between two keys and converts every row into a {@link Relation}.
 *
 * @param startKey key at which the scan starts
 * @param endKey key at which the scan stops
 * @param filter only relations accepted by this predicate are returned
 * @return the accepted relations
 */
private Set<Relation> scanRelations(byte[] startKey, byte[] endKey, Predicate<Relation> filter) {
  ImmutableSet.Builder<Relation> accepted = ImmutableSet.builder();
  Scanner scanner = accessRegistryTable.scan(startKey, endKey);
  try {
    for (Row row = scanner.next(); row != null; row = scanner.next()) {
      if (LOG.isTraceEnabled()) {
        LOG.trace("Got row key = {}", Bytes.toString(row.getRow()));
      }
      Relation candidate = toRelation(row);
      if (!filter.apply(candidate)) {
        continue;
      }
      accepted.add(candidate);
    }
  } finally {
    scanner.close();
  }
  return accepted.build();
}
/**
 * Builds the dataset-keyed row key for an access: the dataset id followed by the run,
 * access type and component parts added by {@code addDataKey}.
 *
 * @param datasetInstance dataset that was accessed
 * @param run program run that accessed it
 * @param accessType access type
 * @param component program component, may be null
 * @return the row key bytes
 */
private byte[] getDatasetKey(Id.DatasetInstance datasetInstance, Id.Run run,
                             AccessType accessType, @Nullable Id.NamespacedId component) {
  MDSKey.Builder key = new MDSKey.Builder();
  // Data part first, then the part shared with stream-keyed rows.
  addDataset(key, datasetInstance);
  addDataKey(key, run, accessType, component);
  return key.build().getKey();
}
/**
 * Builds the stream-keyed row key for an access: the stream id followed by the run,
 * access type and component parts added by {@code addDataKey}.
 *
 * @param stream stream that was accessed
 * @param run program run that accessed it
 * @param accessType access type
 * @param component program component, may be null
 * @return the row key bytes
 */
private byte[] getStreamKey(Id.Stream stream, Id.Run run,
                            AccessType accessType, @Nullable Id.NamespacedId component) {
  MDSKey.Builder key = new MDSKey.Builder();
  // Data part first, then the part shared with dataset-keyed rows.
  addStream(key, stream);
  addDataKey(key, run, accessType, component);
  return key.build().getKey();
}
/**
 * Appends the parts of a data-keyed row key that follow the dataset or stream id: inverted run
 * start time, program, run id, access type and component.
 *
 * @param builder key builder that already holds the dataset or stream id
 * @param run program run that performed the access
 * @param accessType access type
 * @param component program component, may be null
 */
private void addDataKey(MDSKey.Builder builder, Id.Run run,
                        AccessType accessType, @Nullable Id.NamespacedId component) {
  builder.add(getInvertedStartTime(run));
  addProgram(builder, run.getProgram());
  builder.add(run.getId());
  builder.add(accessType.getType());
  addComponent(builder, component);
}
/**
 * Builds the program-keyed row key for a dataset access: program, inverted run start time,
 * dataset, run id, access type and component.
 *
 * @param run program run that performed the access
 * @param datasetInstance dataset that was accessed
 * @param accessType access type
 * @param component program component, may be null
 * @return the row key bytes
 */
private byte[] getProgramKey(Id.Run run, Id.DatasetInstance datasetInstance,
                             AccessType accessType, @Nullable Id.NamespacedId component) {
  // Resolve the start time from the run id before any key part is written.
  final long invertedStart = getInvertedStartTime(run);

  MDSKey.Builder key = new MDSKey.Builder();
  addProgram(key, run.getProgram());
  key.add(invertedStart);
  addDataset(key, datasetInstance);
  key.add(run.getId());
  key.add(accessType.getType());
  addComponent(key, component);
  return key.build().getKey();
}
/**
 * Builds the program-keyed row key for a stream access: program, inverted run start time,
 * stream, run id, access type and component.
 *
 * @param run program run that performed the access
 * @param stream stream that was accessed
 * @param accessType access type
 * @param component program component, may be null
 * @return the row key bytes
 */
private byte[] getProgramKey(Id.Run run, Id.Stream stream,
                             AccessType accessType, @Nullable Id.NamespacedId component) {
  // Resolve the start time from the run id before any key part is written.
  final long invertedStart = getInvertedStartTime(run);

  MDSKey.Builder key = new MDSKey.Builder();
  addProgram(key, run.getProgram());
  key.add(invertedStart);
  addStream(key, stream);
  key.add(run.getId());
  key.add(accessType.getType());
  addComponent(key, component);
  return key.build().getKey();
}
/**
 * Parses the program, the data entity and the run id out of a row key.
 *
 * <p>Handles both layouts: program-keyed rows (program first, data second) and data-keyed rows
 * (dataset or stream first, program second). Access type and component are not read.
 *
 * @param row row whose key is parsed
 * @return the parsed program, data entity and run id
 * @throws IllegalStateException if the key starts with an unknown marker
 */
private RowKey parseRow(Row row) {
  MDSKey.Splitter splitter = new MDSKey(row.getRow()).split();
  char marker = (char) splitter.getInt();
  LOG.trace("Got marker {}", marker);

  if (marker == PROGRAM_MARKER) {
    // Program-keyed row: program, inverted start time, data, run id.
    Id.Program program = (Id.Program) toId(splitter, marker);
    splitter.skipLong(); // inverted start time
    Id.NamespacedId data = toId(splitter, (char) splitter.getInt());
    RunId runId = RunIds.fromString(splitter.getString());
    return new RowKey(program, data, runId);
  }

  if (marker == DATASET_MARKER || marker == STREAM_MARKER) {
    // Data-keyed row: data, inverted start time, program, run id.
    Id.NamespacedId data = toId(splitter, marker);
    splitter.skipLong(); // inverted start time
    Id.Program program = (Id.Program) toId(splitter, (char) splitter.getInt());
    RunId runId = RunIds.fromString(splitter.getString());
    return new RowKey(program, data, runId);
  }

  throw new IllegalStateException("Invalid row with marker " + marker);
}
/**
 * Builds a scan boundary key for a dataset: the dataset id followed by the inverted time.
 *
 * @param datasetInstance dataset whose rows are scanned
 * @param time time (not inverted) that marks the boundary
 * @return the key bytes
 */
private byte[] getDatasetScanKey(Id.DatasetInstance datasetInstance, long time) {
  MDSKey.Builder key = new MDSKey.Builder();
  addDataset(key, datasetInstance);
  key.add(invertTime(time));
  return key.build().getKey();
}
/**
 * Builds the key at which a dataset scan starts, derived from the end of the time range.
 *
 * @param datasetInstance dataset whose rows are scanned
 * @param end end of the time range, exclusive
 * @return the scan start key bytes
 */
private byte[] getDatasetScanStartKey(Id.DatasetInstance datasetInstance, long end) {
  // Time is stored inverted (Long.MAX_VALUE - time), so later times sort first and the end of
  // the range belongs in the start key. The end time is exclusive, so the latest time that may
  // match is end - 1; its inverted value is one more than that of end, which is where the scan
  // has to begin (assuming the table scan's start key is inclusive).
  // Using end + 1 here would also return rows at end and end + 1, and would overflow when
  // end is Long.MAX_VALUE.
  return getDatasetScanKey(datasetInstance, end - 1);
}
/**
 * Builds the key at which a dataset scan stops, derived from the start of the time range.
 *
 * @param datasetInstance dataset whose rows are scanned
 * @param start start of the time range
 * @return the scan end key bytes
 */
private byte[] getDatasetScanEndKey(Id.DatasetInstance datasetInstance, long start) {
  // Inverted time puts the start of the range into the end key. The start time is inclusive,
  // so step back by one to turn it into an exclusive boundary.
  final long boundary = start - 1;
  return getDatasetScanKey(datasetInstance, boundary);
}
/**
 * Builds a scan boundary key for a stream: the stream id followed by the inverted time.
 *
 * @param stream stream whose rows are scanned
 * @param time time (not inverted) that marks the boundary
 * @return the key bytes
 */
private byte[] getStreamScanKey(Id.Stream stream, long time) {
  MDSKey.Builder key = new MDSKey.Builder();
  addStream(key, stream);
  key.add(invertTime(time));
  return key.build().getKey();
}
/**
 * Builds the key at which a stream scan starts, derived from the end of the time range.
 *
 * @param stream stream whose rows are scanned
 * @param end end of the time range, exclusive
 * @return the scan start key bytes
 */
private byte[] getStreamScanStartKey(Id.Stream stream, long end) {
  // Time is stored inverted (Long.MAX_VALUE - time), so later times sort first and the end of
  // the range belongs in the start key. The end time is exclusive, so the latest time that may
  // match is end - 1; its inverted value is one more than that of end, which is where the scan
  // has to begin (assuming the table scan's start key is inclusive).
  // Using end + 1 here would also return rows at end and end + 1, and would overflow when
  // end is Long.MAX_VALUE.
  return getStreamScanKey(stream, end - 1);
}
/**
 * Builds the key at which a stream scan stops, derived from the start of the time range.
 *
 * @param stream stream whose rows are scanned
 * @param start start of the time range
 * @return the scan end key bytes
 */
private byte[] getStreamScanEndKey(Id.Stream stream, long start) {
  // Inverted time puts the start of the range into the end key. The start time is inclusive,
  // so step back by one to turn it into an exclusive boundary.
  final long boundary = start - 1;
  return getStreamScanKey(stream, boundary);
}
/**
 * Builds a scan boundary key for a program: the program id followed by the inverted time.
 *
 * @param program program whose rows are scanned
 * @param time time (not inverted) that marks the boundary
 * @return the key bytes
 */
private byte[] getProgramScanKey(Id.Program program, long time) {
  MDSKey.Builder key = new MDSKey.Builder();
  addProgram(key, program);
  key.add(invertTime(time));
  return key.build().getKey();
}
/**
 * Builds the key at which a program scan starts, derived from the end of the time range.
 *
 * @param program program whose rows are scanned
 * @param end end of the time range, exclusive
 * @return the scan start key bytes
 */
private byte[] getProgramScanStartKey(Id.Program program, long end) {
  // Time is stored inverted (Long.MAX_VALUE - time), so later times sort first and the end of
  // the range belongs in the start key. The end time is exclusive, so the latest time that may
  // match is end - 1; its inverted value is one more than that of end, which is where the scan
  // has to begin (assuming the table scan's start key is inclusive).
  // Using end + 1 here would also return rows at end and end + 1, and would overflow when
  // end is Long.MAX_VALUE.
  return getProgramScanKey(program, end - 1);
}
/**
 * Builds the key at which a program scan stops, derived from the start of the time range.
 *
 * @param program program whose rows are scanned
 * @param start start of the time range
 * @return the scan end key bytes
 */
private byte[] getProgramScanEndKey(Id.Program program, long start) {
  // Inverted time puts the start of the range into the end key. The start time is inclusive,
  // so step back by one to turn it into an exclusive boundary.
  final long boundary = start - 1;
  return getProgramScanKey(program, boundary);
}
/**
 * Builds the key prefix shared by all program-keyed rows of a run: the run's program followed
 * by the run's inverted start time.
 *
 * @param run program run to build the prefix for
 * @return the prefix bytes
 */
private byte[] getRunScanStartKey(Id.Run run) {
  MDSKey.Builder prefix = new MDSKey.Builder();
  addProgram(prefix, run.getProgram());
  long invertedStart = getInvertedStartTime(run);
  prefix.add(invertedStart);
  return prefix.build().getKey();
}
/**
 * Appends a dataset to a key: the dataset marker, then the namespace id and the dataset id.
 *
 * @param keyBuilder key builder to append to
 * @param datasetInstance dataset to append
 */
private void addDataset(MDSKey.Builder keyBuilder, Id.DatasetInstance datasetInstance) {
  keyBuilder.add(DATASET_MARKER);
  keyBuilder.add(datasetInstance.getNamespaceId());
  keyBuilder.add(datasetInstance.getId());
}
/**
 * Appends a stream to a key: the stream marker, then the namespace id and the stream id.
 *
 * @param keyBuilder key builder to append to
 * @param stream stream to append
 */
private void addStream(MDSKey.Builder keyBuilder, Id.Stream stream) {
  keyBuilder.add(STREAM_MARKER);
  keyBuilder.add(stream.getNamespaceId());
  keyBuilder.add(stream.getId());
}
/**
 * Appends a program to a key: the program marker, then namespace id, application id,
 * program type category name and program id.
 *
 * @param keyBuilder key builder to append to
 * @param program program to append
 */
private void addProgram(MDSKey.Builder keyBuilder, Id.Program program) {
  keyBuilder.add(PROGRAM_MARKER);
  keyBuilder.add(program.getNamespaceId());
  keyBuilder.add(program.getApplicationId());
  keyBuilder.add(program.getType().getCategoryName());
  keyBuilder.add(program.getId());
}
/**
 * Appends the component part of a key. A flowlet is written as the flowlet marker followed by
 * its id; anything else, including null, is written as the none marker only.
 *
 * @param keyBuilder key builder to append to
 * @param component program component, may be null
 */
private void addComponent(MDSKey.Builder keyBuilder, Id component) {
  if (!(component instanceof Id.Flow.Flowlet)) {
    // Also taken for null, since instanceof is false for null.
    keyBuilder.add(NONE_MARKER);
    return;
  }
  keyBuilder.add(FLOWLET_MARKER);
  keyBuilder.add(component.getId());
}
/**
 * Reads the id that follows a marker from the splitter.
 *
 * @param splitter splitter positioned right after the marker
 * @param marker marker that tells which kind of id follows
 * @return the dataset, stream or program id that was read
 * @throws IllegalStateException if the marker is not a dataset, stream or program marker
 */
private Id.NamespacedId toId(MDSKey.Splitter splitter, char marker) {
  if (marker == DATASET_MARKER) {
    String namespace = splitter.getString();
    String dataset = splitter.getString();
    return Id.DatasetInstance.from(namespace, dataset);
  }
  if (marker == STREAM_MARKER) {
    String namespace = splitter.getString();
    String stream = splitter.getString();
    return Id.Stream.from(namespace, stream);
  }
  if (marker == PROGRAM_MARKER) {
    // Parts are read in the order they were written: namespace, application, type, program.
    String namespace = splitter.getString();
    String application = splitter.getString();
    ProgramType type = ProgramType.valueOfCategoryName(splitter.getString());
    String program = splitter.getString();
    return Id.Program.from(namespace, application, type, program);
  }
  throw new IllegalStateException("Invalid row with marker " + marker);
}
/**
 * Reads the component part of a key from the splitter.
 *
 * @param splitter splitter positioned at the component marker
 * @param program program the component belongs to; used to build the flowlet id
 * @return the flowlet id, or null if the none marker was stored
 * @throws IllegalStateException if the component marker is unknown
 */
private Id.NamespacedId toComponent(MDSKey.Splitter splitter, Id.Program program) {
  char marker = (char) splitter.getInt();
  if (marker == NONE_MARKER) {
    return null;
  }
  if (marker == FLOWLET_MARKER) {
    return Id.Flow.Flowlet.from(program.getApplication(), program.getId(), splitter.getString());
  }
  throw new IllegalStateException("Invalid row with component marker " + marker);
}
/**
 * Inverts a time by subtracting it from {@link Long#MAX_VALUE}, so that larger times yield
 * smaller values.
 *
 * @param time time to invert
 * @return {@code Long.MAX_VALUE - time}
 */
private long invertTime(long time) {
  final long inverted = Long.MAX_VALUE - time;
  return inverted;
}
/**
 * Computes the inverted start time of a run. The time is taken from the run id, in
 * milliseconds.
 *
 * @param run program run
 * @return the inverted time obtained from the run id
 */
private long getInvertedStartTime(Id.Run run) {
  RunId runId = RunIds.fromString(run.getId());
  long startMillis = RunIds.getTime(runId, TimeUnit.MILLISECONDS);
  return invertTime(startMillis);
}
/**
 * Converts a row into a {@link Relation} by parsing its row key.
 *
 * <p>The key holds two ids (a program and either a dataset or a stream, in either order)
 * separated by the inverted start time, followed by run id, access type and component.
 *
 * @param row row whose key is parsed
 * @return a program-stream relation if the key contains a stream, otherwise a program-dataset
 *         relation
 * @throws IllegalStateException if the key contains an unknown id or component marker
 */
private Relation toRelation(Row row) {
  // The two ids are stored by marker so they can be picked by kind regardless of their order.
  Map<Character, Id> rowInfo = new HashMap<>(4);

  MDSKey.Splitter splitter = new MDSKey(row.getRow()).split();
  char marker = (char) splitter.getInt();
  LOG.trace("Got marker {}", marker);
  Id id1 = toId(splitter, marker);
  LOG.trace("Got id1 {}", id1);
  rowInfo.put(marker, id1);

  splitter.skipLong(); // inverted time - not required for relation

  marker = (char) splitter.getInt();
  LOG.trace("Got marker {}", marker);
  Id id2 = toId(splitter, marker);
  // Log the second id (this trace used to print id1 again).
  LOG.trace("Got id2 {}", id2);
  rowInfo.put(marker, id2);

  RunId runId = RunIds.fromString(splitter.getString());
  LOG.trace("Got runId {}", runId);

  AccessType accessType = AccessType.fromType((char) splitter.getInt());
  LOG.trace("Got access type {}", accessType);

  Id.DatasetInstance datasetInstance = (Id.DatasetInstance) rowInfo.get(DATASET_MARKER);
  LOG.trace("Got datasetInstance {}", datasetInstance);

  Id.Stream stream = (Id.Stream) rowInfo.get(STREAM_MARKER);
  LOG.trace("Got stream {}", stream);

  Id.Program program = (Id.Program) rowInfo.get(PROGRAM_MARKER);
  LOG.trace("Got program {}", program);

  Id.NamespacedId component = toComponent(splitter, program);
  LOG.trace("Got component {}", component);

  // A missing component becomes an empty set rather than a set holding null.
  ImmutableSet<Id.NamespacedId> components =
    component == null ? ImmutableSet.<Id.NamespacedId>of() : ImmutableSet.of(component);

  if (stream == null) {
    return new Relation(datasetInstance, program, accessType, runId, components);
  }
  return new Relation(stream, program, accessType, runId, components);
}
/**
 * Immutable holder for the parts of a row key that identify an access: the program, the data
 * entity it accessed and the run id.
 */
private static final class RowKey {
  private final Id.Program program;
  private final Id.NamespacedId data;
  private final RunId runId;

  public RowKey(Id.Program program, Id.NamespacedId data, RunId runId) {
    this.program = program;
    this.data = data;
    this.runId = runId;
  }

  /** @return the program of the row */
  public Id.Program getProgram() {
    return program;
  }

  /** @return the data entity of the row */
  public Id.NamespacedId getData() {
    return data;
  }

  /** @return the run id of the row */
  public RunId getRunId() {
    return runId;
  }

  @Override
  public boolean equals(Object o) {
    if (o == this) {
      return true;
    }
    if (o instanceof RowKey) {
      RowKey that = (RowKey) o;
      return Objects.equals(program, that.program)
        && Objects.equals(data, that.data)
        && Objects.equals(runId, that.runId);
    }
    return false;
  }

  @Override
  public int hashCode() {
    return Objects.hash(program, data, runId);
  }

  @Override
  public String toString() {
    return new StringBuilder("RowKey{")
      .append("program=").append(program)
      .append(", data=").append(data)
      .append(", runId=").append(runId)
      .append('}')
      .toString();
  }
}
}