/* * Copyright © 2015-2016 Cask Data, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package co.cask.cdap.metadata; import co.cask.cdap.app.store.Store; import co.cask.cdap.common.NotFoundException; import co.cask.cdap.common.app.RunIds; import co.cask.cdap.common.entity.EntityExistenceVerifier; import co.cask.cdap.data2.metadata.lineage.Lineage; import co.cask.cdap.data2.metadata.lineage.LineageStore; import co.cask.cdap.data2.metadata.lineage.Relation; import co.cask.cdap.data2.metadata.store.MetadataStore; import co.cask.cdap.proto.Id; import co.cask.cdap.proto.metadata.MetadataRecord; import co.cask.cdap.proto.metadata.MetadataScope; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Function; import com.google.common.base.Predicate; import com.google.common.base.Predicates; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Iterables; import com.google.inject.Inject; import org.apache.twill.api.RunId; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.HashSet; import java.util.Set; import java.util.concurrent.TimeUnit; /** * Service to compute Lineage based on Dataset accesses of a Program stored in {@link LineageStore}. */ public class LineageAdmin { private static final Logger LOG = LoggerFactory.getLogger(LineageAdmin.class); private static final Function<Relation, Id.Program> RELATION_TO_PROGRAM_FUNCTION = new Function<Relation, Id.Program>() { @Override public Id.Program apply(Relation input) { return input.getProgram(); } }; private static final Function<Relation, Id.NamespacedId> RELATION_TO_DATA_FUNCTION = new Function<Relation, Id.NamespacedId>() { @Override public Id.NamespacedId apply(Relation input) { return input.getData(); } }; private final LineageStore lineageStore; private final Store store; private final MetadataStore metadataStore; private final EntityExistenceVerifier entityExistenceVerifier; @Inject LineageAdmin(LineageStore lineageStore, Store store, MetadataStore metadataStore, EntityExistenceVerifier entityExistenceVerifier) { this.lineageStore = lineageStore; this.store = store; this.metadataStore = metadataStore; this.entityExistenceVerifier = entityExistenceVerifier; } /** * Computes lineage for a dataset between given time period. * * @param sourceDataset dataset to compute lineage for * @param startMillis start time period * @param endMillis end time period * @param levels number of levels to compute lineage for * @return lineage for sourceDataset */ public Lineage computeLineage(final Id.DatasetInstance sourceDataset, long startMillis, long endMillis, int levels) throws NotFoundException { return doComputeLineage(sourceDataset, startMillis, endMillis, levels); } /** * Computes lineage for a stream between given time period. * * @param sourceStream stream to compute lineage for * @param startMillis start time period * @param endMillis end time period * @param levels number of levels to compute lineage for * @return lineage for sourceStream */ public Lineage computeLineage(final Id.Stream sourceStream, long startMillis, long endMillis, int levels) throws NotFoundException { return doComputeLineage(sourceStream, startMillis, endMillis, levels); } /** * @return metadata associated with a run */ public Set<MetadataRecord> getMetadataForRun(Id.Run run) throws NotFoundException { entityExistenceVerifier.ensureExists(run.toEntityId()); Set<Id.NamespacedId> runEntities = new HashSet<>(lineageStore.getEntitiesForRun(run)); // No entities associated with the run, but run exists. if (runEntities.isEmpty()) { return ImmutableSet.of(); } RunId runId = RunIds.fromString(run.getId()); // The entities returned by lineageStore does not contain application Id.Application application = run.getProgram().getApplication(); runEntities.add(application); return metadataStore.getSnapshotBeforeTime(MetadataScope.USER, runEntities, RunIds.getTime(runId, TimeUnit.MILLISECONDS)); } Lineage doComputeLineage(final Id.NamespacedId sourceData, long startMillis, long endMillis, int levels) throws NotFoundException { LOG.trace("Computing lineage for data {}, startMillis {}, endMillis {}, levels {}", sourceData, startMillis, endMillis, levels); entityExistenceVerifier.ensureExists(sourceData.toEntityId()); // Convert start time and end time period into scan keys in terms of program start times. Set<RunId> runningInRange = store.getRunningInRange(TimeUnit.MILLISECONDS.toSeconds(startMillis), TimeUnit.MILLISECONDS.toSeconds(endMillis)); if (LOG.isTraceEnabled()) { LOG.trace("Got {} rundIds in time range ({}, {})", runningInRange.size(), startMillis, endMillis); } ScanRangeWithFilter scanRange = getScanRange(runningInRange); LOG.trace("Using scan start = {}, scan end = {}", scanRange.getStart(), scanRange.getEnd()); Set<Relation> relations = new HashSet<>(); Set<Id.NamespacedId> visitedDatasets = new HashSet<>(); Set<Id.NamespacedId> toVisitDatasets = new HashSet<>(); Set<Id.Program> visitedPrograms = new HashSet<>(); Set<Id.Program> toVisitPrograms = new HashSet<>(); toVisitDatasets.add(sourceData); for (int i = 0; i < levels; ++i) { LOG.trace("Level {}", i); toVisitPrograms.clear(); for (Id.NamespacedId d : toVisitDatasets) { if (!visitedDatasets.contains(d)) { LOG.trace("Visiting dataset {}", d); visitedDatasets.add(d); // Fetch related programs Iterable<Relation> programRelations = getProgramRelations(d, scanRange.getStart(), scanRange.getEnd(), scanRange.getFilter()); LOG.trace("Got program relations {}", programRelations); Iterables.addAll(relations, programRelations); Iterables.addAll(toVisitPrograms, Iterables.transform(programRelations, RELATION_TO_PROGRAM_FUNCTION)); } } toVisitDatasets.clear(); for (Id.Program p : toVisitPrograms) { if (!visitedPrograms.contains(p)) { LOG.trace("Visiting program {}", p); visitedPrograms.add(p); // Fetch related datasets Iterable<Relation> datasetRelations = lineageStore.getRelations(p, scanRange.getStart(), scanRange.getEnd(), scanRange.getFilter()); LOG.trace("Got data relations {}", datasetRelations); Iterables.addAll(relations, datasetRelations); Iterables.addAll(toVisitDatasets, Iterables.transform(datasetRelations, RELATION_TO_DATA_FUNCTION)); } } } Lineage lineage = new Lineage(relations); LOG.trace("Got lineage {}", lineage); return lineage; } private Iterable<Relation> getProgramRelations(Id.NamespacedId data, long start, long end, Predicate<Relation> filter) { if (data instanceof Id.DatasetInstance) { return lineageStore.getRelations((Id.DatasetInstance) data, start, end, filter); } if (data instanceof Id.Stream) { return lineageStore.getRelations((Id.Stream) data, start, end, filter); } throw new IllegalStateException("Unknown data type " + data); } /** * Convert a set of runIds into a scan range based on earliest runtime and latest runtime of runIds. * Also, add a scan filter to include only runIds in the given set. * @param runIds input runIds set * @return scan range */ @VisibleForTesting static ScanRangeWithFilter getScanRange(final Set<RunId> runIds) { if (runIds.isEmpty()) { return new ScanRangeWithFilter(0, 0, Predicates.<Relation>alwaysFalse()); } // Pick the earliest start time and latest start time for lineage range long earliest = Long.MAX_VALUE; long latest = 0; for (RunId runId : runIds) { long runStartTime = RunIds.getTime(runId, TimeUnit.MILLISECONDS); if (runStartTime < earliest) { earliest = runStartTime; } if (runStartTime > latest) { latest = runStartTime; } } // scan end key is exclusive, so need to add 1 to to include the last runid return new ScanRangeWithFilter(earliest, latest + 1, new Predicate<Relation>() { @Override public boolean apply(Relation input) { return runIds.contains(input.getRun()); } }); } @VisibleForTesting static class ScanRangeWithFilter { private final long start; private final long end; private final Predicate<Relation> filter; public ScanRangeWithFilter(long start, long end, Predicate<Relation> filter) { this.start = start; this.end = end; this.filter = filter; } public long getStart() { return start; } public long getEnd() { return end; } public Predicate<Relation> getFilter() { return filter; } } }