/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package io.dstream.tez; import java.net.URI; import java.nio.ByteBuffer; import java.util.Arrays; import java.util.List; import java.util.Properties; import java.util.stream.Stream; import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; import org.apache.tez.dag.api.DAG; import org.apache.tez.dag.api.DataSinkDescriptor; import org.apache.tez.dag.api.DataSourceDescriptor; import org.apache.tez.dag.api.Edge; import org.apache.tez.dag.api.ProcessorDescriptor; import org.apache.tez.dag.api.UserPayload; import org.apache.tez.dag.api.Vertex; import org.apache.tez.mapreduce.input.MRInput; import org.apache.tez.mapreduce.output.MROutput; import org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import io.dstream.SerializableStreamAssets.SerSupplier; import io.dstream.support.SourceSupplier; import io.dstream.support.UriSourceSupplier; import io.dstream.tez.io.KeyWritable; import io.dstream.tez.io.TezDelegatingPartitioner; import io.dstream.tez.io.ValueWritable; import io.dstream.tez.utils.HdfsSerializerUtils; /** * */ public class TezDAGBuilder { private final Logger logger = LoggerFactory.getLogger(TezDAGBuilder.class); private final DAG dag; private final ExecutionContextAwareTezClient tezClient; private final OrderedPartitionedKVEdgeConfig edgeConf; private final TezDagExecutor dagExecutor; private Vertex lastVertex; private int inputOrderCounter; /** * * @param executionName * @param tezClient * @param executionConfig */ public TezDAGBuilder(String executionName, ExecutionContextAwareTezClient tezClient, Properties executionConfig) { this.dag = DAG.create(executionName + "_" + System.currentTimeMillis()); this.tezClient = tezClient; // TODO need to figure out when and why would the Edge be different and // how to configure it this.edgeConf = OrderedPartitionedKVEdgeConfig.newBuilder("io.dstream.tez.io.KeyWritable", "io.dstream.tez.io.ValueWritable", TezDelegatingPartitioner.class.getName(), null).build(); this.dagExecutor = new TezDagExecutor(this.tezClient, this.dag); } /** * * @param taskDescriptor */ public void addTask(TaskDescriptor taskDescriptor) { if (taskDescriptor.getId() == 0) { this.determineInputFormatClass(taskDescriptor); } UserPayload payload = this.createPayloadFromTaskSerPath(Task.build(taskDescriptor), this.dag.getName()); ProcessorDescriptor pd = ProcessorDescriptor.create(TezTaskProcessor.class.getName()).setUserPayload(payload); SerSupplier<?> sourceSupplier = taskDescriptor.getSourceSupplier(); Vertex vertex = this.createVertex(taskDescriptor, pd); this.dag.addVertex(vertex); if (taskDescriptor.getId() == 0) { if (sourceSupplier instanceof UriSourceSupplier) { UriSourceSupplier uriSourceSupplier = (UriSourceSupplier) sourceSupplier; Stream<URI> uris = uriSourceSupplier.get(); DataSourceDescriptor dataSource = this.buildDataSourceDescriptorFromUris(taskDescriptor.getInputFormatClass(), uris); vertex.addDataSource(this.inputOrderCounter++ + ":" + vertex.getName() + "_INPUT_" + Arrays.asList(uris), dataSource); } } else { this.addEdge(vertex); } if (taskDescriptor.getDependentTasksChains() != null) { List<List<TaskDescriptor>> dependentTasksChains = taskDescriptor.getDependentTasksChains(); dependentTasksChains.forEach(dependentTasks -> { dependentTasks.forEach(this::addTask); this.addEdge(vertex); }); } if (logger.isDebugEnabled()) { logger.debug("Created Vertex: " + vertex); } this.lastVertex = vertex; } /** * */ private Vertex createVertex(TaskDescriptor taskDescriptor, ProcessorDescriptor pd) { String vertexName = taskDescriptor.getName() + "_" + taskDescriptor.getOperationName(); Vertex vertex = (taskDescriptor.getId() == 0 && taskDescriptor.getSourceSupplier() instanceof UriSourceSupplier) ? Vertex.create(this.inputOrderCounter++ + ":" + vertexName, pd) : Vertex.create(this.inputOrderCounter++ + ":" + vertexName, pd, taskDescriptor.getParallelism()); vertex.addTaskLocalFiles(this.tezClient.getLocalResources()); return vertex; } /** * * @param vertex */ private void addEdge(Vertex vertex) { Edge edge = Edge.create(this.lastVertex, vertex, this.edgeConf.createDefaultEdgeProperty()); this.dag.addEdge(edge); } /** * */ public void addDataSink(String outputPath) { this.createDataSink(this.lastVertex, this.tezClient.getClientName() + "_OUTPUT", KeyWritable.class, ValueWritable.class, SequenceFileOutputFormat.class, outputPath); this.lastVertex = null; } /** * * @return */ public Runnable build() { return this.dagExecutor; } /** * */ private DataSourceDescriptor buildDataSourceDescriptorFromUris(Class<?> inputFormatClass, Stream<URI> sources) { String inputPath = sources.map(uri -> uri.getPath()).reduce((a,b) -> a + "," + b).get(); return MRInput.createConfigBuilder(this.tezClient.getTezConfiguration(), inputFormatClass, inputPath) .groupSplits(false).build(); } /** * */ private UserPayload createPayloadFromTaskSerPath(Task task, String dagName) { org.apache.hadoop.fs.Path mapTaskPath = HdfsSerializerUtils.serialize(task, this.tezClient.getFileSystem(), new org.apache.hadoop.fs.Path(dagName + "/tasks/" + task.getId() + "_" + task.getName() + ".ser")); return UserPayload.create(ByteBuffer.wrap(mapTaskPath.toString().getBytes())); } /** * */ private void createDataSink(Vertex vertex, String name, Class<? extends Writable> keyClass, Class<? extends Writable> valueClass, Class<?> outputFormatClass, String outputPath) { JobConf dsConfig = this.buildJobConf(keyClass, valueClass); DataSinkDescriptor dataSink = MROutput.createConfigBuilder(dsConfig, outputFormatClass, outputPath).build(); vertex.addDataSink(name, dataSink); } /** * */ private JobConf buildJobConf(Class<? extends Writable> keyClass, Class<? extends Writable> valueClass) { JobConf jobConf = new JobConf(this.tezClient.getTezConfiguration()); jobConf.setOutputKeyClass(keyClass); jobConf.setOutputValueClass(valueClass); return jobConf; } /** * */ private void determineInputFormatClass(TaskDescriptor firstTask) { SourceSupplier<?> sourceSupplier = (SourceSupplier<?>) firstTask.getSourceSupplier(); Class<?> sourceElementType = firstTask.getSourceElementType(); if (sourceSupplier instanceof UriSourceSupplier) { if (sourceElementType.isAssignableFrom(String.class)) { firstTask.setInputFormatClass(TextInputFormat.class); } else { // TODO design a configurable component to handle other standard // and custom input types throw new IllegalArgumentException("Failed to determine Input Format class for source item type " + sourceElementType); } } // else { // throw new IllegalArgumentException("Non URI sources are not supported yet"); // } } }