/**
 * Copyright 2011 LiveRamp
 * <p/>
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * <p/>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p/>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.liveramp.hank.cascading;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;

import org.apache.hadoop.mapred.JobConf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import cascading.cascade.Cascades;
import cascading.flow.Flow;
import cascading.flow.FlowStepListener;
import cascading.pipe.Pipe;
import cascading.tap.Tap;

import com.liveramp.hank.coordinator.DomainVersionProperties;
import com.liveramp.hank.hadoop.DomainBuilderOutputCommitter;
import com.liveramp.hank.hadoop.DomainBuilderProperties;
import com.liveramp.hank.hadoop.DomainVersionNumberAndNumPartitions;

public class CascadingDomainBuilder {

  private final static Logger LOG = LoggerFactory.getLogger(CascadingDomainBuilder.class);

  private Tap outputTap = null;
  private final DomainBuilderProperties properties;
  private final DomainVersionProperties domainVersionProperties;
  private Pipe pipe;
  private final String keyFieldName;
  private final String valueFieldName;
  private Integer partitionToBuild = null;

  private Integer domainVersionNumber = null;
  private Integer numPartitions = null;

  public CascadingDomainBuilder(DomainBuilderProperties properties,
                                DomainVersionProperties domainVersionProperties,
                                Pipe pipe,
                                String keyFieldName,
                                String valueFieldName) throws IOException {
    this.properties = properties;
    this.domainVersionProperties = domainVersionProperties;
    this.pipe = pipe;
    this.keyFieldName = keyFieldName;
    this.valueFieldName = valueFieldName;
  }

  public void openNewVersion() throws IOException {
    DomainVersionNumberAndNumPartitions domainVersionNumberAndNumPartitions =
        properties.openVersion(domainVersionProperties);
    domainVersionNumber = domainVersionNumberAndNumPartitions.getDomainVersionNumber();
    numPartitions = domainVersionNumberAndNumPartitions.getNumPartitions();
    // Create Tap
    outputTap = new DomainBuilderTap(keyFieldName, valueFieldName, domainVersionNumber, properties);
  }

  public void cancelNewVersion() throws IOException {
    properties.cancelVersion(domainVersionNumber);
  }

  public void closeNewVersion() throws IOException {
    properties.closeVersion(domainVersionNumber);
  }

  public void setPartitionToBuild(int partitionToBuild) {
    this.partitionToBuild = partitionToBuild;
  }

  // Build a single domain using one source
  public Flow build(FlowStepListener listener,
                    Properties cascadingProperties,
                    String sourcePipeName,
                    Tap source) throws IOException {
    return build(listener, cascadingProperties, Cascades.tapsMap(sourcePipeName, source));
  }

  public Flow build(FlowStepListener listener,
                    Properties cascadingProperties,
                    Map<String, Tap> sources) throws IOException {
    return build(listener, new HadoopFlowConnectorFactory(cascadingProperties), sources);
  }

  // Build a single domain
  public Flow build(FlowStepListener listener,
                    FlowConnectorFactory flowConnectorFactory,
                    Map<String, Tap> sources) throws IOException {
    pipe = new DomainBuilderAssembly(properties.getDomainName(), pipe, keyFieldName,
        valueFieldName, properties.shouldPartitionAndSortInput(), partitionToBuild);

    // Open new version and check for success
    openNewVersion();

    Flow<JobConf> flow = null;
    try {
      // Build flow
      flow = getFlow(flowConnectorFactory, sources);
      // Set up job
      DomainBuilderOutputCommitter.setupJob(properties.getDomainName(), flow.getConfig());
      // Attach listener callback to get updates about job progress
      flow.addStepListener(listener);
      // Complete flow
      flow.complete();
      // Commit job
      DomainBuilderOutputCommitter.commitJob(properties.getDomainName(), flow.getConfig());
    } catch (Exception e) {
      String exceptionMessage = "Failed at building version " + domainVersionNumber
          + " of domain " + properties.getDomainName() + ". Cancelling version.";
      // In case of failure, cancel this new version
      cancelNewVersion();
      // Clean up job
      if (flow != null) {
        DomainBuilderOutputCommitter.cleanupJob(properties.getDomainName(), flow.getConfig());
      }
      e.printStackTrace();
      throw new IOException(exceptionMessage, e);
    }
    // Close the new version
    closeNewVersion();
    // Clean up job
    DomainBuilderOutputCommitter.cleanupJob(properties.getDomainName(), flow.getConfig());
    return flow;
  }

  public static Flow buildDomains(Properties cascadingProperties,
                                  Map<String, Tap> sources,
                                  Map<String, Tap> otherSinks,
                                  Pipe[] otherTails,
                                  CascadingDomainBuilder... domainBuilders) throws IOException {
    return buildDomains(new HadoopFlowConnectorFactory(cascadingProperties), cascadingProperties,
        sources, otherSinks, otherTails, domainBuilders);
  }

  // Build multiple domains
  public static Flow buildDomains(FlowConnectorFactory flowConnectorFactory,
                                  Properties cascadingProperties,
                                  Map<String, Tap> sources,
                                  Map<String, Tap> otherSinks,
                                  Pipe[] otherTails,
                                  CascadingDomainBuilder... domainBuilders) throws IOException {
    // Info output
    for (CascadingDomainBuilder domainBuilder : domainBuilders) {
      LOG.info("Building domain with " + domainBuilder.toString());
    }

    Flow<JobConf> flow = null;
    try {
      // Open new versions
      for (CascadingDomainBuilder domainBuilder : domainBuilders) {
        domainBuilder.openNewVersion();
      }

      // Create tails
      for (CascadingDomainBuilder domainBuilder : domainBuilders) {
        domainBuilder.pipe = new DomainBuilderAssembly(
            domainBuilder.properties.getDomainName(),
            domainBuilder.pipe,
            domainBuilder.keyFieldName,
            domainBuilder.valueFieldName,
            domainBuilder.properties.shouldPartitionAndSortInput(),
            domainBuilder.partitionToBuild);
      }

      // Update properties
      for (CascadingDomainBuilder domainBuilder : domainBuilders) {
        domainBuilder.properties.setCascadingProperties(cascadingProperties,
            domainBuilder.domainVersionNumber, domainBuilder.numPartitions);
      }

      // Add partition marker sources
      Map<String, Tap> actualSources = new HashMap<String, Tap>(sources);
      for (CascadingDomainBuilder domainBuilder : domainBuilders) {
        actualSources.put(
            DomainBuilderAssembly.getPartitionMarkersPipeName(domainBuilder.properties.getDomainName()),
            new PartitionMarkerTap(domainBuilder.properties.getDomainName(),
                domainBuilder.keyFieldName, domainBuilder.valueFieldName));
      }

      // Construct tails array
      Pipe[] tails = new Pipe[domainBuilders.length + otherTails.length];
      // Copy tails from domain builders
      for (int i = 0; i < domainBuilders.length; ++i) {
        tails[i] = domainBuilders[i].pipe;
      }
      // Copy extra tails
      for (int i = 0; i < otherTails.length; ++i) {
        tails[i + domainBuilders.length] = otherTails[i];
      }

      // Construct sinks map
      Map<String, Tap> sinks = new HashMap<String, Tap>();
      // Add domain builder sinks
      for (CascadingDomainBuilder domainBuilder : domainBuilders) {
        sinks.put(DomainBuilderAssembly.getSinkName(domainBuilder.properties.getDomainName()),
            domainBuilder.outputTap);
      }
      // Add extra sinks
      sinks.putAll(otherSinks);

      // Create job name
      StringBuilder jobName = new StringBuilder("HankCascadingDomainBuilder ");
      for (int i = 0; i < domainBuilders.length; ++i) {
        if (i != 0) {
          jobName.append(", ");
        }
        CascadingDomainBuilder domainBuilder = domainBuilders[i];
        jobName.append(domainBuilder.properties.getDomainName()).append(" version ")
            .append(domainBuilder.domainVersionNumber);
      }

      // Build flow
      flow = flowConnectorFactory.create(cascadingProperties)
          .connect(jobName.toString(), actualSources, sinks, tails);

      // Set up jobs
      for (CascadingDomainBuilder domainBuilder : domainBuilders) {
        DomainBuilderOutputCommitter.setupJob(domainBuilder.properties.getDomainName(), flow.getConfig());
      }

      // Complete flow
      flow.complete();

      // Commit jobs
      for (CascadingDomainBuilder domainBuilder : domainBuilders) {
        DomainBuilderOutputCommitter.commitJob(domainBuilder.properties.getDomainName(), flow.getConfig());
      }
    } catch (Exception e) {
      // In case of failure, cancel new versions
      for (CascadingDomainBuilder domainBuilder : domainBuilders) {
        domainBuilder.cancelNewVersion();
        // Clean up jobs
        if (flow != null) {
          DomainBuilderOutputCommitter.cleanupJob(domainBuilder.properties.getDomainName(), flow.getConfig());
        }
      }
      e.printStackTrace();
      throw new IOException("Failed at building domains. Cancelling open versions.", e);
    }

    // Close new versions
    for (CascadingDomainBuilder domainBuilder : domainBuilders) {
      domainBuilder.closeNewVersion();
      // Clean up jobs
      DomainBuilderOutputCommitter.cleanupJob(domainBuilder.properties.getDomainName(), flow.getConfig());
    }

    return flow;
  }

  public Integer getDomainVersionNumber() {
    return domainVersionNumber;
  }

  private DomainVersionProperties getDomainVersionProperties() {
    return domainVersionProperties;
  }

  // Build a single domain using a single source
  public Flow build(FlowStepListener listener,
                    Map<Object, Object> cascadingProperties,
                    String sourcePipeName,
                    Tap source) throws IOException {
    return build(listener, mapToProperties(cascadingProperties), sourcePipeName, source);
  }

  // Build a single domain using multiple sources
  public Flow build(FlowStepListener listener,
                    Map<Object, Object> cascadingProperties,
                    Map<String, Tap> sources) throws IOException {
    return build(listener, mapToProperties(cascadingProperties), sources);
  }

  // Build multiple domains
  public static Flow buildDomains(Properties cascadingProperties,
                                  Map<String, Tap> sources,
                                  CascadingDomainBuilder... domainBuilders) throws IOException {
    return buildDomains(cascadingProperties, sources, new HashMap<String, Tap>(), new Pipe[0], domainBuilders);
  }

  // Build multiple domains
  public static Flow buildDomains(Map<Object, Object> cascadingProperties,
                                  Map<String, Tap> sources,
                                  CascadingDomainBuilder... domainBuilders) throws IOException {
    return buildDomains(mapToProperties(cascadingProperties), sources,
        new HashMap<String, Tap>(), new Pipe[0], domainBuilders);
  }

  // Build multiple domains
  public static Flow buildDomains(Map<Object, Object> cascadingProperties,
                                  Map<String, Tap> sources,
                                  Map<String, Tap> otherSinks,
                                  Pipe[] otherTails,
                                  CascadingDomainBuilder... domainBuilders) throws IOException {
    return buildDomains(mapToProperties(cascadingProperties), sources, otherSinks, otherTails, domainBuilders);
  }

  public Properties getProperties() {
    return properties.setCascadingProperties(new Properties(), domainVersionNumber, numPartitions);
  }

  public String toString() {
    return "CascadingDomainBuilder: Domain: " + properties.getDomainName()
        + ", Output Tap: " + outputTap;
  }

  private static Properties mapToProperties(Map<Object, Object> properties) {
    Properties newProperties = new Properties();
    newProperties.putAll(properties);
    return newProperties;
  }

  private String getFlowName() {
    return "HankCascadingDomainBuilder: " + properties.getDomainName()
        + " version " + domainVersionNumber;
  }

  private Flow<JobConf> getFlow(FlowConnectorFactory flowConnectorFactory, Map<String, Tap> sources) {
    Map<String, Tap> actualSources = new HashMap<String, Tap>(sources);
    actualSources.put(
        DomainBuilderAssembly.getPartitionMarkersPipeName(properties.getDomainName()),
        new PartitionMarkerTap(properties.getDomainName(), keyFieldName, valueFieldName));
    return flowConnectorFactory.create(getProperties()).connect(getFlowName(), actualSources, outputTap, pipe);
  }
}
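
// Usage sketch (not part of the original source): a minimal, hedged example of wiring a
// single-domain build through the static buildDomains entry point above. How the
// DomainBuilderProperties, DomainVersionProperties and input Tap are constructed is an
// assumption here and depends on the surrounding application and cluster configuration.
//
//   DomainBuilderProperties properties = ...;          // assumed: configured for the target domain
//   DomainVersionProperties versionProperties = ...;   // assumed: metadata for the new version
//   Pipe pipe = new Pipe("my-input");
//   CascadingDomainBuilder builder =
//       new CascadingDomainBuilder(properties, versionProperties, pipe, "key", "value");
//   Tap source = ...;                                  // assumed: e.g. an Hfs tap over the input data
//   Flow flow = CascadingDomainBuilder.buildDomains(new Properties(),
//       Cascades.tapsMap("my-input", source), builder);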