package com.liveramp.hank.hadoop; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.util.Map; import java.util.Properties; import java.util.UUID; import org.apache.commons.codec.binary.Base64; import org.apache.hadoop.mapred.JobConf; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import cascading.flow.FlowProcess; import com.liveramp.hank.config.CoordinatorConfigurator; import com.liveramp.hank.coordinator.Coordinator; import com.liveramp.hank.coordinator.Domain; import com.liveramp.hank.coordinator.DomainVersion; import com.liveramp.hank.coordinator.DomainVersionProperties; import com.liveramp.hank.coordinator.RunWithCoordinator; import com.liveramp.hank.coordinator.RunnableWithCoordinator; import com.liveramp.hank.storage.FileOpsUtil; public class DomainBuilderProperties { private static final String TMP_OUTPUT_PATH = "_temporary/"; public static final String REMOTE_DOMAIN_ROOT_STORAGE_ENGINE_OPTION = "remote_domain_root"; private static final Class<? extends DomainBuilderAbstractOutputFormat> DEFAULT_OUTPUT_FORMAT_CLASS = DomainBuilderDefaultOutputFormat.class; private final String domainName; private final CoordinatorConfigurator configurator; private Class<? extends DomainBuilderAbstractOutputFormat> outputFormatClass = DEFAULT_OUTPUT_FORMAT_CLASS; private String outputPath = null; private String randomTmpOutputPathId; private boolean shouldPartitionAndSortInput = true; private static final Logger LOG = LoggerFactory.getLogger(DomainBuilderProperties.class); // With a default output format // Get output path from the Coordinator public DomainBuilderProperties(String domainName, CoordinatorConfigurator configurator) { this.domainName = domainName; this.configurator = configurator; } // With a specific output format // Get output path from the Coordinator public DomainBuilderProperties(String domainName, CoordinatorConfigurator configurator, Class<? extends DomainBuilderAbstractOutputFormat> outputFormatClass) { this.domainName = domainName; this.configurator = configurator; this.outputFormatClass = outputFormatClass; } public String getDomainName() { return domainName; } public CoordinatorConfigurator getConfigurator() { return configurator; } public String getOutputPath() throws IOException { if (outputPath == null) { this.outputPath = getRemoteDomainRoot(); } return outputPath; } public DomainBuilderProperties setOutputPath(String outputPath) { this.outputPath = outputPath; return this; } public String getTmpOutputPath(int versionNumber) { if (randomTmpOutputPathId == null) { randomTmpOutputPathId = UUID.randomUUID().toString(); } return outputPath + "/" + TMP_OUTPUT_PATH + "version_" + versionNumber + "_" + randomTmpOutputPathId + "/"; } public Class<? extends DomainBuilderAbstractOutputFormat> getOutputFormatClass() { return outputFormatClass; } public boolean shouldPartitionAndSortInput() { return shouldPartitionAndSortInput; } public DomainBuilderProperties setShouldPartitionAndSortInput(boolean shouldPartitionAndSortInput) { this.shouldPartitionAndSortInput = shouldPartitionAndSortInput; return this; } // To configure cascading jobs public Properties setCascadingProperties(Properties properties, int versionNumber, int numPartitions) { // Note: Domain name is set locally in DomainBuilderTap to deal with Cascading // jobs building multiple domains. // Configuration properties.setProperty(DomainBuilderAbstractOutputFormat.createConfParamName(getDomainName(), DomainBuilderAbstractOutputFormat.CONF_PARAM_HANK_CONFIGURATOR), buildConfigurationString(configurator)); // Output Path properties.setProperty(DomainBuilderAbstractOutputFormat.createConfParamName(getDomainName(), DomainBuilderAbstractOutputFormat.CONF_PARAM_HANK_OUTPUT_PATH), outputPath); // Tmp output path properties.setProperty(DomainBuilderAbstractOutputFormat.createConfParamName(getDomainName(), DomainBuilderAbstractOutputFormat.CONF_PARAM_HANK_TMP_OUTPUT_PATH), getTmpOutputPath(versionNumber)); // Version Number properties.setProperty(DomainBuilderAbstractOutputFormat.createConfParamName(getDomainName(), DomainBuilderAbstractOutputFormat.CONF_PARAM_HANK_VERSION_NUMBER), Integer.toString(versionNumber)); // Number of partitions properties.setProperty(DomainBuilderAbstractOutputFormat.createConfParamName(getDomainName(), DomainBuilderAbstractOutputFormat.CONF_PARAM_HANK_NUM_PARTITIONS), Integer.toString(numPartitions)); // Number of reduce tasks is set to the maximum number of partitions to build for a single domain // When moving to Cascading 2.0 we will be able to set the number of reduce tasks per step (for each domain) Integer numPartitionsPrevious = 0; String numPartitionsStr = properties.getProperty("mapred.reduce.tasks"); if (numPartitionsStr != null) { numPartitionsPrevious = Integer.valueOf(numPartitionsStr); } if (numPartitions > numPartitionsPrevious) { properties.setProperty("mapred.reduce.tasks", String.valueOf(numPartitions)); } // this fixes hank domain building on YARN properties.setProperty("mapreduce.fileoutputcommitter.algorithm.version", "2"); return properties; } // To configure Hadoop MapReduce jobs public JobConf setJobConfProperties(JobConf conf, int versionNumber) throws IOException { // Domain name conf.set(DomainBuilderAbstractOutputFormat.CONF_PARAM_HANK_DOMAIN_NAME, getDomainName()); // Configuration conf.set(DomainBuilderAbstractOutputFormat.createConfParamName(getDomainName(), DomainBuilderAbstractOutputFormat.CONF_PARAM_HANK_CONFIGURATOR), buildConfigurationString(configurator)); // Output path conf.set(DomainBuilderAbstractOutputFormat.createConfParamName(getDomainName(), DomainBuilderAbstractOutputFormat.CONF_PARAM_HANK_OUTPUT_PATH), getOutputPath()); // Tmp output path conf.set(DomainBuilderAbstractOutputFormat.createConfParamName(getDomainName(), DomainBuilderAbstractOutputFormat.CONF_PARAM_HANK_TMP_OUTPUT_PATH), getTmpOutputPath(versionNumber)); // Version Number conf.set(DomainBuilderAbstractOutputFormat.createConfParamName(getDomainName(), DomainBuilderAbstractOutputFormat.CONF_PARAM_HANK_VERSION_NUMBER), Integer.toString(versionNumber)); // this fixes hank domain building on YARN conf.set("mapreduce.fileoutputcommitter.algorithm.version", "2"); return conf; } // TODO: maybe refactor and move the flow process stuff to the cascading package // FlowProcess private static String getRequiredConfigurationItem(String key, String prettyName, FlowProcess flowProcess) { String result = (String)flowProcess.getProperty(key); if (result == null) { throw new RuntimeException(prettyName + " must be set with configuration item: " + key); } return result; } // JobConf public static String getDomainName(JobConf conf) { return getRequiredConfigurationItem(DomainBuilderAbstractOutputFormat.CONF_PARAM_HANK_DOMAIN_NAME, "Hank domain name", conf); } public static CoordinatorConfigurator getConfigurator(JobConf conf) { String domainName = getDomainName(conf); String configurationItem = DomainBuilderAbstractOutputFormat.createConfParamName(domainName, DomainBuilderAbstractOutputFormat.CONF_PARAM_HANK_CONFIGURATOR); String configuratorString = getRequiredConfigurationItem(configurationItem, "Hank coordinator configuration", conf); CoordinatorConfigurator configurator; try { configurator = (CoordinatorConfigurator)new ObjectInputStream(new ByteArrayInputStream(Base64.decodeBase64(configuratorString.getBytes()))).readObject(); } catch (Exception e) { throw new RuntimeException("Hank Configurator is incorrectly serialized in configuration item: " + configurationItem, e); } return configurator; } public static CoordinatorConfigurator getConfigurator(String domainName, FlowProcess flowProcess) { String configurationItem = DomainBuilderAbstractOutputFormat.createConfParamName(domainName, DomainBuilderAbstractOutputFormat.CONF_PARAM_HANK_CONFIGURATOR); String configuratorString = getRequiredConfigurationItem(configurationItem, "Hank coordinator configuration", flowProcess); CoordinatorConfigurator configurator; try { configurator = (CoordinatorConfigurator)new ObjectInputStream(new ByteArrayInputStream(Base64.decodeBase64(configuratorString.getBytes()))).readObject(); } catch (Exception e) { throw new RuntimeException("Hank Configurator is incorrectly serialized in configuration item: " + configurationItem, e); } return configurator; } public static String getOutputPath(String domainName, JobConf conf) { return getRequiredConfigurationItem(DomainBuilderAbstractOutputFormat.createConfParamName(domainName, DomainBuilderAbstractOutputFormat.CONF_PARAM_HANK_OUTPUT_PATH), "Hank output path", conf); } public static String getTmpOutputPath(String domainName, JobConf conf) { return getRequiredConfigurationItem(DomainBuilderAbstractOutputFormat.createConfParamName(domainName, DomainBuilderAbstractOutputFormat.CONF_PARAM_HANK_TMP_OUTPUT_PATH), "Hank temporary output path", conf); } public static Integer getVersionNumber(String domainName, JobConf conf) { return Integer.valueOf(getRequiredConfigurationItem(DomainBuilderAbstractOutputFormat.createConfParamName(domainName, DomainBuilderAbstractOutputFormat.CONF_PARAM_HANK_VERSION_NUMBER), "Hank version number", conf)); } public static Integer getNumPartitions(String domainName, JobConf conf) { return Integer.valueOf(getRequiredConfigurationItem(DomainBuilderAbstractOutputFormat.createConfParamName(domainName, DomainBuilderAbstractOutputFormat.CONF_PARAM_HANK_NUM_PARTITIONS), "Hank number of partitions", conf)); } public static String getRequiredConfigurationItem(String key, String prettyName, JobConf conf) { String result = conf.get(key); if (result == null) { throw new RuntimeException(prettyName + " must be set with configuration item: " + key); } return result; } private static class DomainBuilderRemoteDomainRootGetter implements RunnableWithCoordinator { private final String domainName; private String result; public DomainBuilderRemoteDomainRootGetter(String domainName) { this.domainName = domainName; } @Override public void run(Coordinator coordinator) throws IOException { Domain domain = coordinator.getDomain(domainName); Map<String, Object> options = getOptions(domainName, domain); this.result = FileOpsUtil.getDomainBuilderRoot(options); } } private static class PartitionServerRemoteDomainRootGetter implements RunnableWithCoordinator { private final String domainName; private String result; public PartitionServerRemoteDomainRootGetter(String domainName) { this.domainName = domainName; } @Override public void run(Coordinator coordinator) throws IOException { Domain domain = coordinator.getDomain(domainName); Map<String, Object> options = getOptions(domainName, domain); this.result = FileOpsUtil.getPartitionServerRoot(options); } } private static Map<String, Object> getOptions(String domainName, Domain domain) { if (domain == null) { throw new RuntimeException("Could not get domain: " + domainName + " from coordinator."); } Map<String, Object> options = domain.getStorageEngineOptions(); if (options == null) { throw new RuntimeException("Empty options for domain: " + domainName); } return options; } public String getRemoteDomainRoot() throws IOException { DomainBuilderRemoteDomainRootGetter remoteDomainRootGetter = new DomainBuilderRemoteDomainRootGetter(domainName); RunWithCoordinator.run(configurator, remoteDomainRootGetter); return remoteDomainRootGetter.result; } @Deprecated public static String getRemoteDomainRoot(Coordinator coordinator, String domainName) throws IOException { return getDomainBuilderDomainRoot(coordinator, domainName); } public static String getDomainBuilderDomainRoot(Coordinator coordinator, String domainName) throws IOException { if (coordinator == null) { throw new RuntimeException("A null Coordinator was provided."); } DomainBuilderRemoteDomainRootGetter remoteDomainRootGetter = new DomainBuilderRemoteDomainRootGetter(domainName); remoteDomainRootGetter.run(coordinator); return remoteDomainRootGetter.result; } public static String getPartitionServerDomainRoot(Coordinator coordinator, String domainName) throws IOException { if(coordinator == null){ throw new RuntimeException("A null Coordinator was provided."); } PartitionServerRemoteDomainRootGetter remoteDomainRootGetter = new PartitionServerRemoteDomainRootGetter(domainName); remoteDomainRootGetter.run(coordinator); return remoteDomainRootGetter.result; } public DomainVersionNumberAndNumPartitions openVersion(DomainVersionProperties domainVersionProperties) throws IOException { return openVersion(getConfigurator(), getDomainName(), domainVersionProperties); } public void cancelVersion(Integer domainVersionNumber) throws IOException { cancelVersion(getConfigurator(), getDomainName(), domainVersionNumber); } public void closeVersion(Integer domainVersionNumber) throws IOException { closeVersion(getConfigurator(), getDomainName(), domainVersionNumber); } public static DomainVersionNumberAndNumPartitions openVersion(CoordinatorConfigurator configurator, String domainName, DomainVersionProperties domainVersionProperties) throws IOException { DomainVersionOpener domainVersionOpener = new DomainVersionOpener(domainName, domainVersionProperties); RunWithCoordinator.run(configurator, domainVersionOpener); return domainVersionOpener.result; } public static void cancelVersion(CoordinatorConfigurator configurator, String domainName, Integer domainVersionNumber) throws IOException { RunWithCoordinator.run(configurator, new DomainVersionCanceller(domainName, domainVersionNumber)); } public static void closeVersion(CoordinatorConfigurator configurator, String domainName, Integer domainVersionNumber) throws IOException { RunWithCoordinator.run(configurator, new DomainVersionCloser(domainName, domainVersionNumber)); } public static Domain getDomain(Coordinator coordinator, String domainName) throws IOException { Domain domain = coordinator.getDomainShallow(domainName); // Fail if unable to load domain if (domain == null) { throw new IOException("Could not load Domain: " + domainName + " with coordinator: " + coordinator); } return domain; } public static DomainVersion getDomainVersion(Coordinator coordinator, String domainName, Integer domainVersionNumber) throws IOException { if (domainVersionNumber == null) { return null; } Domain domain = getDomain(coordinator, domainName); DomainVersion domainVersion = domain.getVersionShallow(domainVersionNumber); if (domainVersion == null) { throw new IOException("Could not get version " + domainVersionNumber + " of domain " + domainName + " with coordinator: " + coordinator); } else { return domainVersion; } } // Builds a base64 encoded string of the serialized configurator private static String buildConfigurationString(CoordinatorConfigurator configurator) { ByteArrayOutputStream baos = new ByteArrayOutputStream(); try { new ObjectOutputStream(baos).writeObject(configurator); } catch (IOException e) { throw new RuntimeException(e); } return new String(Base64.encodeBase64(baos.toByteArray())); } private static class DomainVersionOpener implements RunnableWithCoordinator { private final String domainName; private final DomainVersionProperties domainVersionProperties; private DomainVersionNumberAndNumPartitions result; public DomainVersionOpener(String domainName, DomainVersionProperties domainVersionProperties) { this.domainName = domainName; this.domainVersionProperties = domainVersionProperties; } @Override public void run(Coordinator coordinator) throws IOException { Domain domain = getDomain(coordinator, domainName); DomainVersion domainVersion = domain.openNewVersion(domainVersionProperties); if (domainVersion == null) { throw new IOException("Could not open a new version of domain " + domainName); } else { LOG.info("Opened new version #" + domainVersion.getVersionNumber() + " of domain: " + domainName); result = new DomainVersionNumberAndNumPartitions(domainVersion.getVersionNumber(), domain.getNumParts()); } } } private static class DomainVersionCanceller implements RunnableWithCoordinator { private final String domainName; private final Integer domainVersionNumber; public DomainVersionCanceller(String domainName, Integer domainVersionNumber) { this.domainName = domainName; this.domainVersionNumber = domainVersionNumber; } @Override public void run(Coordinator coordinator) throws IOException { DomainVersion domainVersion = getDomainVersion(coordinator, domainName, domainVersionNumber); LOG.info("Cancelling new version #" + domainVersion.getVersionNumber() + " of domain: " + domainName); domainVersion.cancel(); } } private static class DomainVersionCloser implements RunnableWithCoordinator { private final String domainName; private final Integer domainVersionNumber; public DomainVersionCloser(String domainName, Integer domainVersionNumber) { this.domainName = domainName; this.domainVersionNumber = domainVersionNumber; } @Override public void run(Coordinator coordinator) throws IOException { DomainVersion domainVersion = getDomainVersion(coordinator, domainName, domainVersionNumber); LOG.info("Closing new version #" + domainVersion.getVersionNumber() + " of domain: " + domainName); domainVersion.close(); } } public String toString() { return "<DomainBuilderProperties: domain name: " + domainName + ", configurator: " + configurator + ", output path: " + outputPath + ", output format class: " + outputFormatClass + ">"; } }