/**
* Copyright (C) 2014-2015 LinkedIn Corp. (pinot-core@linkedin.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.linkedin.thirdeye.hadoop;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.lang.reflect.Method;
import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobStatus;
import org.joda.time.DateTime;
import org.joda.time.format.ISODateTimeFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.linkedin.thirdeye.hadoop.aggregation.AggregationPhaseConstants;
import com.linkedin.thirdeye.hadoop.aggregation.AggregationPhaseJob;
import com.linkedin.thirdeye.hadoop.backfill.BackfillPhaseConstants;
import com.linkedin.thirdeye.hadoop.backfill.BackfillPhaseJob;
import com.linkedin.thirdeye.hadoop.config.ThirdEyeConstants;
import com.linkedin.thirdeye.hadoop.derivedcolumn.transformation.DerivedColumnTransformationPhaseConstants;
import com.linkedin.thirdeye.hadoop.derivedcolumn.transformation.DerivedColumnTransformationPhaseJob;
import com.linkedin.thirdeye.hadoop.join.JoinPhaseJob;
import com.linkedin.thirdeye.hadoop.push.SegmentPushPhase;
import com.linkedin.thirdeye.hadoop.push.SegmentPushPhaseConstants;
import com.linkedin.thirdeye.hadoop.segment.creation.SegmentCreationPhaseConstants;
import com.linkedin.thirdeye.hadoop.segment.creation.SegmentCreationPhaseJob;
import com.linkedin.thirdeye.hadoop.topk.TopKPhaseConstants;
import com.linkedin.thirdeye.hadoop.topk.TopKPhaseJob;
import com.linkedin.thirdeye.hadoop.transform.TransformPhaseJob;
import com.linkedin.thirdeye.hadoop.wait.WaitPhaseJob;
/**
* Wrapper to manage segment create and segment push jobs for thirdeye
*/
/**
 * Driver that wraps the individual ThirdEye Hadoop phases (aggregation, topk,
 * derived-column transformation, segment creation/push, etc.). It resolves the
 * phase to run from the job name or config, builds the phase-specific
 * {@link Properties}, instantiates the phase job reflectively and runs it.
 */
public class ThirdEyeJob {
  private static final Logger LOGGER = LoggerFactory.getLogger(ThirdEyeJob.class);
  private static final String USAGE = "usage: phase_name job.properties";

  /** Name of the phase to execute; resolved in the constructor. */
  private final String phaseName;
  /** Raw job configuration supplied by the caller. */
  private final Properties inputConfig;

  /**
   * @param jobName fallback phase name, used when the config does not carry an
   *        explicit {@code THIRDEYE_PHASE} property
   * @param config job configuration (also consulted for the phase name)
   */
  public ThirdEyeJob(String jobName, Properties config) {
    // An explicit phase in the config wins over the job name.
    String phaseFromConfig = config.getProperty(ThirdEyeJobProperties.THIRDEYE_PHASE.getName());
    if (phaseFromConfig != null) {
      this.phaseName = phaseFromConfig;
    } else {
      this.phaseName = jobName;
    }
    this.inputConfig = config;
  }

  /**
   * One constant per pipeline phase. Each constant knows the job class to run
   * (null for phases handled inline in {@link #run()}) and how to derive its
   * phase-specific properties from the shared input config.
   */
  private enum PhaseSpec {
    BACKFILL {
      @Override
      Class<?> getKlazz() {
        return BackfillPhaseJob.class;
      }

      @Override
      String getDescription() {
        return "Backfills older pinot segments with star tree index and topk information";
      }

      @Override
      Properties getJobProperties(Properties inputConfig, String root, String collection,
          DateTime minTime, DateTime maxTime, String inputPaths)
              throws Exception {
        Properties config = new Properties();
        config.setProperty(BackfillPhaseConstants.BACKFILL_PHASE_CONTROLLER_HOST.toString(),
            inputConfig.getProperty(ThirdEyeJobProperties.THIRDEYE_PINOT_CONTROLLER_HOSTS.getName()));
        config.setProperty(BackfillPhaseConstants.BACKFILL_PHASE_CONTROLLER_PORT.toString(),
            inputConfig.getProperty(ThirdEyeJobProperties.THIRDEYE_PINOT_CONTROLLER_PORT.getName()));
        config.setProperty(BackfillPhaseConstants.BACKFILL_PHASE_START_TIME.toString(),
            inputConfig.getProperty(ThirdEyeJobProperties.THIRDEYE_BACKFILL_START_TIME.getName()));
        config.setProperty(BackfillPhaseConstants.BACKFILL_PHASE_END_TIME.toString(),
            inputConfig.getProperty(ThirdEyeJobProperties.THIRDEYE_BACKFILL_END_TIME.getName()));
        config.setProperty(BackfillPhaseConstants.BACKFILL_PHASE_OUTPUT_PATH.toString(),
            getIndexDir(root, collection, minTime, maxTime) + File.separator + BACKFILL.getName());
        config.setProperty(BackfillPhaseConstants.BACKFILL_PHASE_TABLE_NAME.toString(), collection);
        return config;
      }
    },
    WAIT {
      @Override
      Class<?> getKlazz() {
        // Handled inline in run(); no reflective job class.
        return null;
      }

      @Override
      String getDescription() {
        return "Polls a pre-determined amount of time for the existence of input paths";
      }

      @Override
      Properties getJobProperties(Properties inputConfig, String root, String collection,
          DateTime minTime, DateTime maxTime, String inputPaths)
              throws Exception {
        return null;
      }
    },
    JOIN {
      @Override
      Class<?> getKlazz() {
        return JoinPhaseJob.class;
      }

      @Override
      String getDescription() {
        return "Joins multiple data sets based on join key";
      }

      @Override
      Properties getJobProperties(Properties inputConfig, String root, String collection,
          DateTime minTime, DateTime maxTime, String inputPaths) {
        // Join consumes the raw input config unchanged.
        return inputConfig;
      }
    },
    TRANSFORM {
      @Override
      Class<?> getKlazz() {
        return TransformPhaseJob.class;
      }

      @Override
      String getDescription() {
        return "Transforms avro record";
      }

      @Override
      Properties getJobProperties(Properties inputConfig, String root, String collection,
          DateTime minTime, DateTime maxTime, String inputPaths) {
        // Transform consumes the raw input config unchanged.
        return inputConfig;
      }
    },
    AGGREGATION {
      @Override
      Class<?> getKlazz() {
        return AggregationPhaseJob.class;
      }

      @Override
      String getDescription() {
        return "Aggregates input avro data to another time granularity";
      }

      @Override
      Properties getJobProperties(Properties inputConfig, String root, String collection,
          DateTime minTime, DateTime maxTime, String inputPaths)
              throws Exception {
        Properties config = new Properties();
        config.setProperty(AggregationPhaseConstants.AGG_PHASE_INPUT_PATH.toString(),
            inputPaths);
        config.setProperty(AggregationPhaseConstants.AGG_PHASE_OUTPUT_PATH.toString(),
            getIndexDir(root, collection, minTime, maxTime) + File.separator
                + AGGREGATION.getName());
        return config;
      }
    },
    TOPK {
      @Override
      Class<?> getKlazz() {
        return TopKPhaseJob.class;
      }

      @Override
      String getDescription() {
        return "Topk";
      }

      @Override
      Properties getJobProperties(Properties inputConfig, String root, String collection,
          DateTime minTime, DateTime maxTime, String inputPaths)
              throws Exception {
        Properties config = new Properties();
        // Prefer the aggregation phase output (if it ran) over the raw input paths.
        Path aggOutputPath = new Path(getIndexDir(root, collection, minTime, maxTime) + File.separator + AGGREGATION.getName());
        FileSystem fs = FileSystem.get(new Configuration());
        if (fs.exists(aggOutputPath)) {
          inputPaths = aggOutputPath.toString();
        }
        config.setProperty(TopKPhaseConstants.TOPK_PHASE_INPUT_PATH.toString(),
            inputPaths);
        config.setProperty(TopKPhaseConstants.TOPK_PHASE_OUTPUT_PATH.toString(),
            getIndexDir(root, collection, minTime, maxTime) + File.separator
                + TOPK.getName());
        return config;
      }
    },
    DERIVED_COLUMN_TRANSFORMATION {
      @Override
      Class<?> getKlazz() {
        return DerivedColumnTransformationPhaseJob.class;
      }

      @Override
      String getDescription() {
        return "Adds new columns for dimensions with topk or whitelist";
      }

      @Override
      Properties getJobProperties(Properties inputConfig, String root, String collection,
          DateTime minTime, DateTime maxTime, String inputPaths)
              throws Exception {
        Properties config = new Properties();
        // Prefer the aggregation phase output (if it ran) over the raw input paths.
        Path aggOutputPath = new Path(getIndexDir(root, collection, minTime, maxTime) + File.separator + AGGREGATION.getName());
        FileSystem fs = FileSystem.get(new Configuration());
        if (fs.exists(aggOutputPath)) {
          inputPaths = aggOutputPath.toString();
        }
        config.setProperty(DerivedColumnTransformationPhaseConstants.DERIVED_COLUMN_TRANSFORMATION_PHASE_INPUT_PATH.toString(),
            inputPaths);
        config.setProperty(DerivedColumnTransformationPhaseConstants.DERIVED_COLUMN_TRANSFORMATION_PHASE_OUTPUT_SCHEMA.toString(),
            getIndexDir(root, collection, minTime, maxTime));
        config.setProperty(DerivedColumnTransformationPhaseConstants.DERIVED_COLUMN_TRANSFORMATION_PHASE_OUTPUT_PATH.toString(),
            getIndexDir(root, collection, minTime, maxTime) + File.separator
                + DERIVED_COLUMN_TRANSFORMATION.getName());
        config.setProperty(DerivedColumnTransformationPhaseConstants.DERIVED_COLUMN_TRANSFORMATION_PHASE_TOPK_PATH.toString(),
            getIndexDir(root, collection, minTime, maxTime) + File.separator + TOPK.getName());
        return config;
      }
    },
    SEGMENT_CREATION {
      @Override
      Class<?> getKlazz() {
        return SegmentCreationPhaseJob.class;
      }

      @Override
      String getDescription() {
        return "Generates pinot segments";
      }

      @Override
      Properties getJobProperties(Properties inputConfig, String root, String collection,
          DateTime minTime, DateTime maxTime, String inputPaths)
              throws Exception {
        Properties config = new Properties();
        // Input priority: derived-column output > aggregation output > raw input paths.
        Path derivedOutputPath = new Path(getIndexDir(root, collection, minTime, maxTime) + File.separator +
            DERIVED_COLUMN_TRANSFORMATION.getName());
        Path aggregationOutputPath = new Path(getIndexDir(root, collection, minTime, maxTime) + File.separator +
            AGGREGATION.getName());
        FileSystem fs = FileSystem.get(new Configuration());
        if (fs.exists(derivedOutputPath)) {
          inputPaths = derivedOutputPath.toString();
        } else if (fs.exists(aggregationOutputPath)) {
          inputPaths = aggregationOutputPath.toString();
        }
        config.setProperty(SegmentCreationPhaseConstants.SEGMENT_CREATION_INPUT_PATH.toString(), inputPaths);
        config.setProperty(SegmentCreationPhaseConstants.SEGMENT_CREATION_OUTPUT_PATH.toString(),
            getIndexDir(root, collection, minTime, maxTime) + File.separator + SEGMENT_CREATION.getName());
        config.setProperty(SegmentCreationPhaseConstants.SEGMENT_CREATION_WALLCLOCK_START_TIME.toString(),
            String.valueOf(minTime.getMillis()));
        config.setProperty(SegmentCreationPhaseConstants.SEGMENT_CREATION_WALLCLOCK_END_TIME.toString(),
            String.valueOf(maxTime.getMillis()));
        String schedule = inputConfig.getProperty(ThirdEyeJobProperties.THIRDEYE_FLOW_SCHEDULE.getName());
        config.setProperty(SegmentCreationPhaseConstants.SEGMENT_CREATION_SCHEDULE.toString(), schedule);
        return config;
      }
    },
    SEGMENT_PUSH {
      @Override
      Class<?> getKlazz() {
        return SegmentPushPhase.class;
      }

      @Override
      String getDescription() {
        return "Pushes pinot segments to pinot controller";
      }

      @Override
      Properties getJobProperties(Properties inputConfig, String root, String collection,
          DateTime minTime, DateTime maxTime, String inputPaths)
              throws Exception {
        Properties config = new Properties();
        config.setProperty(SegmentPushPhaseConstants.SEGMENT_PUSH_INPUT_PATH.toString(),
            getIndexDir(root, collection, minTime, maxTime) + File.separator + SEGMENT_CREATION.getName());
        config.setProperty(SegmentPushPhaseConstants.SEGMENT_PUSH_CONTROLLER_HOSTS.toString(),
            inputConfig.getProperty(ThirdEyeJobProperties.THIRDEYE_PINOT_CONTROLLER_HOSTS.getName()));
        config.setProperty(SegmentPushPhaseConstants.SEGMENT_PUSH_CONTROLLER_PORT.toString(),
            inputConfig.getProperty(ThirdEyeJobProperties.THIRDEYE_PINOT_CONTROLLER_PORT.getName()));
        return config;
      }
    };

    /** @return the job class to instantiate reflectively, or null for inline phases */
    abstract Class<?> getKlazz();

    /** @return a one-line human-readable description for the usage message */
    abstract String getDescription();

    /** Builds the phase-specific properties from the shared input config. */
    abstract Properties getJobProperties(Properties inputConfig, String root, String collection,
        DateTime minTime, DateTime maxTime, String inputPaths) throws Exception;

    /** @return the lowercase phase name used in paths and the usage message */
    String getName() {
      return this.name().toLowerCase();
    }

    /**
     * Per-run output directory: {@code <root>/<collection>/data_<minTime>_<maxTime>}.
     */
    String getIndexDir(String root, String collection, DateTime minTime,
        DateTime maxTime) throws IOException {
      return getCollectionDir(root, collection) + File.separator
          + "data_" + ThirdEyeConstants.DATE_TIME_FORMATTER.print(minTime) + "_"
          + ThirdEyeConstants.DATE_TIME_FORMATTER.print(maxTime);
    }
  }

  /** Prints the usage string and the list of known phases to stderr. */
  private static void usage() {
    System.err.println(USAGE);
    for (PhaseSpec phase : PhaseSpec.values()) {
      System.err.printf("%-30s : %s%n", phase.getName(), phase.getDescription());
    }
  }

  /**
   * Returns the value of a required property.
   *
   * @throws IllegalArgumentException if the property is absent
   */
  private static String getAndCheck(String name, Properties properties) {
    String value = properties.getProperty(name);
    if (value == null) {
      throw new IllegalArgumentException("Must provide " + name);
    }
    return value;
  }

  private static String getCollectionDir(String root, String collection) {
    return root == null ? collection : root + File.separator + collection;
  }

  /**
   * Applies extra Hadoop settings from the THIRDEYE_MR_CONF property, expected
   * as a comma-separated list of {@code key=value} pairs; malformed entries are
   * silently skipped.
   */
  private void setMapreduceConfig(Configuration configuration) {
    String mapreduceConfig =
        inputConfig.getProperty(ThirdEyeJobProperties.THIRDEYE_MR_CONF.getName());
    if (mapreduceConfig != null && !mapreduceConfig.isEmpty()) {
      String[] options = mapreduceConfig.split(",");
      for (String option : options) {
        // Split on the first '=' only, so values may themselves contain '='.
        String[] configs = option.split("=", 2);
        if (configs.length == 2) {
          LOGGER.info("Setting job configuration {} to {}", configs[0], configs[1]);
          configuration.set(configs[0], configs[1]);
        }
      }
    }
  }

  /**
   * Resolves the phase, builds its properties, runs it, and fails loudly if the
   * underlying Hadoop job did not succeed.
   *
   * @throws Exception if the phase name is unknown, a required property is
   *         missing, or the phase job fails
   */
  @SuppressWarnings("unchecked")
  public void run() throws Exception {
    LOGGER.info("Input config:{}", inputConfig);
    PhaseSpec phaseSpec;
    try {
      phaseSpec = PhaseSpec.valueOf(phaseName.toUpperCase());
    } catch (Exception e) {
      usage();
      throw e;
    }

    // Phases that do not follow the reflective job-class convention run inline.
    if (PhaseSpec.TRANSFORM.equals(phaseSpec)) {
      TransformPhaseJob job = new TransformPhaseJob("Transform Job", inputConfig);
      job.run();
      return;
    } else if (PhaseSpec.JOIN.equals(phaseSpec)) {
      JoinPhaseJob job = new JoinPhaseJob("Join Job", inputConfig);
      job.run();
      return;
    } else if (PhaseSpec.WAIT.equals(phaseSpec)) {
      WaitPhaseJob job = new WaitPhaseJob("Wait for inputs", inputConfig);
      job.run();
      return;
    }

    // Get root, collection, input paths
    String root = getAndCheck(ThirdEyeJobProperties.THIRDEYE_ROOT.getName(), inputConfig);
    String collection =
        getAndCheck(ThirdEyeJobProperties.THIRDEYE_COLLECTION.getName(), inputConfig);
    String inputPaths = getAndCheck(ThirdEyeJobProperties.INPUT_PATHS.getName(), inputConfig);

    // Get min / max time. Validate presence up front so a missing property fails
    // with a clear message instead of an NPE inside the date parser.
    String minTimeProp =
        getAndCheck(ThirdEyeJobProperties.THIRDEYE_TIME_MIN.getName(), inputConfig);
    String maxTimeProp =
        getAndCheck(ThirdEyeJobProperties.THIRDEYE_TIME_MAX.getName(), inputConfig);
    DateTime minTime = ISODateTimeFormat.dateTimeParser().parseDateTime(minTimeProp);
    DateTime maxTime = ISODateTimeFormat.dateTimeParser().parseDateTime(maxTimeProp);

    Properties jobProperties = phaseSpec.getJobProperties(inputConfig, root, collection,
        minTime, maxTime, inputPaths);
    // Overlay the raw input config so phase jobs can read any extra settings.
    for (Object key : inputConfig.keySet()) {
      jobProperties.setProperty(key.toString(), inputConfig.getProperty(key.toString()));
    }

    // Instantiate the job via the (String, Properties) constructor convention.
    Constructor<Configured> constructor = (Constructor<Configured>) phaseSpec.getKlazz()
        .getConstructor(String.class, Properties.class);
    Configured instance = constructor.newInstance(phaseSpec.getName(), jobProperties);
    setMapreduceConfig(instance.getConf());

    // Run the job
    Method runMethod = instance.getClass().getMethod("run");
    Job job = (Job) runMethod.invoke(instance);
    if (job != null) {
      JobStatus status = job.getStatus();
      if (status.getState() != JobStatus.State.SUCCEEDED) {
        throw new RuntimeException(
            "Job " + job.getJobName() + " failed to execute: Ran with config:" + jobProperties);
      }
    }
  }

  /**
   * Entry point: {@code phase_name job.properties}.
   */
  public static void main(String[] args) throws Exception {
    if (args.length != 2) {
      usage();
      System.exit(1);
    }
    String phaseName = args[0];
    Properties config = new Properties();
    // try-with-resources: the original leaked this stream.
    try (FileInputStream in = new FileInputStream(args[1])) {
      config.load(in);
    }
    new ThirdEyeJob(phaseName, config).run();
  }
}