/**
 * Copyright (C) 2014-2015 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.thirdeye.hadoop.segment.creation;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.dataformat.yaml.YAMLFactory;
import com.google.common.base.Joiner;
import com.linkedin.pinot.common.data.FieldSpec;
import com.linkedin.pinot.common.data.Schema;
import com.linkedin.pinot.common.data.StarTreeIndexSpec;
import com.linkedin.pinot.common.data.TimeGranularitySpec.TimeFormat;
import com.linkedin.pinot.common.utils.TarGzCompressionUtils;
import com.linkedin.pinot.core.data.readers.FileFormat;
import com.linkedin.pinot.core.indexsegment.generator.SegmentGeneratorConfig;
import com.linkedin.pinot.core.segment.creator.StatsCollectorConfig;
import com.linkedin.pinot.core.segment.creator.impl.SegmentIndexCreationDriverImpl;
import com.linkedin.pinot.core.segment.creator.impl.stats.LongColumnPreIndexStatsCollector;
import com.linkedin.thirdeye.hadoop.config.ThirdEyeConfig;
import com.linkedin.thirdeye.hadoop.config.ThirdEyeConstants;
import com.linkedin.thirdeye.hadoop.util.ThirdeyePinotSchemaUtils;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.TimeUnit;

import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static com.linkedin.pinot.core.segment.creator.impl.V1Constants.MetadataKeys.Segment.SEGMENT_START_TIME;
import static com.linkedin.pinot.core.segment.creator.impl.V1Constants.MetadataKeys.Segment.SEGMENT_END_TIME;
import static com.linkedin.thirdeye.hadoop.segment.creation.SegmentCreationPhaseConstants.SEGMENT_CREATION_OUTPUT_PATH;
import static com.linkedin.thirdeye.hadoop.segment.creation.SegmentCreationPhaseConstants.SEGMENT_CREATION_THIRDEYE_CONFIG;
import static com.linkedin.thirdeye.hadoop.segment.creation.SegmentCreationPhaseConstants.SEGMENT_CREATION_WALLCLOCK_START_TIME;
import static com.linkedin.thirdeye.hadoop.segment.creation.SegmentCreationPhaseConstants.SEGMENT_CREATION_WALLCLOCK_END_TIME;
import static com.linkedin.thirdeye.hadoop.segment.creation.SegmentCreationPhaseConstants.SEGMENT_CREATION_SCHEDULE;
import static com.linkedin.thirdeye.hadoop.segment.creation.SegmentCreationPhaseConstants.SEGMENT_CREATION_BACKFILL;

/**
 * Mapper class for SegmentCreation job, which sets configs required for
 * segment generation with star tree index
 */
public class SegmentCreationPhaseMapReduceJob {

  public static class SegmentCreationMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
    private static final Logger LOGGER = LoggerFactory.getLogger(SegmentCreationPhaseMapReduceJob.class);
    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(new YAMLFactory());

    private Configuration properties;

    private String inputFilePath;
    private String outputPath;
    private String tableName;

    private Path currentHdfsWorkDir;
    private String currentDiskWorkDir;

    // Temporary HDFS path for local machine
    private String localHdfsSegmentTarPath;
    private String localDiskSegmentDirectory;
    private String localDiskSegmentTarPath;

    private ThirdEyeConfig thirdeyeConfig;
    private Schema schema;

    private Long segmentWallClockStartTime;
    private Long segmentWallClockEndTime;
    private String segmentSchedule;
    private boolean isBackfill;

    @Override
    public void setup(Context context) throws IOException, InterruptedException {

      currentHdfsWorkDir = FileOutputFormat.getWorkOutputPath(context);
      currentDiskWorkDir = "pinot_hadoop_tmp";

      // Temporary HDFS path for local machine
      localHdfsSegmentTarPath = currentHdfsWorkDir + "/segmentTar";

      // Temporary DISK path for local machine
      localDiskSegmentDirectory = currentDiskWorkDir + "/segments/";
      localDiskSegmentTarPath = currentDiskWorkDir + "/segmentsTar/";
      new File(localDiskSegmentTarPath).mkdirs();

      LOGGER.info("*********************************************************************");
      LOGGER.info("Configurations : {}", context.getConfiguration().toString());
      LOGGER.info("*********************************************************************");
      LOGGER.info("Current HDFS working dir : {}", currentHdfsWorkDir);
      LOGGER.info("Current DISK working dir : {}", new File(currentDiskWorkDir).getAbsolutePath());
      LOGGER.info("*********************************************************************");

      properties = context.getConfiguration();
      outputPath = properties.get(SEGMENT_CREATION_OUTPUT_PATH.toString());

      thirdeyeConfig = OBJECT_MAPPER.readValue(properties.get(SEGMENT_CREATION_THIRDEYE_CONFIG.toString()), ThirdEyeConfig.class);
      LOGGER.info(thirdeyeConfig.encode());
      schema = ThirdeyePinotSchemaUtils.createSchema(thirdeyeConfig);
      tableName = thirdeyeConfig.getCollection();

      segmentWallClockStartTime = Long.valueOf(properties.get(SEGMENT_CREATION_WALLCLOCK_START_TIME.toString()));
      segmentWallClockEndTime = Long.valueOf(properties.get(SEGMENT_CREATION_WALLCLOCK_END_TIME.toString()));
      segmentSchedule = properties.get(SEGMENT_CREATION_SCHEDULE.toString());
      isBackfill = Boolean.valueOf(properties.get(SEGMENT_CREATION_BACKFILL.toString()));
    }

    @Override
    public void cleanup(Context context) throws IOException, InterruptedException {
      FileUtils.deleteQuietly(new File(currentDiskWorkDir));
    }
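
    // Note on the mapper input, inferred from the parsing below: each input line is expected to
    // carry exactly three space-separated tokens: a leading token (unused here), the HDFS path to
    // an avro input file, and a sequence id. An illustrative (hypothetical) line would be:
    //   0 hdfs://namenode/thirdeye/output/part-m-00000.avro 0
    // The sequence id is parsed as a long and used both as the segment name postfix and as this
    // mapper's output key.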
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

      String line = value.toString();
      String[] lineSplits = line.split(" ");

      LOGGER.info("*********************************************************************");
      LOGGER.info("mapper input : {}", value);
      LOGGER.info("Path to output : {}", outputPath);
      LOGGER.info("Table name : {}", tableName);
      LOGGER.info("num lines : {}", lineSplits.length);

      for (String split : lineSplits) {
        LOGGER.info("Command line : {}", split);
      }
      LOGGER.info("*********************************************************************");

      if (lineSplits.length != 3) {
        throw new RuntimeException("Input to the mapper is malformed, please contact the pinot team");
      }
      inputFilePath = lineSplits[1].trim();

      LOGGER.info("*********************************************************************");
      LOGGER.info("input data file path : {}", inputFilePath);
      LOGGER.info("local hdfs segment tar path: {}", localHdfsSegmentTarPath);
      LOGGER.info("local disk segment path: {}", localDiskSegmentDirectory);
      LOGGER.info("*********************************************************************");

      try {
        createSegment(inputFilePath, schema, lineSplits[2]);
        LOGGER.info("finished segment creation job successfully");
      } catch (Exception e) {
        LOGGER.error("Got exception while creating segment!", e);
      }

      context.write(new LongWritable(Long.parseLong(lineSplits[2])),
          new Text(FileSystem.get(new Configuration()).listStatus(new Path(localHdfsSegmentTarPath + "/"))[0].getPath().getName()));
      LOGGER.info("finished the job successfully");
    }

    private String createSegment(String dataFilePath, Schema schema, String seqId) throws Exception {
      final FileSystem fs = FileSystem.get(new Configuration());
      final Path hdfsDataPath = new Path(dataFilePath);
      final File dataPath = new File(currentDiskWorkDir, "data");
      if (dataPath.exists()) {
        dataPath.delete();
      }
      dataPath.mkdir();
      final Path localFilePath = new Path(dataPath + "/" + hdfsDataPath.getName());
      fs.copyToLocalFile(hdfsDataPath, localFilePath);

      LOGGER.info("Data schema is : {}", schema);

      // Set segment generator config
      LOGGER.info("*********************************************************************");
      SegmentGeneratorConfig segmentGeneratorConfig = new SegmentGeneratorConfig(schema);
      segmentGeneratorConfig.setTableName(tableName);
      segmentGeneratorConfig.setInputFilePath(new File(dataPath, hdfsDataPath.getName()).getAbsolutePath());
      LOGGER.info("Setting input path {}", segmentGeneratorConfig.getInputFilePath());
      segmentGeneratorConfig.setFormat(FileFormat.AVRO);
      segmentGeneratorConfig.setSegmentNamePostfix(seqId);
      segmentGeneratorConfig.setOutDir(localDiskSegmentDirectory);
      segmentGeneratorConfig.setEnableStarTreeIndex(true);
      LOGGER.info("Setting enableStarTreeIndex");

      String minTime = ThirdEyeConstants.DATE_TIME_FORMATTER.print(segmentWallClockStartTime);
      String maxTime = ThirdEyeConstants.DATE_TIME_FORMATTER.print(segmentWallClockEndTime);
      LOGGER.info("Wall clock time : min {} max {}", minTime, maxTime);

      LOGGER.info("isBackfill : {}", isBackfill);
      if (isBackfill) {
        // In case of backfill, we have to ensure that the segment name is the same as the original segment name.
        // The segment name is retained through the backfill and derived_column_transformation phases
        // in the output files they generate:
        // backfill generates original_segment_name.avro,
        // derived_column_transformation generates original_segment_name-m-00000.avro etc.
        String segmentName = hdfsDataPath.getName().split("-(m|r)-[0-9]{5}")[0];
        segmentName = segmentName.split(ThirdEyeConstants.AVRO_SUFFIX)[0];
        segmentGeneratorConfig.setSegmentName(segmentName);
      } else {
        String segmentName = Joiner.on(ThirdEyeConstants.SEGMENT_JOINER).join(tableName, segmentSchedule, minTime, maxTime, seqId);
        segmentGeneratorConfig.setSegmentName(segmentName);
      }
      LOGGER.info("Setting segment name {}", segmentGeneratorConfig.getSegmentName());
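
      // Illustration of the backfill name derivation above (file name is hypothetical): for an input
      // file named "original_segment_name-m-00000.avro", the regex split drops everything from
      // "-m-00000" onwards, and the AVRO_SUFFIX split strips a trailing ".avro" when present
      // (assuming ThirdEyeConstants.AVRO_SUFFIX is the avro file extension), leaving the segment
      // name "original_segment_name", which matches the segment being backfilled.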

      // Set star tree config
      StarTreeIndexSpec starTreeIndexSpec = new StarTreeIndexSpec();

      // _raw dimensions should not be in star tree split order
      // if a dimension has a _topk column, we will include only
      // the column with topk, and skip _raw column for materialization in star tree
      Set<String> skipMaterializationForDimensions = new HashSet<>();
      Set<String> transformDimensionsSet = thirdeyeConfig.getTransformDimensions();
      LOGGER.info("Dimensions with _topk column {}", transformDimensionsSet);
      for (String topkTransformDimension : transformDimensionsSet) {
        skipMaterializationForDimensions.add(topkTransformDimension);
        LOGGER.info("Adding {} to skipMaterialization set", topkTransformDimension);
      }
      starTreeIndexSpec.setSkipMaterializationForDimensions(skipMaterializationForDimensions);
      LOGGER.info("Setting skipMaterializationForDimensions {}", skipMaterializationForDimensions);

      if (thirdeyeConfig.getSplit() != null) {
        starTreeIndexSpec.setMaxLeafRecords(thirdeyeConfig.getSplit().getThreshold());
        LOGGER.info("Setting split threshold to {}", starTreeIndexSpec.getMaxLeafRecords());
        List<String> splitOrder = thirdeyeConfig.getSplit().getOrder();
        if (splitOrder != null) {
          LOGGER.info("Removing from splitOrder, any dimensions which are also in skipMaterializationForDimensions");
          splitOrder.removeAll(skipMaterializationForDimensions);
          starTreeIndexSpec.setDimensionsSplitOrder(splitOrder);
        }
        LOGGER.info("Setting splitOrder {}", splitOrder);
      }
      segmentGeneratorConfig.setStarTreeIndexSpec(starTreeIndexSpec);
      LOGGER.info("*********************************************************************");

      // Set time for SIMPLE_DATE_FORMAT case
      String sdfPrefix = TimeFormat.SIMPLE_DATE_FORMAT.toString() + ThirdEyeConstants.SDF_SEPARATOR;
      if (thirdeyeConfig.getTime().getTimeFormat().startsWith(sdfPrefix)) {
        String pattern = thirdeyeConfig.getTime().getTimeFormat().split(ThirdEyeConstants.SDF_SEPARATOR)[1];
        DateTimeFormatter sdfFormatter = DateTimeFormat.forPattern(pattern);

        File localAvroFile = new File(dataPath, hdfsDataPath.getName());
        LongColumnPreIndexStatsCollector timeColumnStatisticsCollector = getTimeColumnStatsCollector(schema, localAvroFile);
        String startTime = timeColumnStatisticsCollector.getMinValue().toString();
        String endTime = timeColumnStatisticsCollector.getMaxValue().toString();
        startTime = String.valueOf(DateTime.parse(startTime, sdfFormatter).getMillis());
        endTime = String.valueOf(DateTime.parse(endTime, sdfFormatter).getMillis());

        // set start time
        segmentGeneratorConfig.getCustomProperties().put(SEGMENT_START_TIME, startTime);
        // set end time
        segmentGeneratorConfig.getCustomProperties().put(SEGMENT_END_TIME, endTime);
        // set time unit
        segmentGeneratorConfig.setSegmentTimeUnit(TimeUnit.MILLISECONDS);
      }
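
      // Illustration of the SIMPLE_DATE_FORMAT handling above (the pattern is hypothetical): for a
      // time format value of the form SIMPLE_DATE_FORMAT<SDF_SEPARATOR>yyyyMMdd, the min/max values
      // collected from the avro time column (e.g. "20150101") are parsed with the "yyyyMMdd" pattern
      // and stored in the segment metadata as epoch milliseconds, with the segment time unit set to
      // MILLISECONDS.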

      // Generate segment
      SegmentIndexCreationDriverImpl driver = new SegmentIndexCreationDriverImpl();
      driver.init(segmentGeneratorConfig);
      driver.build();

      // Tar the segment directory into a file
      String segmentName = null;
      File localDiskSegmentDirectoryFile = new File(localDiskSegmentDirectory);
      for (File file : localDiskSegmentDirectoryFile.listFiles()) {
        segmentName = file.getName();
        if (segmentName.startsWith(tableName)) {
          break;
        }
      }
      String localSegmentPath = new File(localDiskSegmentDirectory, segmentName).getAbsolutePath();
      String localTarPath = localDiskSegmentTarPath + "/" + segmentName + ".tar.gz";
      LOGGER.info("Trying to tar the segment to: {}", localTarPath);
      TarGzCompressionUtils.createTarGzOfDirectory(localSegmentPath, localTarPath);
      String hdfsTarPath = localHdfsSegmentTarPath + "/" + segmentName + ".tar.gz";

      LOGGER.info("*********************************************************************");
      LOGGER.info("Copy from : {} to {}", localTarPath, hdfsTarPath);
      LOGGER.info("*********************************************************************");
      fs.copyFromLocalFile(true, true, new Path(localTarPath), new Path(hdfsTarPath));

      return segmentName;
    }

    private LongColumnPreIndexStatsCollector getTimeColumnStatsCollector(Schema schema, File localAvroFile)
        throws FileNotFoundException, IOException {
      String timeColumnName = schema.getTimeColumnName();
      FieldSpec spec = schema.getTimeFieldSpec();
      LOGGER.info("Spec for " + timeColumnName + " is " + spec);
      LongColumnPreIndexStatsCollector timeColumnStatisticsCollector =
          new LongColumnPreIndexStatsCollector(spec.getName(), new StatsCollectorConfig(schema, null));
      LOGGER.info("StatsCollector :" + timeColumnStatisticsCollector);
      DataFileStream<GenericRecord> dataStream =
          new DataFileStream<GenericRecord>(new FileInputStream(localAvroFile), new GenericDatumReader<GenericRecord>());
      while (dataStream.hasNext()) {
        GenericRecord next = dataStream.next();
        timeColumnStatisticsCollector.collect(next.get(timeColumnName));
      }
      dataStream.close();
      timeColumnStatisticsCollector.seal();
      return timeColumnStatisticsCollector;
    }
  }
}