package org.apache.blur.mapreduce.lib;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.PrintWriter;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.atomic.AtomicReference;

import org.apache.blur.log.Log;
import org.apache.blur.log.LogFactory;
import org.apache.blur.thrift.BlurClient;
import org.apache.blur.thrift.generated.Blur.Iface;
import org.apache.blur.thrift.generated.TableDescriptor;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.GenericOptionsParser;

import com.google.common.base.Splitter;

@SuppressWarnings("static-access")
public class CsvBlurDriver {

  private static final Log LOG = LogFactory.getLog(CsvBlurDriver.class);

  public static final String CSVLOADER = "csvloader";
  public static final String MAPRED_COMPRESS_MAP_OUTPUT = "mapred.compress.map.output";
  public static final String MAPRED_MAP_OUTPUT_COMPRESSION_CODEC = "mapred.map.output.compression.codec";
  public static final int DEFAULT_WIDTH = 100;
  public static final String HEADER = "The \"" + CSVLOADER
      + "\" command is used to load delimited files into a Blur table.\nThe required options are \"-c\", \"-t\", "
      + "\"-d\". The standard format for the contents of a file is: \"rowid,recordid,family,col1,col2,...\". "
      + "However, there are several options; for example, the rowid and recordid can be generated based on the "
      + "data in the record via the \"-A\" and \"-a\" options. The family can be assigned based on the path via "
      + "the \"-I\" option. The column name order can be mapped via the \"-d\" option. You can also set the input "
      + "format to sequence files via the \"-S\" option, or leave the default of text files.";
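  /*
   * Illustrative invocation only; the controller host, table name, and input path below are
   * hypothetical:
   *
   *   csvloader -c controller1:40010 -t table1 \
   *       -d fam0 col1 col2 col3 \
   *       -i hdfs://namenode/input/in1
   *
   * With the "-d" mapping above, a text line such as "row1,record1,fam0,value1,value2,value3"
   * is loaded as rowid "row1", recordid "record1", family "fam0", and columns col1..col3.
   */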
  enum COMPRESSION {
    SNAPPY(SnappyCodec.class), GZIP(GzipCodec.class), BZIP(BZip2Codec.class), DEFAULT(DefaultCodec.class);

    private final String className;

    private COMPRESSION(Class<? extends CompressionCodec> clazz) {
      className = clazz.getName();
    }

    public String getClassName() {
      return className;
    }
  }

  interface ControllerPool {
    Iface getClient(String controllerConnectionStr);
  }

  public static void main(String... args) throws Exception {
    Configuration configuration = new Configuration();
    String[] otherArgs = new GenericOptionsParser(configuration, args).getRemainingArgs();
    AtomicReference<Callable<Void>> ref = new AtomicReference<Callable<Void>>();
    Job job = setupJob(configuration, new ControllerPool() {
      @Override
      public Iface getClient(String controllerConnectionStr) {
        return BlurClient.getClient(controllerConnectionStr);
      }
    }, ref, otherArgs);
    if (job == null) {
      System.exit(1);
    }
    boolean waitForCompletion = job.waitForCompletion(true);
    if (waitForCompletion) {
      Callable<Void> callable = ref.get();
      if (callable != null) {
        callable.call();
      }
    }
    System.exit(waitForCompletion ? 0 : 1);
  }

  public static Job setupJob(Configuration configuration, ControllerPool controllerPool,
      AtomicReference<Callable<Void>> ref, String... otherArgs) throws Exception {
    CommandLine cmd = parse(otherArgs);
    if (cmd == null) {
      return null;
    }

    final String controllerConnectionStr = cmd.getOptionValue("c");
    final String tableName = cmd.getOptionValue("t");

    final Iface client = controllerPool.getClient(controllerConnectionStr);
    TableDescriptor tableDescriptor = client.describe(tableName);

    Job job = Job.getInstance(configuration, "Blur indexer [" + tableName + "]");
    job.setJarByClass(CsvBlurDriver.class);
    job.setMapperClass(CsvBlurMapper.class);

    if (cmd.hasOption("p")) {
      job.getConfiguration().set(MAPRED_COMPRESS_MAP_OUTPUT, "true");
      String codecStr = cmd.getOptionValue("p");
      COMPRESSION compression;
      try {
        compression = COMPRESSION.valueOf(codecStr.trim().toUpperCase());
      } catch (IllegalArgumentException e) {
        compression = null;
      }
      if (compression == null) {
        job.getConfiguration().set(MAPRED_MAP_OUTPUT_COMPRESSION_CODEC, codecStr.trim());
      } else {
        job.getConfiguration().set(MAPRED_MAP_OUTPUT_COMPRESSION_CODEC, compression.getClassName());
      }
    }
    if (cmd.hasOption("a")) {
      CsvBlurMapper.setAutoGenerateRecordIdAsHashOfData(job, true);
    }
    if (cmd.hasOption("A")) {
      CsvBlurMapper.setAutoGenerateRowIdAsHashOfData(job, true);
    }
    if (cmd.hasOption("S")) {
      job.setInputFormatClass(SequenceFileInputFormat.class);
    } else {
      job.setInputFormatClass(TextInputFormat.class);
    }

    if (cmd.hasOption("C")) {
      if (cmd.hasOption("S")) {
        String[] optionValues = cmd.getOptionValues("C");
        job.setInputFormatClass(CsvBlurCombineSequenceFileInputFormat.class);
        CombineFileInputFormat.setMinInputSplitSize(job, Long.parseLong(optionValues[0]));
        CombineFileInputFormat.setMaxInputSplitSize(job, Long.parseLong(optionValues[1]));
      } else {
        System.err.println("'C' can only be used with option 'S'");
        return null;
      }
    }
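    // Add the inputs supplied via '-i'. Each path is expanded to the set of directories that
    // directly contain files (searched recursively); if none are found, the path is added as given.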
    if (cmd.hasOption("i")) {
      for (String input : cmd.getOptionValues("i")) {
        Path path = new Path(input);
        Set<Path> pathSet = recurisvelyGetPathesContainingFiles(path, job.getConfiguration());
        if (pathSet.isEmpty()) {
          FileInputFormat.addInputPath(job, path);
        } else {
          for (Path p : pathSet) {
            FileInputFormat.addInputPath(job, p);
          }
        }
      }
    }
    // Process the 'I' option.
    if (cmd.hasOption("I")) {
      if (cmd.hasOption("C")) {
        System.err.println("The 'I' and 'C' parameters cannot be used together.");
        return null;
      }
      Option[] options = cmd.getOptions();
      for (Option option : options) {
        if (option.getOpt().equals("I")) {
          String[] values = option.getValues();
          if (values.length < 2) {
            System.err.println("'I' parameter missing minimum args of (family path*)");
            return null;
          }
          for (String p : getSubArray(values, 1)) {
            Path path = new Path(p);
            CsvBlurMapper.addFamilyPath(job, values[0], path);
            FileInputFormat.addInputPath(job, path);
          }
        }
      }
    }

    if (cmd.hasOption("s")) {
      CsvBlurMapper.setSeparator(job, StringEscapeUtils.unescapeJava(cmd.getOptionValue("s")));
    }
    if (cmd.hasOption("o")) {
      BlurOutputFormat.setOptimizeInFlight(job, false);
    }
    if (cmd.hasOption("l")) {
      BlurOutputFormat.setIndexLocally(job, false);
    }
    if (cmd.hasOption("b")) {
      int maxDocumentBufferSize = Integer.parseInt(cmd.getOptionValue("b"));
      BlurOutputFormat.setMaxDocumentBufferSize(job, maxDocumentBufferSize);
    }
    // Process the 'd' option.
    Option[] options = cmd.getOptions();
    for (Option option : options) {
      if (option.getOpt().equals("d")) {
        String[] values = option.getValues();
        if (values.length < 2) {
          System.err.println("'d' parameter missing minimum args of (family columnname*)");
          return null;
        }
        CsvBlurMapper.addColumns(job, values[0], getSubArray(values, 1));
      }
    }
    BlurOutputFormat.setupJob(job, tableDescriptor);
    BlurMapReduceUtil.addDependencyJars(job.getConfiguration(), Splitter.class);
    if (cmd.hasOption("r")) {
      int reducerMultiplier = Integer.parseInt(cmd.getOptionValue("r"));
      BlurOutputFormat.setReducerMultiplier(job, reducerMultiplier);
    }
    final Path output;
    if (cmd.hasOption("out")) {
      output = new Path(cmd.getOptionValue("out"));
    } else {
      UserGroupInformation currentUser = UserGroupInformation.getCurrentUser();
      String userName = currentUser.getUserName();
      output = new Path("/user/" + userName + "/.blur-" + System.currentTimeMillis());
    }
    BlurOutputFormat.setOutputPath(job, output);
    if (cmd.hasOption("import")) {
      ref.set(new Callable<Void>() {
        @Override
        public Void call() throws Exception {
          client.loadData(tableName, output.toUri().toString());
          return null;
        }
      });
    }
    return job;
  }

  private static String[] getSubArray(String[] array, int starting) {
    String[] result = new String[array.length - starting];
    System.arraycopy(array, starting, result, 0, result.length);
    return result;
  }

  private static Set<Path> recurisvelyGetPathesContainingFiles(Path path, Configuration configuration)
      throws IOException {
    Set<Path> pathSet = new HashSet<Path>();
    FileSystem fileSystem = path.getFileSystem(configuration);
    if (!fileSystem.exists(path)) {
      LOG.warn("Path not found [{0}]", path);
      return pathSet;
    }
    FileStatus[] listStatus = fileSystem.listStatus(path);
    for (FileStatus status : listStatus) {
      if (status.isDir()) {
        pathSet.addAll(recurisvelyGetPathesContainingFiles(status.getPath(), configuration));
      } else {
        pathSet.add(status.getPath().getParent());
      }
    }
    return pathSet;
  }
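  /**
   * Parses the command line arguments. Prints the usage help and returns null when parsing fails
   * or when neither the '-i' nor the '-I' input option is supplied.
   */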
  private static CommandLine parse(String... otherArgs) throws ParseException {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("controller*").hasArgs().isRequired(true)
        .withDescription("* Thrift controller connection string. (host1:40010 host2:40010 ...)").create("c"));
    options.addOption(OptionBuilder.withArgName("tablename").hasArg().isRequired(true)
        .withDescription("* Blur table name.").create("t"));
    options.addOption(OptionBuilder.withArgName("family column*").hasArgs().isRequired(true)
        .withDescription("* Define the mapping of fields in the CSV file to column names. (family col1 col2 col3 ...)")
        .create("d"));
    options.addOption(OptionBuilder
        .withArgName("delimiter")
        .hasArg()
        .withDescription(
            "The file delimiter to be used. (default value ',') NOTE: For special "
                + "characters like the default Hadoop separator of ASCII value 1, you can use standard "
                + "Java escaping (\\u0001)").create("s"));
    options.addOption(OptionBuilder
        .withArgName("path*")
        .hasArg()
        .withDescription(
            "The directory to index, the family name is assumed to BE present in the file contents. (hdfs://namenode/input/in1)")
        .create("i"));
    options.addOption(OptionBuilder
        .withArgName("family path*")
        .hasArgs()
        .withDescription(
            "The directory to index with a family name, the family name is assumed to NOT be present in the file contents. (family hdfs://namenode/input/in1)")
        .create("I"));
    options.addOption(OptionBuilder
        .withArgName("auto generate record ids")
        .withDescription(
            "No Record Ids - Automatically generate record ids for each record based on an MD5 hash of the data within the record.")
        .create("a"));
    options.addOption(OptionBuilder
        .withArgName("auto generate row ids")
        .withDescription(
            "No Row Ids - Automatically generate row ids for each record based on an MD5 hash of the data within the record.")
        .create("A"));
    options.addOption(OptionBuilder
        .withArgName("disable optimize indexes during copy")
        .withDescription(
            "Disable optimizing the indexes during the copy; this has very little overhead. (enabled by default)")
        .create("o"));
    options.addOption(OptionBuilder
        .withArgName("disable index locally")
        .withDescription(
            "Disable the use of storage local to the server that is running the reducing "
                + "task and the copy to the Blur table once complete. (enabled by default)").create("l"));
    options.addOption(OptionBuilder.withArgName("sequence files inputs")
        .withDescription("The input files are sequence files.").create("S"));
    options.addOption(OptionBuilder
        .withArgName("size")
        .hasArg()
        .withDescription(
            "The maximum number of Lucene documents to buffer in the reducer for a single "
                + "row before spilling over to disk. (default 1000)").create("b"));
    options.addOption(OptionBuilder
        .withArgName("multiplier")
        .hasArg()
        .withDescription(
            "The reducer multiplier allows for an increase in the number of reducers per "
                + "shard in the given table. For example, if the table has 128 shards and the "
                + "reducer multiplier is 4, the total number of reducers will be 512, 4 reducers "
                + "per shard. (default 1)").create("r"));
    options.addOption(OptionBuilder
        .withArgName("minimum maximum")
        .hasArgs(2)
        .withDescription(
            "Enables a combine file input to help deal with many small files as the input. Provide "
                + "the minimum and maximum size per mapper. For a minimum of 1GB and a maximum of "
                + "2.5GB: (1000000000 2500000000)").create("C"));
    options.addOption(OptionBuilder
        .withArgName("codec")
        .hasArgs(1)
        .withDescription(
            "Sets the compression codec for the map compress output setting. (SNAPPY, GZIP, BZIP, DEFAULT, or a codec class name)")
        .create("p"));
    options.addOption(OptionBuilder.withArgName("path").hasArg()
        .withDescription("Sets the output directory for the map reduce job before the indexes are loaded into Blur.")
        .create("out"));
    options.addOption(OptionBuilder.withArgName("path").hasArg()
        .withDescription("Imports the data into Blur after the map reduce job completes.")
        .create("import"));

    CommandLineParser parser = new PosixParser();
    CommandLine cmd = null;
    try {
      cmd = parser.parse(options, otherArgs);
    } catch (ParseException e) {
      System.err.println(e.getMessage());
      HelpFormatter formatter = new HelpFormatter();
      PrintWriter pw = new PrintWriter(System.err, true);
      formatter.printHelp(pw, DEFAULT_WIDTH, CSVLOADER, HEADER, options, HelpFormatter.DEFAULT_LEFT_PAD,
          HelpFormatter.DEFAULT_DESC_PAD, null, false);
      return null;
    }

    if (!(cmd.hasOption("I") || cmd.hasOption("i"))) {
      System.err.println("Missing input directory, see options 'i' and 'I'.");
      HelpFormatter formatter = new HelpFormatter();
      PrintWriter pw = new PrintWriter(System.err, true);
      formatter.printHelp(pw, DEFAULT_WIDTH, CSVLOADER, HEADER, options, HelpFormatter.DEFAULT_LEFT_PAD,
          HelpFormatter.DEFAULT_DESC_PAD, null, false);
      return null;
    }
    return cmd;
  }
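  /**
   * A CombineFileInputFormat that wraps the sequence file record reader so that many small
   * sequence files can be read by a single map task (used when both '-C' and '-S' are supplied).
   */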
  public static class CsvBlurCombineSequenceFileInputFormat extends CombineFileInputFormat<Writable, Text> {

    private static class SequenceFileRecordReaderWrapper extends RecordReader<Writable, Text> {

      private final RecordReader<Writable, Text> delegate;
      private final FileSplit fileSplit;

      @SuppressWarnings("unused")
      public SequenceFileRecordReaderWrapper(CombineFileSplit split, TaskAttemptContext context, Integer index)
          throws IOException {
        fileSplit = new FileSplit(split.getPath(index), split.getOffset(index), split.getLength(index),
            split.getLocations());
        delegate = new SequenceFileInputFormat<Writable, Text>().createRecordReader(fileSplit, context);
      }

      @Override
      public float getProgress() throws IOException, InterruptedException {
        return delegate.getProgress();
      }

      @Override
      public Writable getCurrentKey() throws IOException, InterruptedException {
        return delegate.getCurrentKey();
      }

      @Override
      public Text getCurrentValue() throws IOException, InterruptedException {
        return delegate.getCurrentValue();
      }

      @Override
      public void initialize(InputSplit arg0, TaskAttemptContext context) throws IOException, InterruptedException {
        delegate.initialize(fileSplit, context);
      }

      @Override
      public boolean nextKeyValue() throws IOException, InterruptedException {
        return delegate.nextKeyValue();
      }

      @Override
      public void close() throws IOException {
        delegate.close();
      }
    }

    @Override
    public RecordReader<Writable, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
        throws IOException {
      return new CombineFileRecordReader<Writable, Text>((CombineFileSplit) split, context,
          SequenceFileRecordReaderWrapper.class);
    }
  }
}