package org.apache.blur.mapreduce.lib;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.lang.reflect.Constructor;

import org.apache.blur.thirdparty.thrift_0_9_0.TException;
import org.apache.blur.thirdparty.thrift_0_9_0.protocol.TJSONProtocol;
import org.apache.blur.thirdparty.thrift_0_9_0.transport.TIOStreamTransport;
import org.apache.blur.thrift.BlurClient;
import org.apache.blur.thrift.generated.Blur.Iface;
import org.apache.blur.thrift.generated.TableDescriptor;
import org.apache.blur.utils.ThreadValue;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.util.Progressable;

/**
 * {@link BlurOutputFormat} is used to index data and deliver the indexes to
 * the proper Blur table for searching.
 * A typical usage of this class would be as follows.<br/>
 * <br/>
 * {@link Iface} client = {@link BlurClient}.getClient("controller1:40010");<br/>
 * <br/>
 * TableDescriptor tableDescriptor = client.describe(tableName);<br/>
 * <br/>
 * Job job = new Job(jobConf, "blur index");<br/>
 * job.setJarByClass(BlurOutputFormatTest.class);<br/>
 * job.setMapperClass(CsvBlurMapper.class);<br/>
 * job.setInputFormatClass(TextInputFormat.class);<br/>
 * <br/>
 * FileInputFormat.addInputPath(job, new Path(input));<br/>
 * CsvBlurMapper.addColumns(job, "cf1", "col");<br/>
 * <br/>
 * BlurOutputFormat.setupJob(job, tableDescriptor);<br/>
 * BlurOutputFormat.setIndexLocally(job, true);<br/>
 * BlurOutputFormat.setOptimizeInFlight(job, false);<br/>
 * <br/>
 * job.waitForCompletion(true);<br/>
 */
public class BlurOutputFormat extends OutputFormat<Text, BlurMutate> {

  public static final String BLUR_OUTPUT_REDUCER_MULTIPLIER = "blur.output.reducer.multiplier";
  public static final String BLUR_OUTPUT_OPTIMIZEINFLIGHT = "blur.output.optimizeinflight";
  public static final String BLUR_OUTPUT_INDEXLOCALLY = "blur.output.indexlocally";
  public static final String BLUR_OUTPUT_MAX_DOCUMENT_BUFFER_SIZE = "blur.output.max.document.buffer.size";
  public static final String BLUR_OUTPUT_MAX_DOCUMENT_BUFFER_HEAP_SIZE = "blur.output.max.document.buffer.heap.size";
  public static final String BLUR_OUTPUT_DOCUMENT_BUFFER_STRATEGY = "blur.output.document.buffer.strategy";
  public static final String BLUR_TABLE_DESCRIPTOR = "blur.table.descriptor";
  public static final String BLUR_OUTPUT_PATH = "blur.output.path";

  private static final String MAPRED_OUTPUT_COMMITTER_CLASS = "mapred.output.committer.class";

  private static ThreadValue<Progressable> _progressable = new ThreadValue<Progressable>();
  private static ThreadValue<GetCounter> _getCounter = new ThreadValue<GetCounter>();

  public static void setProgressable(Progressable progressable) {
    _progressable.set(progressable);
  }

  public static Progressable getProgressable() {
    return _progressable.get();
  }

  public static void setGetCounter(GetCounter getCounter) {
    _getCounter.set(getCounter);
  }

  public static GetCounter getGetCounter() {
    return _getCounter.get();
  }

  @Override
  public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException {
    CheckOutputSpecs.checkOutputSpecs(context.getConfiguration(), context.getNumReduceTasks());
  }

  @Override
  public RecordWriter<Text, BlurMutate> getRecordWriter(TaskAttemptContext context) throws IOException,
      InterruptedException {
    int id = context.getTaskAttemptID().getTaskID().getId();
    TaskAttemptID taskAttemptID = context.getTaskAttemptID();
    final GenericBlurRecordWriter writer = new GenericBlurRecordWriter(context.getConfiguration(), id,
        taskAttemptID.toString() + ".tmp");
    return new RecordWriter<Text, BlurMutate>() {

      @Override
      public void write(Text key, BlurMutate value) throws IOException, InterruptedException {
        writer.write(key, value);
      }

      @Override
      public void close(TaskAttemptContext context) throws IOException, InterruptedException {
        writer.close();
      }
    };
  }

  @Override
  public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException {
    return new BlurOutputCommitter();
  }
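  // Illustrative note (not part of the original API docs): the table
  // descriptor travels from the client to the tasks through the job
  // configuration as Thrift JSON, via the setTableDescriptor and
  // getTableDescriptor methods below. A minimal sketch of the round trip,
  // assuming "descriptor" is an already populated TableDescriptor:
  //
  //   Configuration conf = new Configuration();
  //   BlurOutputFormat.setTableDescriptor(conf, descriptor);
  //   TableDescriptor copy = BlurOutputFormat.getTableDescriptor(conf);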
  public static TableDescriptor getTableDescriptor(Configuration configuration) throws IOException {
    String tableDesStr = configuration.get(BLUR_TABLE_DESCRIPTOR);
    if (tableDesStr == null) {
      return null;
    }
    ByteArrayInputStream inputStream = new ByteArrayInputStream(tableDesStr.getBytes());
    TIOStreamTransport transport = new TIOStreamTransport(inputStream);
    TJSONProtocol protocol = new TJSONProtocol(transport);
    TableDescriptor descriptor = new TableDescriptor();
    try {
      descriptor.read(protocol);
    } catch (TException e) {
      throw new IOException(e);
    }
    transport.close();
    return descriptor;
  }

  /**
   * This will multiply the number of reducers for this job. For example, if
   * the table has 256 shards the normal number of reducers is 256. However, if
   * the reducer multiplier is set to 4 then the number of reducers will be
   * 1024 and each shard will get 4 new segments instead of the normal 1.
   * 
   * @param job
   *          the job to setup.
   * @param multiple
   *          the multiple to use.
   * @throws IOException
   */
  public static void setReducerMultiplier(Job job, int multiple) throws IOException {
    TableDescriptor tableDescriptor = getTableDescriptor(job.getConfiguration());
    if (tableDescriptor == null) {
      throw new IOException("setTableDescriptor needs to be called first.");
    }
    job.setNumReduceTasks(tableDescriptor.getShardCount() * multiple);
    Configuration configuration = job.getConfiguration();
    configuration.setInt(BLUR_OUTPUT_REDUCER_MULTIPLIER, multiple);
  }

  public static int getReducerMultiplier(Configuration configuration) {
    return configuration.getInt(BLUR_OUTPUT_REDUCER_MULTIPLIER, 1);
  }

  /**
   * Sets the {@link TableDescriptor} for this job.
   * 
   * @param job
   *          the job to setup.
   * @param tableDescriptor
   *          the {@link TableDescriptor}.
   * @throws IOException
   */
  public static void setTableDescriptor(Job job, TableDescriptor tableDescriptor) throws IOException {
    setTableDescriptor(job.getConfiguration(), tableDescriptor);
  }

  /**
   * Sets the {@link TableDescriptor} for this job.
   * 
   * @param configuration
   *          the configuration to setup.
   * @param tableDescriptor
   *          the {@link TableDescriptor}.
   * @throws IOException
   */
  public static void setTableDescriptor(Configuration configuration, TableDescriptor tableDescriptor)
      throws IOException {
    ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
    TIOStreamTransport transport = new TIOStreamTransport(outputStream);
    TJSONProtocol protocol = new TJSONProtocol(transport);
    try {
      tableDescriptor.write(protocol);
    } catch (TException e) {
      throw new IOException(e);
    }
    transport.close();
    configuration.set(BLUR_TABLE_DESCRIPTOR, new String(outputStream.toByteArray()));
  }

  /**
   * Sets the maximum number of documents that the buffer will hold in memory
   * before overflowing to disk. By default this is 1000, which will probably
   * be very low for most systems.
   * 
   * @param job
   *          the job to setup.
   * @param maxDocumentBufferSize
   *          the maxDocumentBufferSize.
   */
  public static void setMaxDocumentBufferSize(Job job, int maxDocumentBufferSize) {
    setMaxDocumentBufferSize(job.getConfiguration(), maxDocumentBufferSize);
  }
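  // Illustrative note: with the 1000-document default, a heavy indexing task
  // will overflow to disk frequently. A sketch of raising the limit, where
  // the value 50000 is a made-up example and not a recommendation from the
  // Blur project:
  //
  //   BlurOutputFormat.setMaxDocumentBufferSize(job, 50000);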
  /**
   * Sets the maximum number of documents that the buffer will hold in memory
   * before overflowing to disk. By default this is 1000, which will probably
   * be very low for most systems.
   * 
   * @param configuration
   *          the configuration to setup.
   * @param maxDocumentBufferSize
   *          the maxDocumentBufferSize.
   */
  public static void setMaxDocumentBufferSize(Configuration configuration, int maxDocumentBufferSize) {
    configuration.setInt(BLUR_OUTPUT_MAX_DOCUMENT_BUFFER_SIZE, maxDocumentBufferSize);
  }

  public static int getMaxDocumentBufferSize(Configuration configuration) {
    return configuration.getInt(BLUR_OUTPUT_MAX_DOCUMENT_BUFFER_SIZE, 1000);
  }

  public static int getMaxDocumentBufferHeapSize(Configuration configuration) {
    return configuration.getInt(BLUR_OUTPUT_MAX_DOCUMENT_BUFFER_HEAP_SIZE, 32 * 1024 * 1024);
  }

  public static void setMaxDocumentBufferHeapSize(Configuration configuration, int maxDocumentBufferHeapSize) {
    configuration.setInt(BLUR_OUTPUT_MAX_DOCUMENT_BUFFER_HEAP_SIZE, maxDocumentBufferHeapSize);
  }

  public static void setMaxDocumentBufferHeapSize(Job job, int maxDocumentBufferHeapSize) {
    setMaxDocumentBufferHeapSize(job.getConfiguration(), maxDocumentBufferHeapSize);
  }

  public static DocumentBufferStrategy getDocumentBufferStrategy(Configuration configuration) {
    Class<? extends DocumentBufferStrategy> clazz = configuration.getClass(BLUR_OUTPUT_DOCUMENT_BUFFER_STRATEGY,
        DocumentBufferStrategyFixedSize.class, DocumentBufferStrategy.class);
    try {
      Constructor<? extends DocumentBufferStrategy> constructor = clazz
          .getConstructor(new Class[] { Configuration.class });
      return constructor.newInstance(new Object[] { configuration });
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }

  public static void setDocumentBufferStrategy(Job job,
      Class<? extends DocumentBufferStrategy> documentBufferStrategyClass) {
    setDocumentBufferStrategy(job.getConfiguration(), documentBufferStrategyClass);
  }

  public static void setDocumentBufferStrategy(Configuration configuration,
      Class<? extends DocumentBufferStrategy> documentBufferStrategyClass) {
    configuration.setClass(BLUR_OUTPUT_DOCUMENT_BUFFER_STRATEGY, documentBufferStrategyClass,
        DocumentBufferStrategy.class);
  }

  public static void setOutputPath(Job job, Path path) {
    setOutputPath(job.getConfiguration(), path);
  }

  public static void setOutputPath(Configuration configuration, Path path) {
    configuration.set(BLUR_OUTPUT_PATH, path.toString());
    configuration.set(MAPRED_OUTPUT_COMMITTER_CLASS, BlurOutputCommitter.class.getName());
  }

  public static Path getOutputPath(Configuration configuration) {
    String pathString = configuration.get(BLUR_OUTPUT_PATH);
    if (pathString == null) {
      return null;
    }
    return new Path(pathString);
  }

  /**
   * Enabled by default, this will enable local indexing on the machine where
   * the task is running. Then when the {@link RecordWriter} closes, the index
   * is copied to the remote destination in HDFS.
   * 
   * @param job
   *          the job to setup.
   * @param b
   *          true to enable, false to disable.
   */
  public static void setIndexLocally(Job job, boolean b) {
    setIndexLocally(job.getConfiguration(), b);
  }

  /**
   * Enabled by default, this will enable local indexing on the machine where
   * the task is running. Then when the {@link RecordWriter} closes, the index
   * is copied to the remote destination in HDFS.
   * 
   * @param configuration
   *          the configuration to setup.
   * @param b
   *          true to enable, false to disable.
   */
  public static void setIndexLocally(Configuration configuration, boolean b) {
    configuration.setBoolean(BLUR_OUTPUT_INDEXLOCALLY, b);
  }

  public static boolean isIndexLocally(Configuration configuration) {
    return configuration.getBoolean(BLUR_OUTPUT_INDEXLOCALLY, true);
  }
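  // Illustrative note: setIndexLocally above and setOptimizeInFlight below
  // are typically set together, as in the class-level usage example. A short
  // sketch with example values (the values themselves are arbitrary):
  //
  //   BlurOutputFormat.setIndexLocally(job, true);      // build the index on local disk first
  //   BlurOutputFormat.setOptimizeInFlight(job, false); // skip the optimize during the HDFS copy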
  /**
   * Enabled by default, this will optimize the index while copying from the
   * local index to the remote destination in HDFS. Used in conjunction with
   * {@link #setIndexLocally(Job, boolean)}.
   * 
   * @param job
   *          the job to setup.
   * @param b
   *          true to enable, false to disable.
   */
  public static void setOptimizeInFlight(Job job, boolean b) {
    setOptimizeInFlight(job.getConfiguration(), b);
  }

  /**
   * Enabled by default, this will optimize the index while copying from the
   * local index to the remote destination in HDFS. Used in conjunction with
   * {@link #setIndexLocally(Configuration, boolean)}.
   * 
   * @param configuration
   *          the configuration to setup.
   * @param b
   *          true to enable, false to disable.
   */
  public static void setOptimizeInFlight(Configuration configuration, boolean b) {
    configuration.setBoolean(BLUR_OUTPUT_OPTIMIZEINFLIGHT, b);
  }

  public static boolean isOptimizeInFlight(Configuration configuration) {
    return configuration.getBoolean(BLUR_OUTPUT_OPTIMIZEINFLIGHT, true);
  }

  /**
   * Sets up the output portion of the map reduce job. Note that this also
   * affects the map side of a map and reduce job.
   * 
   * @param job
   *          the job to setup.
   * @param tableDescriptor
   *          the table descriptor to write the output of the indexing job.
   * @throws IOException
   */
  public static void setupJob(Job job, TableDescriptor tableDescriptor) throws IOException {
    job.setReducerClass(DefaultBlurReducer.class);
    job.setNumReduceTasks(tableDescriptor.getShardCount());
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BlurMutate.class);
    job.setOutputFormatClass(BlurOutputFormat.class);
    setTableDescriptor(job, tableDescriptor);
    BlurMapReduceUtil.addDependencyJars(job);
    BlurMapReduceUtil.addAllJarsInBlurLib(job.getConfiguration());
  }

}