package com.twitter.elephantbird.mapreduce.input;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.PriorityQueue;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.twitter.elephantbird.util.HadoopCompat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;

import com.twitter.elephantbird.mapreduce.output.LuceneIndexOutputFormat;
import com.twitter.elephantbird.util.HadoopUtils;
import com.twitter.elephantbird.util.HdfsUtils;

/**
 * Base class for input formats that read lucene indexes stored in HDFS directories.
 * Given a list of indexes and queries, runs each query over each index. Implements split
 * combining (combines multiple indexes into one split) based on
 * the total size of the index directory and the configured max combined split size.
 * <p>
 * Emits key, value records where key is the query that resulted in value
 * (key is actually the position in the list of queries, not the query string itself)
 * <p>
 * Subclasses must provide:
 * <ul>
 *   <li>a {@link LuceneIndexRecordReader} which describes how to convert a String into a
 *       Query and how to convert a Document into a value of type T</li>
 * </ul>
 * Subclasses may provide:
 * <ul>
 *   <li>a {@link PathFilter} for identifying HDFS directories that contain Lucene indexes</li>
 * </ul>
 *
 * @param <T> - the type that your lucene Documents will be converted to
 * @author Alex Levenson
 */
public abstract class LuceneIndexInputFormat<T extends Writable>
    extends InputFormat<IntWritable, T> {

  public static final String QUERIES_KEY =
      LuceneIndexInputFormat.class.getCanonicalName() + ".queries";

  public static final String INPUT_PATHS_KEY =
      LuceneIndexInputFormat.class.getCanonicalName() + ".inputpaths";

  public static final String MAX_NUM_INDEXES_PER_SPLIT_KEY =
      LuceneIndexInputFormat.class.getCanonicalName() + ".max_num_indexes_per_split";

  // Empirically it seems that 200 is a reasonable number of small
  // indexes for one mapper to process in a few minutes, and thousands
  // of small indexes can take one mapper more than 30 minutes to process
  // due to overhead for each index.
  private static final long DEFAULT_MAX_NUM_INDEXES_PER_SPLIT = 200;

  public static final String MAX_COMBINED_INDEX_SIZE_PER_SPLIT_KEY =
      LuceneIndexInputFormat.class.getCanonicalName() + ".max_combined_index_size_per_split";

  // default to 10GB
  // back of the envelope reasoning:
  // Assume 1 mapper should process 1 GB, and each index will return 1/10th its size in records
  private static final long DEFAULT_MAX_COMBINED_INDEX_SIZE_PER_SPLIT = 10 * 1024 * 1024 * 1024L;

  private static final String[] EMPTY_NODE_ARRAY = new String[0];

  private Path[] inputPaths = null;
  private PathFilter indexDirPathFilter = null;
  private long maxCombinedIndexSizePerSplit;
  private long maxNumIndexesPerSplit;
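  // Illustrative driver-side setup (a minimal sketch, not part of this class): configure the
  // job with the indexes to search and the queries to run. MyLuceneIndexInputFormat and the
  // query strings are hypothetical; the query syntax depends on how the subclass's
  // LuceneIndexRecordReader deserializes queries.
  //
  //   Job job = Job.getInstance(new Configuration(), "lucene-search");
  //   job.setInputFormatClass(MyLuceneIndexInputFormat.class);
  //   LuceneIndexInputFormat.setInputPaths(
  //       Lists.newArrayList(new Path("/data/indexes")), job.getConfiguration());
  //   LuceneIndexInputFormat.setQueries(
  //       Lists.newArrayList("name:alice", "name:bob"), job.getConfiguration());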
  /**
   * Subclasses may provide a {@link PathFilter} for identifying HDFS
   * directories that contain lucene indexes. When directories are being
   * searched recursively for index directories, this path filter will be used
   * to determine if a directory is a lucene index.
   * <p>
   * The default is to treat any directory whose name begins with "-index" as a lucene index,
   * which matches what {@link LuceneIndexOutputFormat} generates.
   *
   * @param conf job conf
   * @return a path filter that accepts directories with lucene indexes in them
   * @throws IOException
   */
  public PathFilter getIndexDirPathFilter(Configuration conf) throws IOException {
    return LuceneIndexOutputFormat.newIndexDirFilter(conf);
  }

  @VisibleForTesting
  void loadConfig(Configuration conf) throws IOException {
    inputPaths = getInputPaths(conf);

    indexDirPathFilter = Preconditions.checkNotNull(getIndexDirPathFilter(conf),
        "You must provide a non-null PathFilter");

    maxCombinedIndexSizePerSplit = Preconditions.checkNotNull(
        getMaxCombinedIndexSizePerSplit(conf),
        MAX_COMBINED_INDEX_SIZE_PER_SPLIT_KEY + " cannot be null");

    maxNumIndexesPerSplit = Preconditions.checkNotNull(getMaxNumIndexesPerSplit(conf),
        MAX_NUM_INDEXES_PER_SPLIT_KEY + " cannot be null");
  }

  /**
   * Creates splits with multiple indexes per split
   * (if they are smaller than maxCombinedIndexSizePerSplit).
   * It is possible for a split to be larger than maxCombinedIndexSizePerSplit,
   * if it consists of a single index that is
   * larger than maxCombinedIndexSizePerSplit.
   * <p>
   * All inputPaths will be searched for indexes recursively
   * <p>
   * The bin-packing problem of combining splits is solved naively:
   * <ol>
   *   <li>Sort all indexes by size</li>
   *   <li>Begin packing indexes into splits until adding the next split would cause the split to
   *       exceed maxCombinedIndexSizePerSplit</li>
   *   <li>Begin packing subsequent indexes into the next split, and so on</li>
   * </ol>
   */
  @Override
  public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
    // load settings from job conf
    loadConfig(HadoopCompat.getConfiguration(job));

    // find all the index dirs and create a split for each
    PriorityQueue<LuceneIndexInputSplit> splits = findSplits(HadoopCompat.getConfiguration(job));

    // combine the splits based on maxCombineSplitSize
    List<InputSplit> combinedSplits =
        combineSplits(splits, maxCombinedIndexSizePerSplit, maxNumIndexesPerSplit);

    return combinedSplits;
  }

  /**
   * Finds and creates all the index splits based on the input paths set in conf
   *
   * @param conf job conf
   * @return a priority queue of the splits, default is sorted by size
   * @throws IOException
   */
  protected PriorityQueue<LuceneIndexInputSplit> findSplits(Configuration conf)
      throws IOException {
    PriorityQueue<LuceneIndexInputSplit> splits = new PriorityQueue<LuceneIndexInputSplit>();
    List<Path> indexDirs = Lists.newLinkedList();

    // find all indexes nested under all the input paths
    // (which happen to be directories themselves)
    for (Path path : inputPaths) {
      HdfsUtils.collectPaths(path, path.getFileSystem(conf), indexDirPathFilter, indexDirs);
    }

    // compute the size of each index
    // and create a single split per index
    for (Path indexDir : indexDirs) {
      long size = HdfsUtils.getDirectorySize(indexDir, indexDir.getFileSystem(conf));
      splits.add(new LuceneIndexInputSplit(Lists.newLinkedList(Arrays.asList(indexDir)), size));
    }
    return splits;
  }
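  // A sketch of how the greedy combining below behaves (the sizes and paths are hypothetical,
  // and the call is shown test-style since combineSplits is protected): with a size cap of 10
  // bytes, the three smallest indexes pack into one split and the large one stays alone.
  //
  //   PriorityQueue<LuceneIndexInputSplit> queue = new PriorityQueue<LuceneIndexInputSplit>();
  //   for (long size : new long[] {8, 2, 4, 3}) {
  //     queue.add(new LuceneIndexInputSplit(
  //         Lists.newLinkedList(Arrays.asList(new Path("/idx-" + size))), size));
  //   }
  //   // combineSplits(queue, 10L, 200L) -> [ {/idx-2, /idx-3, /idx-4}, {/idx-8} ]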
  protected List<InputSplit> combineSplits(PriorityQueue<LuceneIndexInputSplit> splits,
      long maxCombinedIndexSizePerSplit, long maxNumIndexesPerSplit) {

    // now take the one-split-per-index splits and combine them into multi-index-per-split splits
    List<InputSplit> combinedSplits = Lists.newLinkedList();

    LuceneIndexInputSplit currentSplit = splits.poll();
    while (currentSplit != null) {
      while (currentSplit.getLength() < maxCombinedIndexSizePerSplit) {
        LuceneIndexInputSplit nextSplit = splits.peek();
        if (nextSplit == null) {
          break;
        }
        if (currentSplit.getLength() + nextSplit.getLength() > maxCombinedIndexSizePerSplit) {
          break;
        }
        if (currentSplit.getIndexDirs().size() >= maxNumIndexesPerSplit) {
          break;
        }
        currentSplit.combine(nextSplit);
        splits.poll();
      }
      combinedSplits.add(currentSplit);
      currentSplit = splits.poll();
    }
    return combinedSplits;
  }

  /**
   * Set the queries to run over the indexes.
   * These can be Strings suitable for parsing with a QueryParser, or they can be
   * a custom serialized Query object. They have to be Strings so that they can be
   * written to the job conf. They will be deserialized / parsed by the abstract
   * method {@link LuceneIndexRecordReader#deserializeQuery(String)}
   *
   * @param queries queries to run over the indexes
   * @param conf job conf
   * @throws IOException
   */
  public static void setQueries(List<String> queries, Configuration conf) throws IOException {
    Preconditions.checkNotNull(queries);
    Preconditions.checkArgument(!queries.isEmpty());
    HadoopUtils.writeStringListToConfAsBase64(QUERIES_KEY, queries, conf);
  }

  /**
   * Get the queries to run over the indexes.
   * These are the queries in the same form as they were passed to {@link #setQueries} above
   *
   * @param conf job conf
   * @return queries as passed to {@link #setQueries}
   * @throws IOException
   */
  public static List<String> getQueries(Configuration conf) throws IOException {
    return Preconditions.checkNotNull(
        HadoopUtils.readStringListFromConfAsBase64(QUERIES_KEY, conf),
        "You must call LuceneIndexInputFormat.setQueries()");
  }

  /**
   * Check whether queries have been set yet for this job, useful for lazy loading
   * queries into the config
   *
   * @param conf job conf
   * @return whether the queries have been set yet
   */
  public static boolean queriesSet(Configuration conf) {
    return conf.get(QUERIES_KEY) != null;
  }

  /**
   * Sets the input paths for this input format.
   * All paths will be searched for indexes recursively
   *
   * @param paths the input paths
   * @param conf the job conf
   * @throws IOException
   */
  public static void setInputPaths(List<Path> paths, Configuration conf) throws IOException {
    Preconditions.checkNotNull(paths);
    Preconditions.checkArgument(!paths.isEmpty());
    String[] pathStrs = new String[paths.size()];
    int i = 0;
    for (Path p : paths) {
      FileSystem fs = p.getFileSystem(conf);
      pathStrs[i++] = fs.makeQualified(p).toString();
    }
    conf.setStrings(INPUT_PATHS_KEY, pathStrs);
  }

  /**
   * Gets the input paths for this input format
   *
   * @param conf the job conf
   * @return the input paths previously set by {@link #setInputPaths}
   */
  public static Path[] getInputPaths(Configuration conf) {
    String[] pathStrs = Preconditions.checkNotNull(conf.getStrings(INPUT_PATHS_KEY),
        "You must call LuceneIndexInputFormat.setInputPaths()");
    Path[] paths = new Path[pathStrs.length];
    for (int i = 0; i < pathStrs.length; i++) {
      paths[i] = new Path(pathStrs[i]);
    }
    return paths;
  }
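  // Example of the lazy-loading pattern queriesSet() supports (a sketch; buildQueries() is a
  // hypothetical helper): only build and serialize the queries the first time through.
  //
  //   if (!LuceneIndexInputFormat.queriesSet(conf)) {
  //     LuceneIndexInputFormat.setQueries(buildQueries(), conf);
  //   }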
  /**
   * Set the max combined size of indexes to be processed by one split.
   *
   * If an index is larger than this size then it will be put in its own split, but all splits
   * containing multiple indexes will have a combined size &lt;= size.
   *
   * @param size the max combined size of indexes to be processed by one split in bytes
   * @param conf job conf
   */
  public static void setMaxCombinedIndexSizePerSplitBytes(long size, Configuration conf) {
    conf.setLong(MAX_COMBINED_INDEX_SIZE_PER_SPLIT_KEY, size);
  }

  /**
   * Get the max size of a combined split in bytes
   *
   * @param conf job conf
   * @return the max size of a combined split in bytes
   */
  public static long getMaxCombinedIndexSizePerSplit(Configuration conf) {
    return conf.getLong(MAX_COMBINED_INDEX_SIZE_PER_SPLIT_KEY,
        DEFAULT_MAX_COMBINED_INDEX_SIZE_PER_SPLIT);
  }

  /**
   * Set the max number of indexes to process for a single split.
   *
   * If a combined split still has room for more indexes (as determined by
   * {@link #getMaxCombinedIndexSizePerSplit}) then more indexes will be added to it
   * UNLESS that would cause the split to have more than num indexes combined into it.
   *
   * This helps prevent one split from getting thousands of small indexes, which can make it
   * significantly slower than the others.
   *
   * @param num max number of indexes per combined split
   * @param conf job conf
   */
  public static void setMaxNumIndexesPerSplit(long num, Configuration conf) {
    conf.setLong(MAX_NUM_INDEXES_PER_SPLIT_KEY, num);
  }

  /**
   * Get the max number of indexes per split
   *
   * @param conf job conf
   * @return the max number of indexes per split
   */
  public static long getMaxNumIndexesPerSplit(Configuration conf) {
    return conf.getLong(MAX_NUM_INDEXES_PER_SPLIT_KEY, DEFAULT_MAX_NUM_INDEXES_PER_SPLIT);
  }
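  // Sketch of tuning the combining knobs from a driver (the values are illustrative, not
  // recommendations): cap combined splits at 2 GB and at most 50 indexes each.
  //
  //   LuceneIndexInputFormat.setMaxCombinedIndexSizePerSplitBytes(2L * 1024 * 1024 * 1024, conf);
  //   LuceneIndexInputFormat.setMaxNumIndexesPerSplit(50, conf);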
  /**
   * A split that represents multiple indexes (directories).
   * Has to be {@link Writable} to work with pig
   */
  public static class LuceneIndexInputSplit extends InputSplit
      implements Writable, Comparable<LuceneIndexInputSplit> {

    private List<Path> indexDirs;
    private Long length;

    /**
     * Required for instantiation by reflection
     */
    public LuceneIndexInputSplit() { }

    /**
     * Constructor for this class.
     *
     * @param indexDirs a {@link java.util.List} of directories
     *                  containing existing Lucene indexes
     * @param length the size of the split in bytes
     */
    public LuceneIndexInputSplit(List<Path> indexDirs, long length) {
      this.indexDirs = indexDirs;
      this.length = length;
    }

    /**
     * Merge other into this split.
     * Will have no effect on other.
     *
     * @param other the split to combine
     */
    public void combine(LuceneIndexInputSplit other) {
      indexDirs.addAll(other.getIndexDirs());
      length += other.getLength();
    }

    /**
     * Get the size of this split in bytes
     *
     * @return the size of this split in bytes
     */
    @Override
    public long getLength() {
      return length;
    }

    /**
     * Because an index consists of multiple (multi-block) files, there's not much to be gained
     * from finding nodes where there is locality.
     *
     * @return an empty String[]
     */
    @Override
    public String[] getLocations() throws IOException, InterruptedException {
      return EMPTY_NODE_ARRAY;
    }

    /**
     * @return the list of indexes in this split (which are directories)
     */
    public List<Path> getIndexDirs() {
      return indexDirs;
    }

    @Override
    public void write(DataOutput out) throws IOException {
      out.writeLong(length);
      out.writeInt(indexDirs.size());
      for (Path p : indexDirs) {
        Text.writeString(out, p.toString());
      }
    }

    @Override
    public void readFields(DataInput in) throws IOException {
      this.length = in.readLong();
      int numDirs = in.readInt();
      this.indexDirs = Lists.newLinkedList();
      for (int i = 0; i < numDirs; i++) {
        String path = Text.readString(in);
        this.indexDirs.add(new Path(path));
      }
    }

    /**
     * Sorts by length (size in bytes)
     */
    @Override
    public int compareTo(LuceneIndexInputSplit other) {
      return length.compareTo(other.getLength());
    }

    /**
     * Prints the directories and the combined length/size of the split in bytes.
     */
    @Override
    public String toString() {
      return "LuceneIndexInputSplit<indexDirs:" + indexDirs.toString()
          + " length:" + length + ">";
    }
  }
}