/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.blur.indexer.mapreduce;

import java.io.IOException;
import java.util.List;

import org.apache.blur.BlurConfiguration;
import org.apache.blur.indexer.BlurIndexCounter;
import org.apache.blur.indexer.IndexerJobDriver;
import org.apache.blur.indexer.MergeSortRowIdMatcher;
import org.apache.blur.indexer.MergeSortRowIdMatcher.Action;
import org.apache.blur.manager.BlurPartitioner;
import org.apache.blur.manager.writer.SnapshotIndexDeletionPolicy;
import org.apache.blur.mapreduce.lib.BlurInputFormat;
import org.apache.blur.mapreduce.lib.BlurOutputFormat;
import org.apache.blur.store.BlockCacheDirectoryFactoryV2;
import org.apache.blur.store.hdfs.HdfsDirectory;
import org.apache.blur.thrift.generated.TableDescriptor;
import org.apache.blur.utils.BlurConstants;
import org.apache.blur.utils.ShardUtil;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile.Writer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexCommit;
import org.apache.lucene.index.Terms;
import org.apache.lucene.store.Directory;

import com.google.common.io.Closer;

/**
 * Reducer that builds a row id lookup as part of the indexer update job
 * driven by {@link IndexerJobDriver}. Each reducer receives the row ids
 * extracted from the new data, records them in a per-shard cache file, and
 * checks them against a snapshot of the existing Blur index so that rows
 * which already exist can be flagged for update. Job counters track the row
 * ids seen in the new data, the total number of row ids in the existing
 * index, and the row ids that need to be updated.
 */
public class LookupBuilderReducer extends Reducer<Text, NullWritable, Text, BooleanWritable> {

  public static final String BLUR_CACHE_DIR_TOTAL_BYTES = "blur.cache.dir.total.bytes";

  private Counter _rowIds;
  private Counter _rowIdsToUpdate;
  private MergeSortRowIdMatcher _matcher;
  private int _numberOfShardsInTable;
  private Configuration _configuration;
  private String _snapshot;
  private Path _tablePath;
  private Counter _rowIdsFromIndex;
  private long _totalNumberOfBytes;
  private Action _action;
  private Closer _closer;
  private Path _cachePath;
  private String _table;
  private Writer _writer;

  @Override
  protected void setup(Reducer<Text, NullWritable, Text, BooleanWritable>.Context context) throws IOException,
      InterruptedException {
    _configuration = context.getConfiguration();
    _rowIds = context.getCounter(BlurIndexCounter.ROW_IDS_FROM_NEW_DATA);
    _rowIdsToUpdate = context.getCounter(BlurIndexCounter.ROW_IDS_TO_UPDATE_FROM_NEW_DATA);
    _rowIdsFromIndex = context.getCounter(BlurIndexCounter.ROW_IDS_FROM_INDEX);
    TableDescriptor tableDescriptor = BlurOutputFormat.getTableDescriptor(_configuration);
    _numberOfShardsInTable = tableDescriptor.getShardCount();
    _tablePath = new Path(tableDescriptor.getTableUri());
    _snapshot = ExistingDataIndexLookupMapper.getSnapshot(_configuration);
    _totalNumberOfBytes = _configuration.getLong(BLUR_CACHE_DIR_TOTAL_BYTES, 128 * 1024 * 1024);
    _cachePath = BlurInputFormat.getLocalCachePath(_configuration);
    _table = tableDescriptor.getName();
    _closer = Closer.create();
  }
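  /**
   * Appends each incoming row id to a sequence file under the local cache
   * path for the shard and probes the existing index snapshot through the
   * {@link MergeSortRowIdMatcher}. When a row id is already present in the
   * index, the matcher invokes the {@link Action} callback, which emits the
   * row id with a {@code true} flag; presumably a later step of the update
   * job uses this to rewrite the existing row instead of simply adding a new
   * one. The matcher, writer, and action are created lazily from the first
   * row id, which determines the shard this reducer handles (the job
   * presumably partitions row ids so that a single reducer only sees row ids
   * belonging to one shard).
   */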
  @Override
  protected void reduce(Text rowId, Iterable<NullWritable> nothing,
      Reducer<Text, NullWritable, Text, BooleanWritable>.Context context) throws IOException, InterruptedException {
    if (_matcher == null) {
      _matcher = getMergeSortRowIdMatcher(rowId, context);
    }
    if (_writer == null) {
      _writer = getRowIdWriter(rowId, context);
    }
    _writer.append(rowId, NullWritable.get());
    _rowIds.increment(1);
    if (_action == null) {
      _action = new Action() {
        @Override
        public void found(Text rowId) throws IOException {
          _rowIdsToUpdate.increment(1);
          try {
            context.write(rowId, new BooleanWritable(true));
          } catch (InterruptedException e) {
            throw new IOException(e);
          }
        }
      };
    }
    _matcher.lookup(rowId, _action);
  }

  private Writer getRowIdWriter(Text rowId, Reducer<Text, NullWritable, Text, BooleanWritable>.Context context)
      throws IOException {
    BlurPartitioner blurPartitioner = new BlurPartitioner();
    int shard = blurPartitioner.getShard(rowId, _numberOfShardsInTable);
    String shardName = ShardUtil.getShardName(shard);
    Path cachePath = MergeSortRowIdMatcher.getCachePath(_cachePath, _table, shardName);
    Configuration configuration = context.getConfiguration();
    String uuid = configuration.get(IndexerJobDriver.BLUR_UPDATE_ID);
    Path tmpPath = new Path(cachePath, uuid + "_" + getAttemptString(context));
    return _closer.register(MergeSortRowIdMatcher.createWriter(_configuration, tmpPath));
  }

  private String getAttemptString(Reducer<Text, NullWritable, Text, BooleanWritable>.Context context) {
    TaskAttemptID taskAttemptID = context.getTaskAttemptID();
    return taskAttemptID.toString();
  }

  @Override
  protected void cleanup(Reducer<Text, NullWritable, Text, BooleanWritable>.Context context) throws IOException,
      InterruptedException {
    _closer.close();
  }

  private MergeSortRowIdMatcher getMergeSortRowIdMatcher(Text rowId,
      Reducer<Text, NullWritable, Text, BooleanWritable>.Context context) throws IOException {
    BlurPartitioner blurPartitioner = new BlurPartitioner();
    int shard = blurPartitioner.getShard(rowId, _numberOfShardsInTable);
    String shardName = ShardUtil.getShardName(shard);
    Path shardPath = new Path(_tablePath, shardName);
    HdfsDirectory hdfsDirectory = new HdfsDirectory(_configuration, shardPath);
    SnapshotIndexDeletionPolicy policy = new SnapshotIndexDeletionPolicy(_configuration,
        SnapshotIndexDeletionPolicy.getGenerationsPath(shardPath));
    Long generation = policy.getGeneration(_snapshot);
    if (generation == null) {
      hdfsDirectory.close();
      throw new IOException("Snapshot [" + _snapshot + "] not found in shard [" + shardPath + "]");
    }
    BlurConfiguration bc = new BlurConfiguration();
    BlockCacheDirectoryFactoryV2 blockCacheDirectoryFactoryV2 = new BlockCacheDirectoryFactoryV2(bc,
        _totalNumberOfBytes);
    _closer.register(blockCacheDirectoryFactoryV2);
    Directory dir = blockCacheDirectoryFactoryV2.newDirectory("table", "shard", hdfsDirectory, null);
    List<IndexCommit> listCommits = DirectoryReader.listCommits(dir);
    IndexCommit indexCommit = ExistingDataIndexLookupMapper.findIndexCommit(listCommits, generation, shardPath);
    DirectoryReader reader = DirectoryReader.open(indexCommit);
    _rowIdsFromIndex.setValue(getTotalNumberOfRowIds(reader));
    Path cachePath = MergeSortRowIdMatcher.getCachePath(_cachePath, _table, shardName);
    return new MergeSortRowIdMatcher(dir, generation, _configuration, cachePath, context);
  }
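  /**
   * Sums the per-segment term count of the {@link BlurConstants#ROW_ID}
   * field, which is reported through the ROW_IDS_FROM_INDEX counter as the
   * number of row ids already in the shard's index. Returns -1 if any
   * segment cannot report its term count ({@link Terms#size()} returns -1 in
   * that case); every segment is assumed to contain the row id field.
   */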
  private long getTotalNumberOfRowIds(DirectoryReader reader) throws IOException {
    long total = 0;
    List<AtomicReaderContext> leaves = reader.leaves();
    for (AtomicReaderContext context : leaves) {
      AtomicReader atomicReader = context.reader();
      Terms terms = atomicReader.terms(BlurConstants.ROW_ID);
      long expectedInsertions = terms.size();
      if (expectedInsertions < 0) {
        return -1;
      }
      total += expectedInsertions;
    }
    return total;
  }
}