/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.drill.exec.store.easy.sequencefile;

import java.io.IOException;
import java.security.PrivilegedExceptionAction;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;

import org.apache.drill.common.exceptions.ExecutionSetupException;
import org.apache.drill.common.exceptions.UserException;
import org.apache.drill.common.expression.SchemaPath;
import org.apache.drill.common.types.TypeProtos;
import org.apache.drill.common.types.TypeProtos.MajorType;
import org.apache.drill.common.types.Types;
import org.apache.drill.exec.exception.SchemaChangeException;
import org.apache.drill.exec.ops.OperatorContext;
import org.apache.drill.exec.physical.impl.OutputMutator;
import org.apache.drill.exec.record.MaterializedField;
import org.apache.drill.exec.store.AbstractRecordReader;
import org.apache.drill.exec.store.dfs.DrillFileSystem;
import org.apache.drill.exec.util.ImpersonationUtil;
import org.apache.drill.exec.vector.NullableVarBinaryVector;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileAsBinaryInputFormat;
import org.apache.hadoop.security.UserGroupInformation;

import com.google.common.base.Stopwatch;

/**
 * Reads a Hadoop SequenceFile split as raw binary key/value pairs and copies
 * each pair into two nullable VARBINARY vectors ("binary_key" / "binary_value").
 */
public class SequenceFileRecordReader extends AbstractRecordReader {
  private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(SequenceFileRecordReader.class);

  // Limits per output batch: stop after this many records or this many key+value bytes.
  private static final int PER_BATCH_RECORD_COUNT = 4096;
  private static final int PER_BATCH_BYTES = 256 * 1024;

  private static final MajorType KEY_TYPE = Types.optional(TypeProtos.MinorType.VARBINARY);
  private static final MajorType VALUE_TYPE = Types.optional(TypeProtos.MinorType.VARBINARY);

  private final String keySchema = "binary_key";
  private final String valueSchema = "binary_value";

  private NullableVarBinaryVector keyVector;
  private NullableVarBinaryVector valueVector;

  private final FileSplit split;
  private org.apache.hadoop.mapred.RecordReader<BytesWritable, BytesWritable> reader;
  private final BytesWritable key = new BytesWritable();
  private final BytesWritable value = new BytesWritable();
  private final DrillFileSystem dfs;
  private final String queryUserName;
  private final String opUserName;

  public SequenceFileRecordReader(final FileSplit split,
                                  final DrillFileSystem dfs,
                                  final String queryUserName,
                                  final String opUserName) {
    final List<SchemaPath> columns = new ArrayList<>();
    columns.add(SchemaPath.getSimplePath(keySchema));
    columns.add(SchemaPath.getSimplePath(valueSchema));
    setColumns(columns);
    this.dfs = dfs;
    this.split = split;
    this.queryUserName = queryUserName;
    this.opUserName = opUserName;
  }

  @Override
  protected boolean isSkipQuery() {
    return false;
  }

  private org.apache.hadoop.mapred.RecordReader<BytesWritable, BytesWritable> getRecordReader(
      final InputFormat<BytesWritable, BytesWritable> inputFormat,
      final JobConf jobConf) throws ExecutionSetupException {
    try {
      // Open the underlying reader as a proxy user so file access honors impersonation.
      final UserGroupInformation ugi = ImpersonationUtil.createProxyUgi(this.opUserName, this.queryUserName);
      return ugi.doAs(new PrivilegedExceptionAction<org.apache.hadoop.mapred.RecordReader<BytesWritable, BytesWritable>>() {
        @Override
        public org.apache.hadoop.mapred.RecordReader<BytesWritable, BytesWritable> run() throws Exception {
          return inputFormat.getRecordReader(split, jobConf, Reporter.NULL);
        }
      });
    } catch (IOException | InterruptedException e) {
      throw new ExecutionSetupException(
        String.format("Error in creating sequencefile reader for file: %s, start: %d, length: %d",
          split.getPath(), split.getStart(), split.getLength()), e);
    }
  }

  @Override
  public void setup(OperatorContext context, OutputMutator output) throws ExecutionSetupException {
    final SequenceFileAsBinaryInputFormat inputFormat = new SequenceFileAsBinaryInputFormat();
    final JobConf jobConf = new JobConf(dfs.getConf());
    jobConf.setInputFormat(inputFormat.getClass());
    reader = getRecordReader(inputFormat, jobConf);
    final MaterializedField keyField = MaterializedField.create(keySchema, KEY_TYPE);
    final MaterializedField valueField = MaterializedField.create(valueSchema, VALUE_TYPE);
    try {
      keyVector = output.addField(keyField, NullableVarBinaryVector.class);
      valueVector = output.addField(valueField, NullableVarBinaryVector.class);
    } catch (SchemaChangeException sce) {
      throw new ExecutionSetupException("Error in setting up sequencefile reader.", sce);
    }
  }

  @Override
  public int next() {
    final Stopwatch watch = Stopwatch.createStarted();
    // Reset and reallocate the output vectors for a fresh batch.
    if (keyVector != null) {
      keyVector.clear();
      keyVector.allocateNew();
    }
    if (valueVector != null) {
      valueVector.clear();
      valueVector.allocateNew();
    }
    int recordCount = 0;
    int batchSize = 0;
    try {
      // Copy key/value bytes until a batch limit is hit or the split is exhausted.
      while (recordCount < PER_BATCH_RECORD_COUNT && batchSize < PER_BATCH_BYTES && reader.next(key, value)) {
        keyVector.getMutator().setSafe(recordCount, key.getBytes(), 0, key.getLength());
        valueVector.getMutator().setSafe(recordCount, value.getBytes(), 0, value.getLength());
        batchSize += (key.getLength() + value.getLength());
        ++recordCount;
      }
      keyVector.getMutator().setValueCount(recordCount);
      valueVector.getMutator().setValueCount(recordCount);
      logger.debug("Read {} records in {} ms", recordCount, watch.elapsed(TimeUnit.MILLISECONDS));
      return recordCount;
    } catch (IOException ioe) {
      close();
      throw UserException.dataReadError(ioe)
        .addContext("File Path", split.getPath().toString())
        .build(logger);
    }
  }

  @Override
  public void close() {
    try {
      if (reader != null) {
        reader.close();
        reader = null;
      }
    } catch (IOException e) {
      // Pass the exception as the throwable argument so the stack trace is logged.
      logger.warn("Exception closing reader", e);
    }
  }
}