DrillTextRecordReader.java example

Explorer
drill-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.drill.exec.store.text;

import java.io.IOException;
import java.util.Collections;
import java.util.List;

import javax.annotation.Nullable;

import com.google.common.base.Predicate;
import com.google.common.collect.Iterables;

import org.apache.drill.common.exceptions.DrillRuntimeException;
import org.apache.drill.common.exceptions.ExecutionSetupException;
import org.apache.drill.common.expression.FieldReference;
import org.apache.drill.common.expression.SchemaPath;
import org.apache.drill.common.types.TypeProtos;
import org.apache.drill.common.types.Types;
import org.apache.drill.exec.ExecConstants;
import org.apache.drill.exec.ops.FragmentContext;
import org.apache.drill.exec.ops.OperatorContext;
import org.apache.drill.exec.physical.impl.OutputMutator;
import org.apache.drill.exec.record.MaterializedField;
import org.apache.drill.exec.store.AbstractRecordReader;
import org.apache.drill.exec.vector.RepeatedVarCharVector;
import org.apache.drill.exec.vector.ValueVector;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;

public class DrillTextRecordReader extends AbstractRecordReader {
  private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(DrillTextRecordReader.class);

  private static final String COL_NAME = "columns";

  private org.apache.hadoop.mapred.RecordReader<LongWritable, Text> reader;
  private final List<ValueVector> vectors = Lists.newArrayList();
  private byte delimiter;
  private FieldReference ref = new FieldReference(COL_NAME);
  private RepeatedVarCharVector vector;
  private List<Integer> columnIds = Lists.newArrayList();
  private LongWritable key;
  private Text value;
  private int numCols = 0;
  private FileSplit split;
  private long totalRecordsRead;

  public DrillTextRecordReader(FileSplit split, Configuration fsConf, FragmentContext context,
      char delimiter, List<SchemaPath> columns) {
    this.delimiter = (byte) delimiter;
    this.split = split;
    setColumns(columns);

    if (!isStarQuery()) {
      String pathStr;
      for (SchemaPath path : columns) {
        assert path.getRootSegment().isNamed();
        pathStr = path.getRootSegment().getPath();
        Preconditions.checkArgument(pathStr.equals(COL_NAME) || (pathStr.equals("*") && path.getRootSegment().getChild() == null),
            "Selected column(s) must have name 'columns' or must be plain '*'");

        if (path.getRootSegment().getChild() != null) {
          Preconditions.checkArgument(path.getRootSegment().getChild().isArray(), "Selected column must be an array index");
          int index = path.getRootSegment().getChild().getArraySegment().getIndex();
          columnIds.add(index);
        }
      }
      Collections.sort(columnIds);
      numCols = columnIds.size();
    }

    TextInputFormat inputFormat = new TextInputFormat();
    JobConf job = new JobConf(fsConf);
    job.setInt("io.file.buffer.size", context.getConfig().getInt(ExecConstants.TEXT_LINE_READER_BUFFER_SIZE));
    job.setInputFormat(inputFormat.getClass());
    try {
      reader = inputFormat.getRecordReader(split, job, Reporter.NULL);
      key = reader.createKey();
      value = reader.createValue();
      totalRecordsRead = 0;
    } catch (Exception e) {
      handleAndRaise("Failure in creating record reader", e);
    }
  }

  @Override
  protected List<SchemaPath> getDefaultColumnsToRead() {
    return DEFAULT_TEXT_COLS_TO_READ;
  }

  @Override
  public boolean isStarQuery() {
    return super.isStarQuery() || Iterables.tryFind(getColumns(), new Predicate<SchemaPath>() {
      private final SchemaPath COLUMNS = SchemaPath.getSimplePath("columns");
      @Override
      public boolean apply(@Nullable SchemaPath path) {
        return path.equals(COLUMNS);
      }
    }).isPresent();
  }

  @Override
  public void setup(OperatorContext context, OutputMutator output) throws ExecutionSetupException {
    MaterializedField field = MaterializedField.create(ref.getAsNamePart().getName(), Types.repeated(TypeProtos.MinorType.VARCHAR));
    try {
      vector = output.addField(field, RepeatedVarCharVector.class);
    } catch (Exception e) {
      handleAndRaise("Failure in setting up reader", e);
    }
  }

  protected void handleAndRaise(String s, Exception e) {
    String message = "Error in text record reader.\nMessage: " + s +
      "\nSplit information:\n\tPath: " + split.getPath() +
      "\n\tStart: " + split.getStart() +
      "\n\tLength: " + split.getLength();
    throw new DrillRuntimeException(message, e);
  }

  @Override
  public int next() {
//    logger.debug("vector value capacity {}", vector.getValueCapacity());
//    logger.debug("vector byte capacity {}", vector.getByteCapacity());
    int batchSize = 0;
    try {
      int recordCount = 0;
      final RepeatedVarCharVector.Mutator mutator = vector.getMutator();
      while (recordCount < Character.MAX_VALUE && batchSize < 200*1000 && reader.next(key, value)) {
        int start;
        int end = -1;

        // index of the scanned field
        int p = 0;
        int i = 0;
        mutator.startNewValue(recordCount);
        // Process each field in this line
        while (end < value.getLength() - 1) {
          if(numCols > 0 && p >= numCols) {
            break;
          }
          start = end;
          if (delimiter == '\n') {
            end = value.getLength();
          } else {
            end = find(value, delimiter, start + 1);
            if (end == -1) {
              end = value.getLength();
            }
          }
          if (numCols > 0 && i++ < columnIds.get(p)) {
            mutator.addSafe(recordCount, value.getBytes(), start + 1, 0);
            continue;
          }
          p++;
          mutator.addSafe(recordCount, value.getBytes(), start + 1, end - start - 1);
          batchSize += end - start;
        }
        recordCount++;
        totalRecordsRead++;
      }
      for (final ValueVector v : vectors) {
        v.getMutator().setValueCount(recordCount);
      }
      mutator.setValueCount(recordCount);
      // logger.debug("text scan batch size {}", batchSize);
      return recordCount;
    } catch(Exception e) {
      close();
      handleAndRaise("Failure while parsing text. Parser was at record: " + (totalRecordsRead + 1), e);
    }

    // this is never reached
    return 0;
  }

  /**
   * Returns the index within the text of the first occurrence of delimiter, starting the search at the specified index.
   *
   * @param  text  the text being searched
   * @param  delimiter the delimiter
   * @param  start the index to start searching
   * @return      the first occurrence of delimiter, starting the search at the specified index
   */
  public int find(Text text, byte delimiter, int start) {
    int len = text.getLength();
    int p = start;
    byte[] bytes = text.getBytes();
    boolean inQuotes = false;
    while (p < len) {
      if ('\"' == bytes[p]) {
        inQuotes = !inQuotes;
      }
      if (!inQuotes && bytes[p] == delimiter) {
        return p;
      }
      p++;
    }
    return -1;
  }

  @Override
  public void close() {
    try {
      if (reader != null) {
        reader.close();
        reader = null;
      }
    } catch (IOException e) {
      logger.warn("Exception closing reader: {}", e);
    }
  }
}