/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.drill.exec.store.text;

import java.io.IOException;
import java.util.Collections;
import java.util.List;

import javax.annotation.Nullable;

import org.apache.drill.common.exceptions.DrillRuntimeException;
import org.apache.drill.common.exceptions.ExecutionSetupException;
import org.apache.drill.common.expression.FieldReference;
import org.apache.drill.common.expression.SchemaPath;
import org.apache.drill.common.types.TypeProtos;
import org.apache.drill.common.types.Types;
import org.apache.drill.exec.ExecConstants;
import org.apache.drill.exec.ops.FragmentContext;
import org.apache.drill.exec.ops.OperatorContext;
import org.apache.drill.exec.physical.impl.OutputMutator;
import org.apache.drill.exec.record.MaterializedField;
import org.apache.drill.exec.store.AbstractRecordReader;
import org.apache.drill.exec.vector.RepeatedVarCharVector;
import org.apache.drill.exec.vector.ValueVector;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
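
/**
 * Record reader for delimited text files. Each input line is exposed as a single
 * repeated VARCHAR column named "columns", so individual fields are addressed as
 * columns[n]. Line reading is delegated to Hadoop's {@link TextInputFormat}.
 */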
public class DrillTextRecordReader extends AbstractRecordReader {
private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(DrillTextRecordReader.class);
private static final String COL_NAME = "columns";
private org.apache.hadoop.mapred.RecordReader<LongWritable, Text> reader;
private final List<ValueVector> vectors = Lists.newArrayList();
private byte delimiter;
private FieldReference ref = new FieldReference(COL_NAME);
private RepeatedVarCharVector vector;
private List<Integer> columnIds = Lists.newArrayList();
private LongWritable key;
private Text value;
private int numCols = 0;
private FileSplit split;
private long totalRecordsRead;
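
  /**
   * Creates a reader over one file split. If specific array indexes of the "columns"
   * column were selected, only those field positions are materialized; otherwise every
   * field of each line is read.
   */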
public DrillTextRecordReader(FileSplit split, Configuration fsConf, FragmentContext context,
char delimiter, List<SchemaPath> columns) {
this.delimiter = (byte) delimiter;
this.split = split;
setColumns(columns);
if (!isStarQuery()) {
String pathStr;
for (SchemaPath path : columns) {
assert path.getRootSegment().isNamed();
pathStr = path.getRootSegment().getPath();
Preconditions.checkArgument(pathStr.equals(COL_NAME) || (pathStr.equals("*") && path.getRootSegment().getChild() == null),
"Selected column(s) must have name 'columns' or must be plain '*'");
if (path.getRootSegment().getChild() != null) {
Preconditions.checkArgument(path.getRootSegment().getChild().isArray(), "Selected column must be an array index");
int index = path.getRootSegment().getChild().getArraySegment().getIndex();
columnIds.add(index);
}
}
Collections.sort(columnIds);
numCols = columnIds.size();
}
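    // Set up a Hadoop TextInputFormat over this split, sized by Drill's text line reader buffer.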
TextInputFormat inputFormat = new TextInputFormat();
JobConf job = new JobConf(fsConf);
job.setInt("io.file.buffer.size", context.getConfig().getInt(ExecConstants.TEXT_LINE_READER_BUFFER_SIZE));
job.setInputFormat(inputFormat.getClass());
try {
reader = inputFormat.getRecordReader(split, job, Reporter.NULL);
key = reader.createKey();
value = reader.createValue();
totalRecordsRead = 0;
} catch (Exception e) {
handleAndRaise("Failure in creating record reader", e);
}
}
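
  /** When no columns are projected, fall back to the base class's default text columns. */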
@Override
protected List<SchemaPath> getDefaultColumnsToRead() {
return DEFAULT_TEXT_COLS_TO_READ;
}
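
  /**
   * Selecting the bare "columns" column reads every field of every line, so it is
   * treated the same as a star query.
   */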
@Override
public boolean isStarQuery() {
return super.isStarQuery() || Iterables.tryFind(getColumns(), new Predicate<SchemaPath>() {
private final SchemaPath COLUMNS = SchemaPath.getSimplePath("columns");
@Override
public boolean apply(@Nullable SchemaPath path) {
        return COLUMNS.equals(path); // null-safe: path is @Nullable
}
}).isPresent();
}
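
  /** Creates the single repeated VARCHAR output vector named "columns". */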
@Override
public void setup(OperatorContext context, OutputMutator output) throws ExecutionSetupException {
MaterializedField field = MaterializedField.create(ref.getAsNamePart().getName(), Types.repeated(TypeProtos.MinorType.VARCHAR));
try {
vector = output.addField(field, RepeatedVarCharVector.class);
} catch (Exception e) {
handleAndRaise("Failure in setting up reader", e);
}
}
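
  /** Wraps the given failure with split details (path, start, length) and rethrows it. */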
protected void handleAndRaise(String s, Exception e) {
String message = "Error in text record reader.\nMessage: " + s +
"\nSplit information:\n\tPath: " + split.getPath() +
"\n\tStart: " + split.getStart() +
"\n\tLength: " + split.getLength();
throw new DrillRuntimeException(message, e);
}
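
  /**
   * Reads lines into the current batch until the batch limits are reached or the split is
   * exhausted. Each line is split on the delimiter (double-quoted sections are skipped) and
   * the selected fields are appended to the repeated "columns" vector.
   */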
@Override
public int next() {
// logger.debug("vector value capacity {}", vector.getValueCapacity());
// logger.debug("vector byte capacity {}", vector.getByteCapacity());
int batchSize = 0;
try {
int recordCount = 0;
final RepeatedVarCharVector.Mutator mutator = vector.getMutator();
while (recordCount < Character.MAX_VALUE && batchSize < 200*1000 && reader.next(key, value)) {
int start;
int end = -1;
// index of the scanned field
int p = 0;
int i = 0;
mutator.startNewValue(recordCount);
// Process each field in this line
while (end < value.getLength() - 1) {
if(numCols > 0 && p >= numCols) {
break;
}
start = end;
if (delimiter == '\n') {
end = value.getLength();
} else {
end = find(value, delimiter, start + 1);
if (end == -1) {
end = value.getLength();
}
}
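          // Field was not selected: write an empty value so the requested array indexes still line up.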
if (numCols > 0 && i++ < columnIds.get(p)) {
mutator.addSafe(recordCount, value.getBytes(), start + 1, 0);
continue;
}
p++;
mutator.addSafe(recordCount, value.getBytes(), start + 1, end - start - 1);
batchSize += end - start;
}
recordCount++;
totalRecordsRead++;
}
for (final ValueVector v : vectors) {
v.getMutator().setValueCount(recordCount);
}
mutator.setValueCount(recordCount);
// logger.debug("text scan batch size {}", batchSize);
return recordCount;
} catch(Exception e) {
close();
handleAndRaise("Failure while parsing text. Parser was at record: " + (totalRecordsRead + 1), e);
}
// this is never reached
return 0;
}
  /**
   * Returns the index of the first unquoted occurrence of the delimiter in the text,
   * starting the search at the specified index. Delimiters inside double-quoted sections
   * are ignored.
   *
   * @param text the text being searched
   * @param delimiter the delimiter byte to look for
   * @param start the index at which to start searching
   * @return the index of the first unquoted occurrence of the delimiter, or -1 if none is found
   */
public int find(Text text, byte delimiter, int start) {
int len = text.getLength();
int p = start;
byte[] bytes = text.getBytes();
boolean inQuotes = false;
while (p < len) {
if ('\"' == bytes[p]) {
inQuotes = !inQuotes;
}
if (!inQuotes && bytes[p] == delimiter) {
return p;
}
p++;
}
return -1;
}
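
  /** Releases the underlying Hadoop record reader; failures are logged but not rethrown. */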
@Override
public void close() {
try {
if (reader != null) {
reader.close();
reader = null;
}
} catch (IOException e) {
      logger.warn("Exception closing reader.", e);
}
}
}