/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.drill.exec.vector.complex.fn;
import io.netty.buffer.DrillBuf;
import java.io.IOException;
import java.io.InputStream;
import java.util.BitSet;
import java.util.List;
import org.apache.drill.common.exceptions.UserException;
import org.apache.drill.common.expression.PathSegment;
import org.apache.drill.common.expression.SchemaPath;
import org.apache.drill.exec.physical.base.GroupScan;
import org.apache.drill.exec.store.easy.json.reader.BaseJsonProcessor;
import org.apache.drill.exec.vector.complex.fn.VectorOutput.ListVectorOutput;
import org.apache.drill.exec.vector.complex.fn.VectorOutput.MapVectorOutput;
import org.apache.drill.exec.vector.complex.writer.BaseWriter;
import org.apache.drill.exec.vector.complex.writer.BaseWriter.ComplexWriter;
import org.apache.drill.exec.vector.complex.writer.BaseWriter.ListWriter;
import org.apache.drill.exec.vector.complex.writer.BaseWriter.MapWriter;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;
import com.fasterxml.jackson.databind.JsonNode;
import com.google.common.base.Charsets;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
/**
 * Reads a stream of JSON documents and writes them into Drill value vectors
 * through a {@link ComplexWriter}. Supports two parsing modes: typed mode,
 * where scalars are materialized as bit/bigint/float8/varchar vectors, and
 * "all text" mode, where every scalar is written as varchar. Optionally
 * unwraps a single top-level JSON array into a sequence of records.
 */
public class JsonReader extends BaseJsonProcessor {
  private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory
      .getLogger(JsonReader.class);
  /** Upper bound, in bytes, on a single JSON record. */
  public final static int MAX_RECORD_SIZE = 128 * 1024;
  /** Scratch buffer used to stage variable-length (varchar) data before writing to vectors. */
  private final WorkingBuffer workingBuffer;
  /** Columns requested by the query; drives field selection while parsing. */
  private final List<SchemaPath> columns;
  /** When true, every scalar value (including numbers and booleans) is read as varchar text. */
  private final boolean allTextMode;
  /** Handles extended-type maps (e.g. {"$date": ...}) at map scope. */
  private final MapVectorOutput mapOutput;
  /** Handles extended-type maps at list scope. */
  private final ListVectorOutput listOutput;
  /** Extended (Mongo-style) type detection is always enabled in this reader. */
  private final boolean extended = true;
  /** When true, integral JSON numbers are stored as float8 instead of bigint. */
  private final boolean readNumbersAsDouble;
  /**
   * Collection for tracking empty array writers during reading
   * and storing them for initializing empty arrays
   */
  private final List<ListWriter> emptyArrayWriters = Lists.newArrayList();
  /**
   * Describes whether or not this reader can unwrap a single root array record
   * and treat it like a set of distinct records.
   */
  private final boolean skipOuterList;
  /**
   * Whether the reader is currently in a situation where we are unwrapping an
   * outer list.
   */
  private boolean inOuterList;
  /**
   * The name of the current field being parsed. For Error messages.
   */
  private String currentFieldName;
  /** Selection tree pre-computed from {@link #columns}; prunes unprojected fields. */
  private FieldSelection selection;

  /**
   * Creates a reader that projects all columns.
   *
   * @param managedBuf buffer managed by the caller, used for varchar staging
   * @param allTextMode read every scalar as varchar when true
   * @param skipOuterList unwrap a single top-level array when true
   * @param readNumbersAsDouble store integral numbers as float8 when true
   */
  public JsonReader(DrillBuf managedBuf, boolean allTextMode,
      boolean skipOuterList, boolean readNumbersAsDouble) {
    this(managedBuf, GroupScan.ALL_COLUMNS, allTextMode, skipOuterList,
        readNumbersAsDouble);
  }

  /**
   * Creates a reader restricted to the given columns.
   *
   * @param managedBuf buffer managed by the caller, used for varchar staging
   * @param columns non-null, non-empty list of projected columns
   * @param allTextMode read every scalar as varchar when true
   * @param skipOuterList unwrap a single top-level array when true
   * @param readNumbersAsDouble store integral numbers as float8 when true
   */
  public JsonReader(DrillBuf managedBuf, List<SchemaPath> columns,
      boolean allTextMode, boolean skipOuterList, boolean readNumbersAsDouble) {
    super(managedBuf);
    // Null-check unconditionally: the former assert-based check was skipped
    // entirely when assertions were disabled, deferring failure to a less
    // obvious NPE further downstream. The size check remains an assert to
    // preserve prior behavior in production.
    Preconditions.checkNotNull(columns, "JSON record reader requires at least one column");
    assert !columns.isEmpty() : "JSON record reader requires at least one column";
    this.selection = FieldSelection.getFieldSelection(columns);
    this.workingBuffer = new WorkingBuffer(managedBuf);
    this.skipOuterList = skipOuterList;
    this.allTextMode = allTextMode;
    this.columns = columns;
    this.mapOutput = new MapVectorOutput(workingBuffer);
    this.listOutput = new ListVectorOutput(workingBuffer);
    this.currentFieldName = "<none>";
    this.readNumbersAsDouble = readNumbersAsDouble;
  }

  /**
   * Guarantees that at least one vector exists for the projected columns so a
   * batch with no matching JSON fields still carries a usable schema. Creates
   * varchar vectors in all-text mode (all columns) or a single default integer
   * vector otherwise, and default-initializes any arrays that stayed empty.
   */
  @SuppressWarnings("resource")
  @Override
  public void ensureAtLeastOneField(ComplexWriter writer) {
    List<BaseWriter.MapWriter> writerList = Lists.newArrayList();
    List<PathSegment> fieldPathList = Lists.newArrayList();
    BitSet emptyStatus = new BitSet(columns.size());
    // first pass: collect which fields are empty
    for (int i = 0; i < columns.size(); i++) {
      SchemaPath sp = columns.get(i);
      PathSegment fieldPath = sp.getRootSegment();
      BaseWriter.MapWriter fieldWriter = writer.rootAsMap();
      // Walk down nested map segments (stopping before any array segment) so
      // the leaf writer corresponds to the innermost named field.
      while (fieldPath.getChild() != null && !fieldPath.getChild().isArray()) {
        fieldWriter = fieldWriter.map(fieldPath.getNameSegment().getPath());
        fieldPath = fieldPath.getChild();
      }
      writerList.add(fieldWriter);
      fieldPathList.add(fieldPath);
      if (fieldWriter.isEmptyMap()) {
        emptyStatus.set(i, true);
      }
      if (i == 0 && !allTextMode) {
        // when allTextMode is false, there is not much benefit to producing
        // all the empty fields; just produce 1 field. The reason is that the
        // type of the fields is unknown, so if we produce multiple Integer
        // fields by default, a subsequent batch that contains non-integer
        // fields will error out in any case. Whereas, with allTextMode true,
        // we are sure that all fields are going to be treated as varchar, so
        // it makes sense to produce all the fields, and in fact is necessary
        // in order to avoid schema change exceptions by downstream operators.
        break;
      }
    }
    // second pass: create default typed vectors corresponding to empty fields
    // Note: this is not easily do-able in 1 pass because the same fieldWriter
    // may be shared by multiple fields whereas we want to keep track of all
    // fields independently, so we rely on the emptyStatus.
    for (int j = 0; j < fieldPathList.size(); j++) {
      BaseWriter.MapWriter fieldWriter = writerList.get(j);
      PathSegment fieldPath = fieldPathList.get(j);
      if (emptyStatus.get(j)) {
        if (allTextMode) {
          fieldWriter.varChar(fieldPath.getNameSegment().getPath());
        } else {
          fieldWriter.integer(fieldPath.getNameSegment().getPath());
        }
      }
    }
    // Default-initialize arrays that were observed but never populated.
    for (ListWriter field : emptyArrayWriters) {
      // checks that array has not been initialized
      if (field.getValueCapacity() == 0) {
        if (allTextMode) {
          field.varChar();
        } else {
          field.integer();
        }
      }
    }
  }

  /**
   * Sets the input to a slice [start, end) of the given buffer.
   *
   * @throws IOException if the parser cannot be initialized over the stream
   */
  public void setSource(int start, int end, DrillBuf buf) throws IOException {
    setSource(DrillBufInputStream.getStream(start, end, buf));
  }

  @Override
  public void setSource(InputStream is) throws IOException {
    super.setSource(is);
    // Extended-type helpers share the same underlying parser.
    mapOutput.setParser(parser);
    listOutput.setParser(parser);
  }

  @Override
  public void setSource(JsonNode node) {
    super.setSource(node);
    mapOutput.setParser(parser);
    listOutput.setParser(parser);
  }

  /** Sets the input to the UTF-8 bytes of the given string. */
  public void setSource(String data) throws IOException {
    setSource(data.getBytes(Charsets.UTF_8));
  }

  /** Sets the input to an in-memory byte array. */
  @SuppressWarnings("resource")
  public void setSource(byte[] bytes) throws IOException {
    setSource(new SeekableBAIS(bytes));
  }

  /**
   * Reads the next JSON record from the current source into the writer.
   *
   * @return {@link ReadState#WRITE_SUCCEED} on success,
   *         {@link ReadState#END_OF_STREAM} when input is exhausted, or a
   *         parse-error state when JSON errors are being ignored
   * @throws IOException on unrecoverable read or parse failure
   */
  @Override
  public ReadState write(ComplexWriter writer) throws IOException {
    ReadState readState = null;
    try {
      // Resume from a token left over by a previous call, if any; otherwise
      // advance past whitespace to the first token of the next record.
      JsonToken t = lastSeenJsonToken;
      if (t == null || t == JsonToken.END_OBJECT) {
        t = parser.nextToken();
      }
      while (!parser.hasCurrentToken() && !parser.isClosed()) {
        t = parser.nextToken();
      }
      lastSeenJsonToken = null;
      if (parser.isClosed()) {
        return ReadState.END_OF_STREAM;
      }
      readState = writeToVector(writer, t);
      switch (readState) {
      case END_OF_STREAM:
        break;
      case WRITE_SUCCEED:
        break;
      default:
        throw getExceptionWithContext(UserException.dataReadError(),
            currentFieldName, null).message(
            "Failure while reading JSON. (Got an invalid read state %s )",
            readState.toString()).build(logger);
      }
    } catch (com.fasterxml.jackson.core.JsonParseException ex) {
      if (ignoreJSONParseError()) {
        if (processJSONException() == JsonExceptionProcessingState.END_OF_STREAM) {
          return ReadState.JSON_RECORD_PARSE_EOF_ERROR;
        } else {
          return ReadState.JSON_RECORD_PARSE_ERROR;
        }
      } else {
        throw ex;
      }
    }
    return readState;
  }

  /**
   * Verifies that nothing follows the closing bracket of an unwrapped
   * top-level array; trailing content is an error.
   */
  private void confirmLast() throws IOException {
    parser.nextToken();
    if (!parser.isClosed()) {
      throw getExceptionWithContext(UserException.dataReadError(),
          currentFieldName, null)
          .message(
              "Drill attempted to unwrap a toplevel list "
                  + "in your document. However, it appears that there is trailing content after this top level list. Drill only "
                  + "supports querying a set of distinct maps or a single json array with multiple inner maps.")
          .build(logger);
    }
  }

  /**
   * Dispatches on the record's first token: a map record, an (optionally
   * unwrapped) array record, or the end of an outer array being unwrapped.
   * Any other root token (a bare scalar) is rejected.
   */
  private ReadState writeToVector(ComplexWriter writer, JsonToken t)
      throws IOException {
    switch (t) {
    case START_OBJECT:
      writeDataSwitch(writer.rootAsMap());
      break;
    case START_ARRAY:
      if (inOuterList) {
        // A nested array cannot be a record while we are already unwrapping
        // an outer list.
        throw getExceptionWithContext(UserException.dataReadError(),
            currentFieldName, null)
            .message(
                "The top level of your document must either be a single array of maps or a set "
                    + "of white space delimited maps.").build(logger);
      }
      if (skipOuterList) {
        t = parser.nextToken();
        if (t == JsonToken.START_OBJECT) {
          inOuterList = true;
          writeDataSwitch(writer.rootAsMap());
        } else {
          throw getExceptionWithContext(UserException.dataReadError(),
              currentFieldName, null)
              .message(
                  "The top level of your document must either be a single array of maps or a set "
                      + "of white space delimited maps.").build(logger);
        }
      } else {
        writeDataSwitch(writer.rootAsList());
      }
      break;
    case END_ARRAY:
      if (inOuterList) {
        confirmLast();
        return ReadState.END_OF_STREAM;
      } else {
        throw getExceptionWithContext(UserException.dataReadError(),
            currentFieldName, null).message(
            "Failure while parsing JSON. Ran across unexpected %s.",
            JsonToken.END_ARRAY).build(logger);
      }
    case NOT_AVAILABLE:
      return ReadState.END_OF_STREAM;
    default:
      throw getExceptionWithContext(UserException.dataReadError(),
          currentFieldName, null)
          .message(
              "Failure while parsing JSON. Found token of [%s]. Drill currently only supports parsing "
                  + "json strings that contain either lists or maps. The root object cannot be a scalar.",
              t).build(logger);
    }
    return ReadState.WRITE_SUCCEED;
  }

  /** Routes a map record to typed or all-text parsing based on {@link #allTextMode}. */
  private void writeDataSwitch(MapWriter w) throws IOException {
    if (this.allTextMode) {
      writeDataAllText(w, this.selection, true);
    } else {
      writeData(w, this.selection, true);
    }
  }

  /** Routes a list record to typed or all-text parsing based on {@link #allTextMode}. */
  private void writeDataSwitch(ListWriter w) throws IOException {
    if (this.allTextMode) {
      writeDataAllText(w);
    } else {
      writeData(w);
    }
  }

  /**
   * Consumes and discards the next value (scalar, object, or array) from the
   * parser; used to skip fields that are not projected.
   */
  private void consumeEntireNextValue() throws IOException {
    switch (parser.nextToken()) {
    case START_ARRAY:
    case START_OBJECT:
      parser.skipChildren();
      return;
    default:
      // hit a single value, do nothing as the token was already read
      // in the switch statement
      return;
    }
  }

  /**
   * Parses a JSON object in typed mode, writing each projected field with a
   * vector type inferred from its JSON token.
   *
   * @param map
   * @param selection
   * @param moveForward
   *          Whether or not we should start with using the current token or the
   *          next token. If moveForward = true, we should start with the next
   *          token and ignore the current one.
   * @throws IOException
   */
  private void writeData(MapWriter map, FieldSelection selection,
      boolean moveForward) throws IOException {
    map.start();
    try {
      outside: while (true) {
        JsonToken t;
        if (moveForward) {
          t = parser.nextToken();
        } else {
          t = parser.getCurrentToken();
          moveForward = true;
        }
        if (t == JsonToken.NOT_AVAILABLE || t == JsonToken.END_OBJECT) {
          return;
        }
        assert t == JsonToken.FIELD_NAME : String.format(
            "Expected FIELD_NAME but got %s.", t.name());
        final String fieldName = parser.getText();
        this.currentFieldName = fieldName;
        FieldSelection childSelection = selection.getChild(fieldName);
        if (childSelection.isNeverValid()) {
          // Field is not projected; discard its value without writing.
          consumeEntireNextValue();
          continue outside;
        }
        switch (parser.nextToken()) {
        case START_ARRAY:
          writeData(map.list(fieldName));
          break;
        case START_OBJECT:
          if (!writeMapDataIfTyped(map, fieldName)) {
            writeData(map.map(fieldName), childSelection, false);
          }
          break;
        case END_OBJECT:
          break outside;
        case VALUE_FALSE: {
          map.bit(fieldName).writeBit(0);
          break;
        }
        case VALUE_TRUE: {
          map.bit(fieldName).writeBit(1);
          break;
        }
        case VALUE_NULL:
          // do nothing as we don't have a type.
          break;
        case VALUE_NUMBER_FLOAT:
          map.float8(fieldName).writeFloat8(parser.getDoubleValue());
          break;
        case VALUE_NUMBER_INT:
          if (this.readNumbersAsDouble) {
            map.float8(fieldName).writeFloat8(parser.getDoubleValue());
          } else {
            map.bigInt(fieldName).writeBigInt(parser.getLongValue());
          }
          break;
        case VALUE_STRING:
          handleString(parser, map, fieldName);
          break;
        default:
          throw getExceptionWithContext(UserException.dataReadError(),
              currentFieldName, null).message("Unexpected token %s",
              parser.getCurrentToken()).build(logger);
        }
      }
    } finally {
      map.end();
    }
  }

  /**
   * Parses a JSON object in all-text mode: every scalar (including booleans,
   * numbers, and embedded objects) is written as varchar. Mirrors
   * {@link #writeData(MapWriter, FieldSelection, boolean)}.
   */
  private void writeDataAllText(MapWriter map, FieldSelection selection,
      boolean moveForward) throws IOException {
    map.start();
    // try/finally mirrors writeData(): previously the early return below (and
    // any thrown exception) skipped map.end(), leaving the writer unbalanced.
    try {
      outside: while (true) {
        JsonToken t;
        if (moveForward) {
          t = parser.nextToken();
        } else {
          t = parser.getCurrentToken();
          moveForward = true;
        }
        if (t == JsonToken.NOT_AVAILABLE || t == JsonToken.END_OBJECT) {
          return;
        }
        assert t == JsonToken.FIELD_NAME : String.format(
            "Expected FIELD_NAME but got %s.", t.name());
        final String fieldName = parser.getText();
        this.currentFieldName = fieldName;
        FieldSelection childSelection = selection.getChild(fieldName);
        if (childSelection.isNeverValid()) {
          // Field is not projected; discard its value without writing.
          consumeEntireNextValue();
          continue outside;
        }
        switch (parser.nextToken()) {
        case START_ARRAY:
          writeDataAllText(map.list(fieldName));
          break;
        case START_OBJECT:
          if (!writeMapDataIfTyped(map, fieldName)) {
            writeDataAllText(map.map(fieldName), childSelection, false);
          }
          break;
        case END_OBJECT:
          break outside;
        case VALUE_EMBEDDED_OBJECT:
        case VALUE_FALSE:
        case VALUE_TRUE:
        case VALUE_NUMBER_FLOAT:
        case VALUE_NUMBER_INT:
        case VALUE_STRING:
          // All scalar tokens become varchar in all-text mode.
          handleString(parser, map, fieldName);
          break;
        case VALUE_NULL:
          // do nothing as we don't have a type.
          break;
        default:
          throw getExceptionWithContext(UserException.dataReadError(),
              currentFieldName, null).message("Unexpected token %s",
              parser.getCurrentToken()).build(logger);
        }
      }
    } finally {
      map.end();
    }
  }

  /**
   * Will attempt to take the current value and consume it as an extended value
   * (if extended mode is enabled). Whether extended is enable or disabled, will
   * consume the next token in the stream.
   * @param writer
   * @param fieldName
   * @return true if the value was consumed as an extended type
   * @throws IOException
   */
  private boolean writeMapDataIfTyped(MapWriter writer, String fieldName)
      throws IOException {
    if (extended) {
      return mapOutput.run(writer, fieldName);
    } else {
      parser.nextToken();
      return false;
    }
  }

  /**
   * Will attempt to take the current value and consume it as an extended value
   * (if extended mode is enabled). Whether extended is enable or disabled, will
   * consume the next token in the stream.
   * @param writer
   * @return true if the value was consumed as an extended type
   * @throws IOException
   */
  private boolean writeListDataIfTyped(ListWriter writer) throws IOException {
    if (extended) {
      return listOutput.run(writer);
    } else {
      parser.nextToken();
      return false;
    }
  }

  /** Writes the parser's current text as a varchar value of the named map field. */
  private void handleString(JsonParser parser, MapWriter writer,
      String fieldName) throws IOException {
    writer.varChar(fieldName).writeVarChar(0,
        workingBuffer.prepareVarCharHolder(parser.getText()),
        workingBuffer.getBuf());
  }

  /** Appends the parser's current text as a varchar element of the list. */
  private void handleString(JsonParser parser, ListWriter writer)
      throws IOException {
    writer.varChar().writeVarChar(0,
        workingBuffer.prepareVarCharHolder(parser.getText()),
        workingBuffer.getBuf());
  }

  /**
   * Parses a JSON array in typed mode, appending each element with a vector
   * type inferred from its JSON token. Null elements are rejected (use
   * all-text mode to read them).
   */
  private void writeData(ListWriter list) throws IOException {
    list.startList();
    outside: while (true) {
      try {
        switch (parser.nextToken()) {
        case START_ARRAY:
          writeData(list.list());
          break;
        case START_OBJECT:
          if (!writeListDataIfTyped(list)) {
            writeData(list.map(), FieldSelection.ALL_VALID, false);
          }
          break;
        case END_ARRAY:
          // Remember still-empty arrays so ensureAtLeastOneField() can
          // default-initialize them; then fall through to exit the loop.
          addIfNotInitialized(list);
        case END_OBJECT:
          break outside;
        case VALUE_EMBEDDED_OBJECT:
        case VALUE_FALSE: {
          list.bit().writeBit(0);
          break;
        }
        case VALUE_TRUE: {
          list.bit().writeBit(1);
          break;
        }
        case VALUE_NULL:
          throw UserException
              .unsupportedError()
              .message(
                  "Null values are not supported in lists by default. "
                      + "Please set `store.json.all_text_mode` to true to read lists containing nulls. "
                      + "Be advised that this will treat JSON null values as a string containing the word 'null'.")
              .build(logger);
        case VALUE_NUMBER_FLOAT:
          list.float8().writeFloat8(parser.getDoubleValue());
          break;
        case VALUE_NUMBER_INT:
          if (this.readNumbersAsDouble) {
            list.float8().writeFloat8(parser.getDoubleValue());
          } else {
            list.bigInt().writeBigInt(parser.getLongValue());
          }
          break;
        case VALUE_STRING:
          handleString(parser, list);
          break;
        default:
          throw UserException.dataReadError()
              .message("Unexpected token %s", parser.getCurrentToken())
              .build(logger);
        }
      } catch (Exception e) {
        throw getExceptionWithContext(e, this.currentFieldName, null).build(
            logger);
      }
    }
    list.endList();
  }

  /**
   * Checks that list has not been initialized and adds it to the emptyArrayWriters collection.
   * @param list ListWriter that should be checked
   */
  private void addIfNotInitialized(ListWriter list) {
    if (list.getValueCapacity() == 0) {
      emptyArrayWriters.add(list);
    }
  }

  /**
   * Parses a JSON array in all-text mode: every scalar element (including
   * nulls) is appended as varchar. Mirrors {@link #writeData(ListWriter)}.
   */
  private void writeDataAllText(ListWriter list) throws IOException {
    list.startList();
    outside: while (true) {
      switch (parser.nextToken()) {
      case START_ARRAY:
        writeDataAllText(list.list());
        break;
      case START_OBJECT:
        if (!writeListDataIfTyped(list)) {
          writeDataAllText(list.map(), FieldSelection.ALL_VALID, false);
        }
        break;
      case END_ARRAY:
        // Remember still-empty arrays so ensureAtLeastOneField() can
        // default-initialize them; then fall through to exit the loop.
        addIfNotInitialized(list);
      case END_OBJECT:
        break outside;
      case VALUE_EMBEDDED_OBJECT:
      case VALUE_FALSE:
      case VALUE_TRUE:
      case VALUE_NULL:
      case VALUE_NUMBER_FLOAT:
      case VALUE_NUMBER_INT:
      case VALUE_STRING:
        // All scalar tokens (nulls included) become varchar in all-text mode.
        handleString(parser, list);
        break;
      default:
        throw getExceptionWithContext(UserException.dataReadError(),
            currentFieldName, null).message("Unexpected token %s",
            parser.getCurrentToken()).build(logger);
      }
    }
    list.endList();
  }

  /** Returns the working buffer used for varchar staging. */
  public DrillBuf getWorkBuf() {
    return workingBuffer.getBuf();
  }
}