/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.drill.exec.vector.complex.fn;

import io.netty.buffer.DrillBuf;

import java.io.IOException;
import java.io.InputStream;
import java.util.BitSet;
import java.util.List;

import org.apache.drill.common.exceptions.UserException;
import org.apache.drill.common.expression.PathSegment;
import org.apache.drill.common.expression.SchemaPath;
import org.apache.drill.exec.physical.base.GroupScan;
import org.apache.drill.exec.store.easy.json.reader.BaseJsonProcessor;
import org.apache.drill.exec.vector.complex.fn.VectorOutput.ListVectorOutput;
import org.apache.drill.exec.vector.complex.fn.VectorOutput.MapVectorOutput;
import org.apache.drill.exec.vector.complex.writer.BaseWriter;
import org.apache.drill.exec.vector.complex.writer.BaseWriter.ComplexWriter;
import org.apache.drill.exec.vector.complex.writer.BaseWriter.ListWriter;
import org.apache.drill.exec.vector.complex.writer.BaseWriter.MapWriter;

import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;
import com.fasterxml.jackson.databind.JsonNode;
import com.google.common.base.Charsets;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;

/**
 * Walks the token stream of a Jackson {@link JsonParser} and writes the values
 * it encounters into Drill {@link ComplexWriter}, {@link MapWriter} and
 * {@link ListWriter} instances.
 *
 * <p>Two writing modes exist: the typed mode (booleans become bits, numbers
 * become bigint/float8, strings become varchar) and the "all text" mode, in
 * which every scalar is written as varchar using the parser's text
 * representation.
 */
public class JsonReader extends BaseJsonProcessor {
  private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory
      .getLogger(JsonReader.class);
  public final static int MAX_RECORD_SIZE = 128 * 1024;

  /** Scratch buffer used to stage varchar bytes before they are written. */
  private final WorkingBuffer workingBuffer;
  /** Columns requested by the query; used to create default vectors. */
  private final List<SchemaPath> columns;
  /** When true every scalar value is written as varchar. */
  private final boolean allTextMode;
  private final MapVectorOutput mapOutput;
  private final ListVectorOutput listOutput;
  /** Extended-type handling is always attempted; see the *IfTyped helpers. */
  private final boolean extended = true;
  /** When true (and not in all-text mode) integers are written as float8. */
  private final boolean readNumbersAsDouble;

  /**
   * Collection for tracking empty array writers during reading
   * and storing them for initializing empty arrays
   */
  private final List<ListWriter> emptyArrayWriters = Lists.newArrayList();

  /**
   * Describes whether or not this reader can unwrap a single root array record
   * and treat it like a set of distinct records.
   */
  private final boolean skipOuterList;

  /**
   * Whether the reader is currently in a situation where we are unwrapping an
   * outer list.
   */
  private boolean inOuterList;

  /**
   * The name of the current field being parsed. For Error messages.
   */
  private String currentFieldName;

  private FieldSelection selection;

  /**
   * Creates a reader that selects all columns
   * ({@link GroupScan#ALL_COLUMNS}).
   *
   * @param managedBuf buffer handed to the superclass and the working buffer
   * @param allTextMode whether every scalar is written as varchar
   * @param skipOuterList whether a single root array may be unwrapped
   * @param readNumbersAsDouble whether integers are written as float8
   */
  public JsonReader(DrillBuf managedBuf, boolean allTextMode,
      boolean skipOuterList, boolean readNumbersAsDouble) {
    this(managedBuf, GroupScan.ALL_COLUMNS, allTextMode, skipOuterList,
        readNumbersAsDouble);
  }

  /**
   * Creates a reader restricted to the given columns.
   *
   * @param managedBuf buffer handed to the superclass and the working buffer
   * @param columns requested columns; must be non-null and non-empty
   * @param allTextMode whether every scalar is written as varchar
   * @param skipOuterList whether a single root array may be unwrapped
   * @param readNumbersAsDouble whether integers are written as float8
   * @throws NullPointerException if {@code columns} is null
   */
  public JsonReader(DrillBuf managedBuf, List<SchemaPath> columns,
      boolean allTextMode, boolean skipOuterList, boolean readNumbersAsDouble) {
    super(managedBuf);
    // The null check must not live inside the assert: assert expressions are
    // not evaluated when assertions are disabled.
    Preconditions.checkNotNull(columns, "columns");
    assert columns.size() > 0 : "JSON record reader requires at least one column";
    this.selection = FieldSelection.getFieldSelection(columns);
    this.workingBuffer = new WorkingBuffer(managedBuf);
    this.skipOuterList = skipOuterList;
    this.allTextMode = allTextMode;
    this.columns = columns;
    this.mapOutput = new MapVectorOutput(workingBuffer);
    this.listOutput = new ListVectorOutput(workingBuffer);
    this.currentFieldName = "<none>";
    this.readNumbersAsDouble = readNumbersAsDouble;
  }

  /**
   * Creates default-typed fields (varchar in all-text mode, integer otherwise)
   * for requested columns whose enclosing map writer is still empty, and gives
   * a default element type to every tracked empty array writer that is still
   * uninitialized.
   *
   * @param writer the root writer of the batch being populated
   */
  @SuppressWarnings("resource")
  @Override
  public void ensureAtLeastOneField(ComplexWriter writer) {
    List<BaseWriter.MapWriter> writerList = Lists.newArrayList();
    List<PathSegment> fieldPathList = Lists.newArrayList();
    BitSet emptyStatus = new BitSet(columns.size());

    // first pass: collect which fields are empty
    for (int i = 0; i < columns.size(); i++) {
      SchemaPath sp = columns.get(i);
      PathSegment fieldPath = sp.getRootSegment();
      BaseWriter.MapWriter fieldWriter = writer.rootAsMap();
      // Descend through nested map segments, stopping before an array child.
      while (fieldPath.getChild() != null && !fieldPath.getChild().isArray()) {
        fieldWriter = fieldWriter.map(fieldPath.getNameSegment().getPath());
        fieldPath = fieldPath.getChild();
      }
      writerList.add(fieldWriter);
      fieldPathList.add(fieldPath);
      if (fieldWriter.isEmptyMap()) {
        emptyStatus.set(i, true);
      }
      if (i == 0 && !allTextMode) {
        // when allTextMode is false, there is not much benefit to producing all
        // the empty fields; just produce 1 field. The reason is that the type of
        // the fields is unknown, so if we produce multiple Integer fields by
        // default, a subsequent batch that contains non-integer fields will
        // error out in any case. Whereas, with allTextMode true, we are sure
        // that all fields are going to be treated as varchar, so it makes sense
        // to produce all the fields, and in fact is necessary in order to avoid
        // schema change exceptions by downstream operators.
        break;
      }
    }

    // second pass: create default typed vectors corresponding to empty fields
    // Note: this is not easily do-able in 1 pass because the same fieldWriter
    // may be shared by multiple fields whereas we want to keep track of all
    // fields independently, so we rely on the emptyStatus.
    for (int j = 0; j < fieldPathList.size(); j++) {
      BaseWriter.MapWriter fieldWriter = writerList.get(j);
      PathSegment fieldPath = fieldPathList.get(j);
      if (emptyStatus.get(j)) {
        if (allTextMode) {
          fieldWriter.varChar(fieldPath.getNameSegment().getPath());
        } else {
          fieldWriter.integer(fieldPath.getNameSegment().getPath());
        }
      }
    }

    for (ListWriter field : emptyArrayWriters) {
      // checks that array has not been initialized
      if (field.getValueCapacity() == 0) {
        if (allTextMode) {
          field.varChar();
        } else {
          field.integer();
        }
      }
    }
  }

  /**
   * Sets the source to the {@code [start, end)} range of the given buffer, as
   * exposed by {@link DrillBufInputStream#getStream}.
   */
  public void setSource(int start, int end, DrillBuf buf) throws IOException {
    setSource(DrillBufInputStream.getStream(start, end, buf));
  }

  /**
   * Sets the input stream to read from and hands the resulting parser to the
   * map and list vector outputs.
   */
  @Override
  public void setSource(InputStream is) throws IOException {
    super.setSource(is);
    mapOutput.setParser(parser);
    listOutput.setParser(parser);
  }

  /**
   * Sets an already-parsed JSON tree as the source and hands the resulting
   * parser to the map and list vector outputs.
   */
  @Override
  public void setSource(JsonNode node) {
    super.setSource(node);
    mapOutput.setParser(parser);
    listOutput.setParser(parser);
  }

  /** Sets the source to the UTF-8 bytes of the given string. */
  public void setSource(String data) throws IOException {
    setSource(data.getBytes(Charsets.UTF_8));
  }

  /** Sets the source to the given byte array. */
  @SuppressWarnings("resource")
  public void setSource(byte[] bytes) throws IOException {
    setSource(new SeekableBAIS(bytes));
  }

  /**
   * Reads the next top-level JSON value from the parser and writes it through
   * the given writer.
   *
   * @param writer root writer to populate
   * @return {@code END_OF_STREAM} when the parser is closed or the outer list
   *         ended, {@code WRITE_SUCCEED} when a record was written, or one of
   *         the {@code JSON_RECORD_PARSE_*} states when a parse error occurred
   *         and {@code ignoreJSONParseError()} is true
   * @throws IOException on I/O failure, or the original
   *         {@code JsonParseException} when parse errors are not ignored
   */
  @Override
  public ReadState write(ComplexWriter writer) throws IOException {

    ReadState readState = null;
    try {
      // Resume from the token remembered by the superclass, if any.
      JsonToken t = lastSeenJsonToken;
      if (t == null || t == JsonToken.END_OBJECT) {
        t = parser.nextToken();
      }
      while (!parser.hasCurrentToken() && !parser.isClosed()) {
        t = parser.nextToken();
      }
      lastSeenJsonToken = null;

      if (parser.isClosed()) {
        return ReadState.END_OF_STREAM;
      }

      readState = writeToVector(writer, t);

      switch (readState) {
      case END_OF_STREAM:
        break;
      case WRITE_SUCCEED:
        break;
      default:
        throw getExceptionWithContext(UserException.dataReadError(),
            currentFieldName, null).message(
            "Failure while reading JSON. (Got an invalid read state %s )",
            readState.toString()).build(logger);
      }
    } catch (com.fasterxml.jackson.core.JsonParseException ex) {
      if (ignoreJSONParseError()) {
        if (processJSONException() == JsonExceptionProcessingState.END_OF_STREAM) {
          return ReadState.JSON_RECORD_PARSE_EOF_ERROR;
        } else {
          return ReadState.JSON_RECORD_PARSE_ERROR;
        }
      } else {
        throw ex;
      }
    }
    return readState;
  }

  /**
   * Called after the outer list's END_ARRAY: advances the parser once and
   * fails if the parser is not closed afterwards, i.e. if content follows the
   * unwrapped top-level list.
   */
  private void confirmLast() throws IOException {
    parser.nextToken();
    if (!parser.isClosed()) {
      throw getExceptionWithContext(UserException.dataReadError(),
          currentFieldName, null)
          .message(
              "Drill attempted to unwrap a toplevel list "
                  + "in your document. However, it appears that there is trailing content after this top level list. Drill only "
                  + "supports querying a set of distinct maps or a single json array with multiple inner maps.")
          .build(logger);
    }
  }

  /**
   * Dispatches on the first token of a top-level value.
   *
   * @param writer root writer to populate
   * @param t the current token
   * @return {@code WRITE_SUCCEED} after writing a map or list,
   *         {@code END_OF_STREAM} on NOT_AVAILABLE or at the end of an
   *         unwrapped outer list
   */
  private ReadState writeToVector(ComplexWriter writer, JsonToken t)
      throws IOException {
    switch (t) {
    case START_OBJECT:
      writeDataSwitch(writer.rootAsMap());
      break;
    case START_ARRAY:
      if (inOuterList) {
        // A second array start while already unwrapping the outer list.
        throw getExceptionWithContext(UserException.dataReadError(),
            currentFieldName, null)
            .message(
                "The top level of your document must either be a single array of maps or a set "
                    + "of white space delimited maps.").build(logger);
      }

      if (skipOuterList) {
        t = parser.nextToken();
        if (t == JsonToken.START_OBJECT) {
          inOuterList = true;
          writeDataSwitch(writer.rootAsMap());
        } else {
          throw getExceptionWithContext(UserException.dataReadError(),
              currentFieldName, null)
              .message(
                  "The top level of your document must either be a single array of maps or a set "
                      + "of white space delimited maps.").build(logger);
        }

      } else {
        writeDataSwitch(writer.rootAsList());
      }
      break;
    case END_ARRAY:

      if (inOuterList) {
        confirmLast();
        return ReadState.END_OF_STREAM;
      } else {
        throw getExceptionWithContext(UserException.dataReadError(),
            currentFieldName, null).message(
            "Failure while parsing JSON. Ran across unexpected %s.",
            JsonToken.END_ARRAY).build(logger);
      }

    case NOT_AVAILABLE:
      return ReadState.END_OF_STREAM;
    default:
      throw getExceptionWithContext(UserException.dataReadError(),
          currentFieldName, null)
          .message(
              "Failure while parsing JSON. Found token of [%s]. Drill currently only supports parsing "
                  + "json strings that contain either lists or maps. The root object cannot be a scalar.",
              t).build(logger);
    }

    return ReadState.WRITE_SUCCEED;

  }

  /** Writes a root map using the all-text or the typed writer, per mode. */
  private void writeDataSwitch(MapWriter w) throws IOException {
    if (this.allTextMode) {
      writeDataAllText(w, this.selection, true);
    } else {
      writeData(w, this.selection, true);
    }
  }

  /** Writes a root list using the all-text or the typed writer, per mode. */
  private void writeDataSwitch(ListWriter w) throws IOException {
    if (this.allTextMode) {
      writeDataAllText(w);
    } else {
      writeData(w);
    }
  }

  /**
   * Advances to the next token and, if it starts an array or object, skips
   * all of its children; a scalar is consumed by the advance itself.
   */
  private void consumeEntireNextValue() throws IOException {
    switch (parser.nextToken()) {
    case START_ARRAY:
    case START_OBJECT:
      parser.skipChildren();
      return;
    default:
      // hit a single value, do nothing as the token was already read
      // in the switch statement
      return;
    }
  }

  /**
   * Writes the fields of one JSON object into the given map writer using
   * typed vectors. {@code map.end()} is always invoked, including on early
   * return and on exception.
   *
   * @param map
   *          writer receiving the object's fields
   * @param selection
   *          field selection used to skip fields that are never valid
   * @param moveForward
   *          Whether or not we should start with using the current token or the
   *          next token. If moveForward = true, we should start with the next
   *          token and ignore the current one.
   * @throws IOException
   *           if the parser fails
   */
  private void writeData(MapWriter map, FieldSelection selection,
      boolean moveForward) throws IOException {
    //
    map.start();
    try {
      outside: while (true) {

        JsonToken t;
        if (moveForward) {
          t = parser.nextToken();
        } else {
          t = parser.getCurrentToken();
          moveForward = true;
        }

        if (t == JsonToken.NOT_AVAILABLE || t == JsonToken.END_OBJECT) {
          return;
        }

        assert t == JsonToken.FIELD_NAME : String.format(
            "Expected FIELD_NAME but got %s.", t.name());

        final String fieldName = parser.getText();
        this.currentFieldName = fieldName;
        FieldSelection childSelection = selection.getChild(fieldName);
        if (childSelection.isNeverValid()) {
          consumeEntireNextValue();
          continue outside;
        }

        switch (parser.nextToken()) {
        case START_ARRAY:
          writeData(map.list(fieldName));
          break;
        case START_OBJECT:
          // A false return means the object is parsed as an ordinary nested
          // map, starting from the parser's current token.
          if (!writeMapDataIfTyped(map, fieldName)) {
            writeData(map.map(fieldName), childSelection, false);
          }
          break;
        case END_OBJECT:
          break outside;

        case VALUE_FALSE: {
          map.bit(fieldName).writeBit(0);
          break;
        }
        case VALUE_TRUE: {
          map.bit(fieldName).writeBit(1);
          break;
        }
        case VALUE_NULL:
          // do nothing as we don't have a type.
          break;
        case VALUE_NUMBER_FLOAT:
          map.float8(fieldName).writeFloat8(parser.getDoubleValue());
          break;
        case VALUE_NUMBER_INT:
          if (this.readNumbersAsDouble) {
            map.float8(fieldName).writeFloat8(parser.getDoubleValue());
          } else {
            map.bigInt(fieldName).writeBigInt(parser.getLongValue());
          }
          break;
        case VALUE_STRING:
          handleString(parser, map, fieldName);
          break;

        default:
          throw getExceptionWithContext(UserException.dataReadError(),
              currentFieldName, null).message("Unexpected token %s",
              parser.getCurrentToken()).build(logger);
        }

      }
    } finally {
      map.end();
    }

  }

  /**
   * All-text counterpart of
   * {@link #writeData(MapWriter, FieldSelection, boolean)}: every scalar
   * except null is written as varchar. {@code map.end()} is always invoked,
   * including on early return and on exception, matching the typed variant.
   *
   * @param map
   *          writer receiving the object's fields
   * @param selection
   *          field selection used to skip fields that are never valid
   * @param moveForward
   *          if true, start with the next token; otherwise start with the
   *          parser's current token
   * @throws IOException
   *           if the parser fails
   */
  private void writeDataAllText(MapWriter map, FieldSelection selection,
      boolean moveForward) throws IOException {
    //
    map.start();
    // try/finally so that the END_OBJECT early return below (the normal way
    // out of this loop) still pairs map.start() with map.end().
    try {
      outside: while (true) {

        JsonToken t;

        if (moveForward) {
          t = parser.nextToken();
        } else {
          t = parser.getCurrentToken();
          moveForward = true;
        }

        if (t == JsonToken.NOT_AVAILABLE || t == JsonToken.END_OBJECT) {
          return;
        }

        assert t == JsonToken.FIELD_NAME : String.format(
            "Expected FIELD_NAME but got %s.", t.name());

        final String fieldName = parser.getText();
        this.currentFieldName = fieldName;
        FieldSelection childSelection = selection.getChild(fieldName);
        if (childSelection.isNeverValid()) {
          consumeEntireNextValue();
          continue outside;
        }

        switch (parser.nextToken()) {
        case START_ARRAY:
          writeDataAllText(map.list(fieldName));
          break;
        case START_OBJECT:
          if (!writeMapDataIfTyped(map, fieldName)) {
            writeDataAllText(map.map(fieldName), childSelection, false);
          }
          break;
        case END_OBJECT:
          break outside;

        case VALUE_EMBEDDED_OBJECT:
        case VALUE_FALSE:
        case VALUE_TRUE:
        case VALUE_NUMBER_FLOAT:
        case VALUE_NUMBER_INT:
        case VALUE_STRING:
          handleString(parser, map, fieldName);
          break;
        case VALUE_NULL:
          // do nothing as we don't have a type.
          break;

        default:
          throw getExceptionWithContext(UserException.dataReadError(),
              currentFieldName, null).message("Unexpected token %s",
              parser.getCurrentToken()).build(logger);
        }
      }
    } finally {
      map.end();
    }

  }

  /**
   * Will attempt to take the current value and consume it as an extended value
   * (if extended mode is enabled). Whether extended is enabled or disabled,
   * will consume the next token in the stream.
   *
   * @param writer
   *          map writer that receives the field
   * @param fieldName
   *          name of the field being written
   * @return the result of {@code mapOutput.run} when extended mode is on,
   *         otherwise false; callers treat false as "parse as an ordinary map"
   * @throws IOException
   *           if the parser fails
   */
  private boolean writeMapDataIfTyped(MapWriter writer, String fieldName)
      throws IOException {
    if (extended) {
      return mapOutput.run(writer, fieldName);
    } else {
      parser.nextToken();
      return false;
    }
  }

  /**
   * Will attempt to take the current value and consume it as an extended value
   * (if extended mode is enabled). Whether extended is enabled or disabled,
   * will consume the next token in the stream.
   *
   * @param writer
   *          list writer that receives the element
   * @return the result of {@code listOutput.run} when extended mode is on,
   *         otherwise false; callers treat false as "parse as an ordinary map"
   * @throws IOException
   *           if the parser fails
   */
  private boolean writeListDataIfTyped(ListWriter writer) throws IOException {
    if (extended) {
      return listOutput.run(writer);
    } else {
      parser.nextToken();
      return false;
    }
  }

  /** Writes the parser's current text as a varchar field of the map. */
  private void handleString(JsonParser parser, MapWriter writer,
      String fieldName) throws IOException {
    writer.varChar(fieldName).writeVarChar(0,
        workingBuffer.prepareVarCharHolder(parser.getText()),
        workingBuffer.getBuf());
  }

  /** Writes the parser's current text as a varchar element of the list. */
  private void handleString(JsonParser parser, ListWriter writer)
      throws IOException {
    writer.varChar().writeVarChar(0,
        workingBuffer.prepareVarCharHolder(parser.getText()),
        workingBuffer.getBuf());
  }

  /**
   * Writes the elements of one JSON array into the given list writer using
   * typed vectors. Null elements are rejected with an unsupported-operation
   * error. Any exception raised while handling an element is rethrown via
   * {@code getExceptionWithContext} with the current field name.
   *
   * @param list writer receiving the array elements
   * @throws IOException declared for the parser calls
   */
  private void writeData(ListWriter list) throws IOException {
    list.startList();
    outside: while (true) {
      try {
        switch (parser.nextToken()) {
        case START_ARRAY:
          writeData(list.list());
          break;
        case START_OBJECT:
          if (!writeListDataIfTyped(list)) {
            writeData(list.map(), FieldSelection.ALL_VALID, false);
          }
          break;
        case END_ARRAY:
          addIfNotInitialized(list);
          // intentional fall-through: the array is finished
        case END_OBJECT:
          break outside;

        // NOTE(review): VALUE_EMBEDDED_OBJECT falls through to VALUE_FALSE and
        // is written as bit 0, whereas the map variant rejects it as an
        // unexpected token -- confirm this is intended.
        case VALUE_EMBEDDED_OBJECT:
        case VALUE_FALSE: {
          list.bit().writeBit(0);
          break;
        }
        case VALUE_TRUE: {
          list.bit().writeBit(1);
          break;
        }
        case VALUE_NULL:
          throw UserException
              .unsupportedError()
              .message(
                  "Null values are not supported in lists by default. "
                      + "Please set `store.json.all_text_mode` to true to read lists containing nulls. "
                      + "Be advised that this will treat JSON null values as a string containing the word 'null'.")
              .build(logger);
        case VALUE_NUMBER_FLOAT:
          list.float8().writeFloat8(parser.getDoubleValue());
          break;
        case VALUE_NUMBER_INT:
          if (this.readNumbersAsDouble) {
            list.float8().writeFloat8(parser.getDoubleValue());
          } else {
            list.bigInt().writeBigInt(parser.getLongValue());
          }
          break;
        case VALUE_STRING:
          handleString(parser, list);
          break;
        default:
          throw UserException.dataReadError()
              .message("Unexpected token %s", parser.getCurrentToken())
              .build(logger);
        }
      } catch (Exception e) {
        throw getExceptionWithContext(e, this.currentFieldName, null).build(
            logger);
      }
    }
    list.endList();

  }

  /**
   * Checks that list has not been initialized and adds it to the
   * emptyArrayWriters collection.
   *
   * @param list ListWriter that should be checked
   */
  private void addIfNotInitialized(ListWriter list) {
    if (list.getValueCapacity() == 0) {
      emptyArrayWriters.add(list);
    }
  }

  /**
   * All-text counterpart of {@link #writeData(ListWriter)}: every scalar,
   * including null, is written as varchar using the parser's text.
   *
   * @param list writer receiving the array elements
   * @throws IOException if the parser fails
   */
  private void writeDataAllText(ListWriter list) throws IOException {
    list.startList();
    outside: while (true) {

      switch (parser.nextToken()) {
      case START_ARRAY:
        writeDataAllText(list.list());
        break;
      case START_OBJECT:
        if (!writeListDataIfTyped(list)) {
          writeDataAllText(list.map(), FieldSelection.ALL_VALID, false);
        }
        break;
      case END_ARRAY:
        addIfNotInitialized(list);
        // intentional fall-through: the array is finished
      case END_OBJECT:
        break outside;

      case VALUE_EMBEDDED_OBJECT:
      case VALUE_FALSE:
      case VALUE_TRUE:
      case VALUE_NULL:
      case VALUE_NUMBER_FLOAT:
      case VALUE_NUMBER_INT:
      case VALUE_STRING:
        handleString(parser, list);
        break;
      default:
        throw getExceptionWithContext(UserException.dataReadError(),
            currentFieldName, null).message("Unexpected token %s",
            parser.getCurrentToken()).build(logger);
      }
    }
    list.endList();

  }

  /** Returns the buffer currently held by the working buffer. */
  public DrillBuf getWorkBuf() {
    return workingBuffer.getBuf();
  }

}