/**
* diqube: Distributed Query Base.
*
* Copyright (C) 2015 Bastian Gloeckle
*
* This file is part of diqube.
*
* diqube is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.diqube.loader;
import java.io.IOException;
import java.io.InputStream;
import java.io.RandomAccessFile;
import java.nio.channels.FileChannel.MapMode;
import java.util.Arrays;
import java.util.Collection;
import java.util.Deque;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.Set;
import java.util.SortedMap;
import java.util.Spliterator;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentLinkedDeque;
import java.util.function.Consumer;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import javax.inject.Inject;
import org.diqube.context.AutoInstatiate;
import org.diqube.data.column.ColumnType;
import org.diqube.data.column.StandardColumnShard;
import org.diqube.data.table.TableFactory;
import org.diqube.data.table.TableShard;
import org.diqube.loader.JsonLoader.Parser.Handler;
import org.diqube.loader.columnshard.ColumnShardBuilderFactory;
import org.diqube.loader.columnshard.ColumnShardBuilderManager;
import org.diqube.loader.util.ParallelLoadAndTransposeHelper;
import org.diqube.name.RepeatedColumnNameGenerator;
import org.diqube.threads.ExecutorManager;
import org.diqube.util.BigByteBuffer;
import org.diqube.util.HashingBatchCollector;
import org.diqube.util.exception.WrappingException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;
/**
* {@link Loader} which loads data from JSON files.
*
* <p>
* This Loader can load hierarchical data.
*
* <p>
* The JSON file must consist of an array, which contains a complex object for each logical "row" in the resulting
 * table. The complex object can in turn contain other complex objects (hierarchy) and can itself contain arrays
 * (repeated fields). Currently, though, each top-level object needs to recursively be made up of exactly the same
 * fields ("optional" fields are not supported yet).
*
* <p>
* This loader will return only one TableShard for a whole JSON input file.
*
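 * <p>
 * A minimal sketch of an input this loader accepts (assuming the {@link RepeatedColumnNameGenerator} in use produces
 * repeated column names of the form <code>cars[0].color</code> and length columns of the form
 * <code>cars[length]</code>):
 *
 * <pre>
 * [ { "name": "foo", "cars": [ { "color": "red" }, { "color": "blue" } ] },
 *   { "name": "bar", "cars": [ { "color": "green" }, { "color": "black" } ] } ]
 * </pre>
 *
 * This input would lead to the columns <code>name</code> (STRING), <code>cars[0].color</code> and
 * <code>cars[1].color</code> (STRING) and the automatically added length column <code>cars[length]</code> (LONG).
 *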
* TODO #14 support optional fields.
*
* @author Bastian Gloeckle
*/
@AutoInstatiate
public class JsonLoader implements Loader {
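/**
 * Number of rows that are batched into one String[][] by the {@link HashingBatchCollector} before being handed over
 * to the {@link ParallelLoadAndTransposeHelper}.
 */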
public static final int BUCKET_SIZE = 1_000;
private static final Logger logger = LoggerFactory.getLogger(JsonLoader.class);
@Inject
private ColumnShardBuilderFactory columnShardBuilderManagerFactory;
@Inject
private TableFactory tableFactory;
@Inject
private ExecutorManager executorManager;
@Inject
private RepeatedColumnNameGenerator repeatedColNames;
@Override
public Collection<TableShard> load(long firstRowId, String filename, String tableName, LoaderColumnInfo columnInfo)
throws LoadException {
logger.info("Reading data for table '{}' from '{}'.", new Object[] { tableName, filename });
try (RandomAccessFile f = new RandomAccessFile(filename, "r")) {
BigByteBuffer buf = new BigByteBuffer(f.getChannel(), MapMode.READ_ONLY, b -> b.load());
return load(firstRowId, buf, tableName, columnInfo);
} catch (IOException e) {
throw new LoadException("Could not load " + filename, e);
}
}
@Override
public Collection<TableShard> load(long firstRowId, BigByteBuffer jsonBuffer, String tableName,
LoaderColumnInfo columnInfo) throws LoadException {
JsonFactory factory = new JsonFactory();
// parse the jsonBuffer and identify all columns, their types and repeated columns.
Map<String, ColumnType> columnTypes = new HashMap<>();
Set<String> repeatedCols = new HashSet<>();
NavigableMap<Long, Long> objectLocations = new TreeMap<>();
logger.info("Inspecting JSON in order to find all columns...");
findColumnInfo(factory, jsonBuffer, columnTypes, repeatedCols, objectLocations);
// TODO #15 validate that we did not identify different things throughout the same input file.
// put colTypes into columnInfo, throw exception if something different was identified than what was specified.
for (String colName : columnTypes.keySet()) {
// TODO #15 introduce import data specificity - when default is "double" but "long" identified, make it "double"
if (columnInfo.isDefaultDataType(colName))
columnInfo.registerColumnType(colName, columnTypes.get(colName));
else if (!columnInfo.getFinalColumnType(colName).equals(ColumnType.STRING) // overriding to STRING type is allowed
&& !columnInfo.getFinalColumnType(colName).equals(columnTypes.get(colName)))
throw new LoadException(
"Column '" + colName + "': Automatically identified type to be " + columnTypes.get(colName) + ", but "
+ columnInfo.getFinalColumnType(colName) + " was specified. This is invalid.");
}
logger.info("Found {} columns.", columnTypes.size());
Stream<Parser> stream = StreamSupport.stream(
new JsonSpliterator(objectLocations, //
factory, //
repeatedColNames, //
jsonBuffer, //
objectLocations.firstKey(), //
objectLocations.lastEntry().getValue() + 1), //
true);
ColumnShardBuilderManager columnBuilderManager =
columnShardBuilderManagerFactory.createColumnShardBuilderManager(columnInfo, firstRowId);
String[] colNames = columnTypes.keySet().stream().toArray(String[]::new);
Map<String, Integer> colToColIndex = new HashMap<>();
for (int i = 0; i < colNames.length; i++)
colToColIndex.put(colNames[i], i);
ParallelLoadAndTransposeHelper transposer =
new ParallelLoadAndTransposeHelper(executorManager, columnInfo, columnBuilderManager, colNames, tableName);
logger.info("Loading data and transforming to temporary columnar representation...");
try {
transposer.transpose(firstRowId, (ConcurrentLinkedDeque<String[][]> rowWiseTarget) -> {
stream.parallel().map(parser -> {
try {
return parseOneEntry(parser, colNames, repeatedCols, colToColIndex);
} catch (LoadException e) {
throw new WrappingException(e);
}
}).collect(new HashingBatchCollector<String[]>(BUCKET_SIZE, //
(len) -> new String[len][], //
a -> {
rowWiseTarget.add(a);
logger.trace("Providing new batch with {} rows", a.length);
}));
});
} catch (WrappingException e) {
throw (LoadException) e.getWrappedException();
}
logger.info("Read data for table {}. Compressing and creating final representation...", tableName);
// set "0" default value for the length columns
for (String repeatedColName : repeatedCols)
columnBuilderManager.fillEmptyRowsWithValue(repeatedColNames.repeatedLength(repeatedColName), 0L);
// Build the columns.
List<StandardColumnShard> columns = new LinkedList<>();
for (String colName : columnBuilderManager.getAllColumnsWithValues()) {
StandardColumnShard columnShard = columnBuilderManager.buildAndFree(colName);
columns.add(columnShard);
}
logger.info("Columns for new table shard of table {} created, creating TableShard...", tableName);
TableShard tableShard = tableFactory.createDefaultTableShard(tableName, columns);
logger.info("Table shard of table {} created successfully, it contains {} rows starting from rowId {}", tableName,
tableShard.getNumberOfRowsInShard(), tableShard.getLowestRowId());
return Arrays.asList(tableShard);
}
/**
* Parses one top level object in the JSON and produces a corresponding String[].
*
* TODO support other than String[].
*
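 * <p>
 * For illustration (the actual column names depend on the {@link RepeatedColumnNameGenerator} in use): with
 * <code>colNames = { "name", "cars[0].color", "cars[length]" }</code>, the top level object
 * <code>{ "name": "foo", "cars": [ { "color": "red" } ] }</code> would be parsed into the array
 * <code>{ "foo", "red", "1" }</code>.
 *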
* @param parser
* The {@link Parser} that is prepared to load the top level input object.
* @param colNames
 * Names of the columns. The result of this method is an array in which the entry at index i contains the value
 * of column colNames[i], i.e. it uses the same ordering as this colNames parameter.
* @param repeatedCols
* Set of repeated columns.
* @param colToColIndex
* Map from column name to the index in colNames.
* @return String[] containing the values of the object for each column. Ordering is the same as colNames.
*/
private String[] parseOneEntry(Parser parser, String[] colNames, Set<String> repeatedCols,
Map<String, Integer> colToColIndex) throws LoadException {
String[] res = new String[colNames.length];
parser.parse(new Handler() {
@Override
public boolean isArray(String colName) {
return repeatedCols.contains(colName);
}
@Override
public void valueString(String colName, JsonParser parser) throws LoadException {
try {
res[colToColIndex.get(colName)] = parser.getValueAsString().intern();
} catch (IOException e) {
throw new LoadException("Could not parse value of column " + colName + ": " + e.getMessage(), e);
}
}
@Override
public void valueLong(String colName, JsonParser parser) throws LoadException {
try {
res[colToColIndex.get(colName)] = Long.toString(parser.getValueAsLong()).intern();
} catch (IOException e) {
throw new LoadException("Could not parse value of column " + colName + ": " + e.getMessage(), e);
}
}
@Override
public void valueDouble(String colName, JsonParser parser) throws LoadException {
try {
res[colToColIndex.get(colName)] = Double.toString(parser.getValueAsDouble()).intern();
} catch (IOException e) {
throw new LoadException("Could not parse value of column " + colName + ": " + e.getMessage(), e);
}
}
@Override
public void endArray(String colName, int length) throws LoadException {
String lengthCol = repeatedColNames.repeatedLength(colName);
res[colToColIndex.get(lengthCol)] = Integer.toString(length);
}
});
return res;
}
/**
* Parse the jsonBuffer and find information about all the columns used in the input.
*
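 * <p>
 * Example for the object positions (byte offsets are illustrative only): for an input like
 * <code>[ { ... }, { ... } ]</code> where the first top level object spans bytes 2-20 and the second spans bytes
 * 23-41, resObjectPositions will end up containing the entries 2=20 and 23=41.
 *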
* @param factory
* The factory which can be passed to {@link Parser}.
* @param jsonBuffer
* The buffer containing the raw JSON.
* @param resColumnTypes
* Result (this object will be filled!): The data type for all found columns (key: column name).
* @param resRepeatedCols
* Result (this object will be filled!): The column names of the columns which are repeated (=arrays).
* @param resObjectPositions
 * Result (this object will be filled!): A map that identifies the byte locations of top level objects in the
 * input stream. The key is the byte index at which a top level object begins, the value is the index at which
 * it ends. As the "top level objects" (= those that will be used as rows for our new table) are contained in a
 * standard JSON array and there might be some whitespace between the objects, there might be gaps between the
 * entries. When cutting the jsonBuffer according to these values, each chunk will be parseable as a single top
 * level object (= a single row in the new table).
*/
private void findColumnInfo(JsonFactory factory, BigByteBuffer jsonBuffer, Map<String, ColumnType> resColumnTypes,
Set<String> resRepeatedCols, NavigableMap<Long, Long> resObjectPositions) throws LoadException {
new Parser(factory, repeatedColNames, jsonBuffer.createInputStream()).parse(new Handler() {
@Override
public boolean isArray(String colName) {
return resRepeatedCols.contains(colName);
}
@Override
public void startArray(String colName) {
resRepeatedCols.add(colName);
// length column.
resColumnTypes.put(repeatedColNames.repeatedLength(colName), ColumnType.LONG);
}
@Override
public void topLevelObjectStart(Long pos) {
resObjectPositions.put(pos, null);
}
@Override
public void topLevelObjectEnd(Long pos) {
resObjectPositions.put(resObjectPositions.lastKey(), pos);
}
@Override
public void valueString(String colName, JsonParser parser) {
resColumnTypes.put(colName, ColumnType.STRING);
}
@Override
public void valueLong(String colName, JsonParser parser) {
if (!resColumnTypes.containsKey(colName)) // ignore LONG if there is a DOUBLE already.
resColumnTypes.put(colName, ColumnType.LONG);
}
@Override
public void valueDouble(String colName, JsonParser parser) {
resColumnTypes.put(colName, ColumnType.DOUBLE);
}
});
}
/**
 * Encapsulates parsing of a JSON input and the creation of valid column names. A {@link Handler} is called at
 * interesting places to do the actual work.
*/
public static class Parser {
private JsonFactory factory;
private InputStream jsonStream;
private RepeatedColumnNameGenerator repeatedColNames;
public Parser(JsonFactory factory, RepeatedColumnNameGenerator repeatedColNames, InputStream jsonStream) {
this.factory = factory;
this.repeatedColNames = repeatedColNames;
this.jsonStream = jsonStream;
}
/**
* Parse the JSON stream, create correct column names and call the {@link Handler} to handle those.
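 *
 * <p>
 * Names of nested fields are concatenated with '.': for a top level object <code>{ "a": { "b": 1 } }</code>,
 * {@link Handler#valueLong(String, JsonParser)} is called with colName <code>a.b</code>.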
*/
public void parse(Handler handler) throws LoadException {
JsonParser parser = null;
try {
parser = factory.createParser(jsonStream);
Deque<String> colNameStack = new LinkedList<>();
boolean directlyInArray = false;
Deque<Integer> nextArrayIndexStack = new LinkedList<>();
JsonToken token;
while ((token = parser.nextToken()) != null) {
switch (token) {
case START_ARRAY:
if (directlyInArray)
throw new LoadException("Cannot load JSON because it contains a multi-dimensional array.");
if (!colNameStack.isEmpty()) {
directlyInArray = true;
nextArrayIndexStack.add(0); // first index in array is 0
handler.startArray(colNameStack.getLast());
}
break;
case END_ARRAY:
Integer length = nextArrayIndexStack.pollLast();
directlyInArray = false; // no multi-dimensional arrays, therefore we are not in an array any more.
String arrayName = colNameStack.pollLast(); // was pushed by FIELD_NAME; in the non-array case this is
// polled by VALUE_*.
if (arrayName != null && !arrayName.isEmpty())
handler.endArray(arrayName, length);
break;
case START_OBJECT:
if (directlyInArray) {
int idx = nextArrayIndexStack.pollLast();
String colName = repeatedColNames.repeatedAtIndex(colNameStack.getLast(), idx);
colNameStack.add(colName);
nextArrayIndexStack.add(idx + 1);
directlyInArray = false;
}
if (colNameStack.isEmpty())
handler.topLevelObjectStart(parser.getCurrentLocation().getByteOffset() - 1);
break;
case END_OBJECT:
if (colNameStack.isEmpty())
handler.topLevelObjectEnd(parser.getCurrentLocation().getByteOffset());
else if (colNameStack.size() >= 2) {
// peek to the element before the last in colNameStack - if that is an array, then we need to poll away
// the element that we added in START_OBJECT.
Iterator<String> it = colNameStack.descendingIterator();
it.next();
if (handler.isArray(it.next())) {
directlyInArray = true;
colNameStack.pollLast();
}
}
break;
case FIELD_NAME:
if (colNameStack.isEmpty())
colNameStack.add(parser.getText());
else
colNameStack.add(colNameStack.getLast() + "." + parser.getText());
break;
case VALUE_STRING:
if (colNameStack.isEmpty())
throw new LoadException("Ensure that the outer array contains JSON objects and not values directly.");
if (directlyInArray) {
int idx = nextArrayIndexStack.pollLast();
String colName = repeatedColNames.repeatedAtIndex(colNameStack.getLast(), idx);
handler.valueString(colName, parser);
nextArrayIndexStack.add(idx + 1);
} else
handler.valueString(colNameStack.pollLast(), parser);
break;
case VALUE_NUMBER_INT:
if (colNameStack.isEmpty())
throw new LoadException("Ensure that the outer array contains JSON objects and not values directly.");
if (directlyInArray) {
int idx = nextArrayIndexStack.pollLast();
String colName = repeatedColNames.repeatedAtIndex(colNameStack.getLast(), idx);
handler.valueLong(colName, parser);
nextArrayIndexStack.add(idx + 1);
} else
handler.valueLong(colNameStack.pollLast(), parser);
break;
case VALUE_NUMBER_FLOAT:
if (colNameStack.isEmpty())
throw new LoadException("Ensure that the outer array contains JSON objects and not values directly.");
if (directlyInArray) {
int idx = nextArrayIndexStack.pollLast();
String colName = repeatedColNames.repeatedAtIndex(colNameStack.getLast(), idx);
handler.valueDouble(colName, parser);
nextArrayIndexStack.add(idx + 1);
} else
handler.valueDouble(colNameStack.pollLast(), parser);
break;
case VALUE_TRUE:
case VALUE_FALSE:
if (colNameStack.isEmpty())
throw new LoadException("Ensure that the outer array contains JSON objects and not values directly.");
if (directlyInArray) {
int idx = nextArrayIndexStack.pollLast();
String colName = repeatedColNames.repeatedAtIndex(colNameStack.getLast(), idx);
handler.valueLong(colName, parser);
nextArrayIndexStack.add(idx + 1);
} else
handler.valueLong(colNameStack.pollLast(), parser);
break;
case VALUE_NULL:
// TODO #14 support null
break;
case NOT_AVAILABLE:
case VALUE_EMBEDDED_OBJECT:
// noop.
break;
}
}
} catch (IOException e) {
throw new LoadException("Could not parse column names from JSON: " + e.getMessage(), e);
} finally {
if (parser != null)
try {
parser.close();
} catch (IOException e) {
// swallow.
}
}
}
/**
* Externally specified Handler for handling events while parsing.
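 *
 * <p>
 * A minimal sketch of a Handler that only counts top level objects (rows), using a hypothetical
 * <code>rowCount</code> field of the surrounding class:
 *
 * <pre>{@code
 * Handler countingHandler = new Handler() {
 *   @Override
 *   public boolean isArray(String colName) {
 *     return false; // assuming the input contains no repeated fields
 *   }
 *
 *   @Override
 *   public void topLevelObjectEnd(Long pos) {
 *     rowCount++;
 *   }
 * };
 * }</pre>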
*/
public abstract static class Handler {
/**
 * Has to return <code>true</code> if the given colName is known to be a repeated field/array.
 *
 * If the column is actually repeated, this method will only be called after the corresponding call to
 * {@link #startArray(String)}.
 */
public abstract boolean isArray(String colName) throws LoadException;
/**
* The start of an array with the given colName was parsed.
*
* This will not be called for the outermost array, as that isn't of interest to us.
*/
public void startArray(String colName) throws LoadException {
}
/**
 * The end of an array with the given colName was parsed; length is the number of elements that were identified.
*/
public void endArray(String colName, int length) throws LoadException {
}
/**
* A new top level object starts at the given byte index.
*/
public void topLevelObjectStart(Long pos) throws LoadException {
}
/**
 * A top level object ends at the given byte index. This will be called after the corresponding call to
 * {@link #topLevelObjectStart(Long)} - there won't be any other calls to
 * {@link #topLevelObjectStart(Long)} in between.
*/
public void topLevelObjectEnd(Long pos) throws LoadException {
}
/**
* A String type value column has been found. If one needs to resolve the actual value, use the provided
* {@link JsonParser}.
*/
public void valueString(String colName, JsonParser parser) throws LoadException {
}
/**
* A Long type value column has been found. If one needs to resolve the actual value, use the provided
* {@link JsonParser}.
*/
public void valueLong(String colName, JsonParser parser) throws LoadException {
}
/**
* A Double type value column has been found. If one needs to resolve the actual value, use the provided
* {@link JsonParser}.
*/
public void valueDouble(String colName, JsonParser parser) throws LoadException {
}
}
}
/**
 * A {@link Spliterator} over the top level objects of a JSON input, which can be used to construct a parallel
 * {@link Stream} on that input.
 *
 * <p>
 * This spliterator is based on the result of
 * {@link JsonLoader#findColumnInfo(JsonFactory, BigByteBuffer, Map, Set, NavigableMap)}, namely the object location
 * map.
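 *
 * <p>
 * Typical usage (mirroring {@link JsonLoader#load(long, BigByteBuffer, String, LoaderColumnInfo)}):
 *
 * <pre>{@code
 * Stream<Parser> parsers = StreamSupport.stream(
 *     new JsonSpliterator(objectLocations, factory, repeatedColNames, jsonBuffer,
 *         objectLocations.firstKey(), objectLocations.lastEntry().getValue() + 1),
 *     true); // true = parallel stream
 * }</pre>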
*/
public static class JsonSpliterator implements Spliterator<Parser> {
private NavigableMap<Long, Long> objectLocations;
private Long startPosInclusive;
private Long endPosExclusive;
private JsonFactory factory;
private BigByteBuffer jsonBuffer;
private RepeatedColumnNameGenerator repeatedColNames;
/**
*
* @param objectLocations
 * The object locations as identified by
 * {@link JsonLoader#findColumnInfo(JsonFactory, BigByteBuffer, Map, Set, NavigableMap)}. Each entry in the
 * map specifies the byte indices of one top level object (which will be parsed into one table row). The
 * key of the entry specifies the first byte index of the object in the JSON buffer, the value specifies
 * the last index.
* @param factory
* The {@link JsonFactory} that can be used by the {@link Parser}.
* @param jsonBuffer
* The raw JSON input.
* @param startPosInclusive
* First index of a valid top level object in the buffer that this spliterator should cover (e.g. the
* firstKey() of objectLocations).
* @param endPosExclusive
 * An index in the buffer that is the first index this spliterator does not cover anymore.
*/
public JsonSpliterator(NavigableMap<Long, Long> objectLocations, JsonFactory factory,
RepeatedColumnNameGenerator repeatedColNames, BigByteBuffer jsonBuffer, Long startPosInclusive,
Long endPosExclusive) {
this.objectLocations = objectLocations;
this.repeatedColNames = repeatedColNames;
this.startPosInclusive = startPosInclusive;
this.endPosExclusive = endPosExclusive;
this.factory = factory;
this.jsonBuffer = jsonBuffer;
}
@Override
public boolean tryAdvance(Consumer<? super Parser> action) {
if (startPosInclusive + 1 >= endPosExclusive)
return false;
long topLevelObjectStart = startPosInclusive;
long topLevelObjectEnd = objectLocations.get(startPosInclusive);
action.accept(new Parser(factory, repeatedColNames,
jsonBuffer.createPartialInputStream(topLevelObjectStart, topLevelObjectEnd)));
startPosInclusive = objectLocations.ceilingKey(startPosInclusive + 1);
if (startPosInclusive == null)
startPosInclusive = endPosExclusive;
return true;
}
@Override
public Spliterator<Parser> trySplit() {
SortedMap<Long, Long> subMap = objectLocations.subMap(startPosInclusive, endPosExclusive);
if (subMap.size() <= 2)
return null;
Long middle = subMap.firstKey() + ((subMap.lastKey() - subMap.firstKey()) / 2);
Long middleKey = objectLocations.ceilingKey(middle);
if (middleKey == null || middleKey >= endPosExclusive || middleKey == startPosInclusive)
return null;
if (objectLocations.subMap(middleKey, endPosExclusive).isEmpty())
return null;
JsonSpliterator newSplit =
new JsonSpliterator(objectLocations, factory, repeatedColNames, jsonBuffer, middleKey, endPosExclusive);
endPosExclusive = middleKey;
return newSplit;
}
@Override
public long estimateSize() {
return objectLocations.subMap(startPosInclusive, endPosExclusive).size();
}
@Override
public int characteristics() {
return Spliterator.DISTINCT | Spliterator.SIZED | Spliterator.NONNULL | Spliterator.IMMUTABLE
| Spliterator.SUBSIZED;
}
}
}