/**
 * diqube: Distributed Query Base.
 *
 * Copyright (C) 2015 Bastian Gloeckle
 *
 * This file is part of diqube.
 *
 * diqube is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package org.diqube.loader;

import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.channels.FileChannel.MapMode;
import java.util.Arrays;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import java.util.Spliterator;
import java.util.concurrent.ConcurrentLinkedDeque;
import java.util.function.Consumer;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

import javax.inject.Inject;

import org.diqube.context.AutoInstatiate;
import org.diqube.data.column.StandardColumnShard;
import org.diqube.data.table.TableFactory;
import org.diqube.data.table.TableShard;
import org.diqube.loader.columnshard.ColumnShardBuilderFactory;
import org.diqube.loader.columnshard.ColumnShardBuilderManager;
import org.diqube.loader.util.ParallelLoadAndTransposeHelper;
import org.diqube.threads.ExecutorManager;
import org.diqube.util.BigByteBuffer;
import org.diqube.util.HashingBatchCollector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.opencsv.CSVParser;

/**
 * Simple {@link Loader} that loads CSV files.
 *
 * <p>
 * This loader does not support hierarchical data.
 *
 * <p>
 * This loader will return only one TableShard for a whole CSV input file.
 *
 * @author Bastian Gloeckle
 */
@AutoInstatiate
public class CsvLoader implements Loader {
  private static final Logger logger = LoggerFactory.getLogger(CsvLoader.class);

  /**
   * The rows of the CSV are parsed and loaded into memory in a batched format. Each batch/buffer contains approx.
   * this amount of entries.
   */
  private static final int COLUMN_BUFFER_SIZE = 1_000;

  @Inject
  private ColumnShardBuilderFactory columnShardBuilderManagerFactory;

  @Inject
  private TableFactory tableFactory;

  @Inject
  private ExecutorManager executorManager;

  @Override
  public Collection<TableShard> load(long firstRowId, String filename, String tableName, LoaderColumnInfo columnInfo)
      throws LoadException {
    ColumnShardBuilderManager columnManager;

    logger.info("Reading data for new table '{}' from '{}'.", tableName, filename);

    try (RandomAccessFile f = new RandomAccessFile(filename, "r")) {
      BigByteBuffer buf = new BigByteBuffer(f.getChannel(), MapMode.READ_ONLY, b -> b.load());
      columnManager = readColumnData(firstRowId, buf, tableName, columnInfo);
      // close file as soon as possible and free the ByteBuffer.
      buf = null;
    } catch (IOException e) {
      throw new LoadException("Could not load " + filename, e);
    }

    return createTableShard(columnManager, tableName);
  }
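  // Minimal usage sketch (illustrative only; the path, table name and "columnInfo" below are made-up
  // placeholders, and in diqube an instance of this class would normally be obtained from the DI
  // context via @AutoInstatiate/@Inject rather than constructed manually):
  //
  //   CsvLoader loader = ...; // injected
  //   LoaderColumnInfo columnInfo = ...; // declares the data type of each CSV column
  //   Collection<TableShard> shards = loader.load(0L, "/path/to/data.csv", "myTable", columnInfo);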
  @Override
  public Collection<TableShard> load(long firstRowId, BigByteBuffer csvBuffer, String tableName,
      LoaderColumnInfo columnInfo) throws LoadException {
    ColumnShardBuilderManager columnManager = readColumnData(firstRowId, csvBuffer, tableName, columnInfo);
    return createTableShard(columnManager, tableName);
  }

  /**
   * Reads all data from the CSV that is provided in a {@link BigByteBuffer} and returns a
   * {@link ColumnShardBuilderManager} that is ready for building the columns.
   *
   * @param firstRowId
   *          The first rowId to be used.
   * @param buf
   *          The input buffer, containing CSV data.
   * @param tableName
   *          The name of the resulting table.
   * @param columnInfo
   *          Information about each column that this CSV contains.
   *
   * @return A {@link ColumnShardBuilderManager} that has all the data of all the columns of the CSV already added to
   *         it. It is ready for building the columns using {@link ColumnShardBuilderManager#buildAndFree(String)}.
   * @throws LoadException
   *           If something cannot be loaded.
   */
  private ColumnShardBuilderManager readColumnData(long firstRowId, BigByteBuffer buf, String tableName,
      LoaderColumnInfo columnInfo) throws LoadException {
    String[] header;
    ColumnShardBuilderManager columnBuilderManager =
        columnShardBuilderManagerFactory.createColumnShardBuilderManager(columnInfo, firstRowId);

    // Read CSV header to learn of the columns that we need to import.
    int numChars = 0;
    while (numChars < buf.size() && buf.get(numChars) != '\n')
      numChars++;
    if (numChars >= buf.size())
      throw new LoadException("Could not identify CSV header.");

    byte[] b = new byte[numChars];
    buf.get(0, b, 0, numChars);
    try {
      header = new CSVParser().parseLine(new String(b));
    } catch (IOException e) {
      throw new LoadException("Could not parse CSV header.", e);
    }

    // TODO #16 do auto-recognition of data types of columns (or make it explicitly "enable/disable" in .control file).
    // TODO #17 validate column names
    logger.info("New table '{}' contains {} columns, reading columnar data.", tableName, header.length);

    // Initialize the input stream: start right after the header line and seed the line-length estimate
    // with the header's length.
    Stream<String> stream =
        StreamSupport.stream(new LineSpliterator(buf, numChars + 1, buf.size(), numChars, 1), true);

    ParallelLoadAndTransposeHelper transposer =
        new ParallelLoadAndTransposeHelper(executorManager, columnInfo, columnBuilderManager, header, tableName);

    transposer.transpose(firstRowId, new Consumer<ConcurrentLinkedDeque<String[][]>>() {
      @Override
      public void accept(ConcurrentLinkedDeque<String[][]> rowWiseTarget) {
        // Start parsing CSV lines in parallel, bucketing the results into the rowWiseTarget deque from where they
        // will be fetched by the transposer.
        // Arrays are non-colliding, so using HashingBatchCollector is fine.
        stream.parallel().map(CsvLoader::parseCsvLine) //
            .collect(new HashingBatchCollector<String[]>( //
                COLUMN_BUFFER_SIZE, // Try to make buckets of this size
                (len) -> new String[len][], // Factory implementation on how to create a new result object.
                a -> rowWiseTarget.add(a)) // When there is a new result, put it into rowWiseTarget.
            );
      }
    });

    return columnBuilderManager;
  }
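  // Illustration of the row-to-column data flow implemented above (values are made up):
  //
  //   CSV input:        "a,b\n1,x\n2,y\n"
  //   parsed rows:      { "1", "x" }, { "2", "y" }          (one String[] per line)
  //   bucket:           { { "1", "x" }, { "2", "y" } }      (String[][] of up to COLUMN_BUFFER_SIZE rows)
  //   after transpose:  column "a" -> { "1", "2" }, column "b" -> { "x", "y" }
  //
  // The ParallelLoadAndTransposeHelper consumes the buckets from the deque, transposes them from
  // row-wise to column-wise and feeds the values into the ColumnShardBuilderManager, from which the
  // actual column shards are built afterwards.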
  /**
   * Takes a fully filled {@link ColumnShardBuilderManager} and creates a {@link TableShard} out of it.
   *
   * @param columnManager
   *          The {@link ColumnShardBuilderManager} that has the data of all columns to be created already filled in.
   * @param tableName
   *          Name of the result table.
   * @return The created {@link TableShard}.
   */
  private Collection<TableShard> createTableShard(ColumnShardBuilderManager columnManager, String tableName) {
    logger.info("Read data for new table shard for table {}. Compressing and creating final representation...",
        tableName);

    // Build the columns.
    List<StandardColumnShard> columns = new LinkedList<>();
    for (String colName : columnManager.getAllColumnsWithValues()) {
      StandardColumnShard columnShard = columnManager.buildAndFree(colName);
      columns.add(columnShard);
    }

    logger.info("Columns for new table shard of table {} created, creating table shard...", tableName);
    TableShard tableShard = tableFactory.createDefaultTableShard(tableName, columns);
    logger.info("Table shard for table {} created successfully.", tableName);

    return Arrays.asList(tableShard);
  }

  /**
   * Helper method that CSV-parses a single line.
   *
   * @param line
   *          The input line.
   * @return Parsed String values.
   */
  public static String[] parseCsvLine(String line) {
    try {
      return new CSVParser().parseLine(line);
    } catch (Exception e) {
      throw new RuntimeException("Could not parse CSV.", e);
    }
  }

  /**
   * A {@link Spliterator} that splits an input {@link BigByteBuffer} by line-ends (\n character).
   *
   * <p>
   * This Spliterator reports that the data is immutable, so the input buffer must not be changed while the
   * Spliterator is active.
   *
   * @author Bastian Gloeckle
   */
  private static class LineSpliterator implements Spliterator<String> {
    private BigByteBuffer buf;
    private long startPos;
    private long maxPos;
    /** Total byte length of all lines seen so far; together with {@link #sumLines} used to estimate the size. */
    private long sumLineLength;
    /** Number of lines seen so far. */
    private long sumLines;

    public LineSpliterator(BigByteBuffer buf, long startPos, long maxPos, long sumLineLength, long sumLines) {
      this.buf = buf;
      this.startPos = startPos;
      this.maxPos = maxPos;
      this.sumLineLength = sumLineLength;
      this.sumLines = sumLines;
    }

    @Override
    public boolean tryAdvance(Consumer<? super String> action) {
      long pos = startPos;
      while (pos < maxPos && buf.get(pos) != '\n')
        pos++;
      if (pos >= maxPos)
        // ignore last not-full line, as it might have been truncated.
        return false;
      if (pos - startPos > Integer.MAX_VALUE)
        throw new RuntimeException("Cannot load CSV because there's a line that is bigger than 2GB.");

      byte[] b = new byte[(int) (pos - startPos)];
      for (int j = 0; j < b.length; j++)
        b[j] = buf.get(j + startPos);
      action.accept(new String(b));

      startPos = pos + 1;
      sumLineLength += b.length;
      sumLines++;
      return true;
    }

    @Override
    public Spliterator<String> trySplit() {
      // Bisect the remaining range, then advance to the next line-end so both halves start/end on line boundaries.
      long middle = startPos + ((maxPos - startPos) >> 1);
      while (middle < maxPos && buf.get(middle) != '\n')
        middle++;
      if (middle >= maxPos)
        return null;

      LineSpliterator newSplit = new LineSpliterator(buf, middle + 1, maxPos, sumLineLength, sumLines);
      // Keep the '\n' at "middle" inside this spliterator's range, so the line ending there is still emitted
      // by tryAdvance (which requires the terminating '\n' to be at a position < maxPos).
      maxPos = middle + 1;
      return newSplit;
    }

    @Override
    public long estimateSize() {
      // Estimate the remaining line count as "remaining bytes / average line length seen so far".
      return (long) ((maxPos - startPos) / ((double) sumLineLength / sumLines));
    }

    @Override
    public int characteristics() {
      return Spliterator.IMMUTABLE | Spliterator.NONNULL;
    }
  }
}
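// Worked example of the LineSpliterator above (illustrative only): for buffer contents
// "row1\nrow2\nrow3\n" (15 bytes, '\n' at indices 4, 9 and 14) and a spliterator over [0, 15),
// trySplit() probes the middle (index 7) and advances to the next '\n' (index 9): the left half
// then covers [0, 10) and yields "row1" and "row2", while the new right half covers [10, 15) and
// yields "row3". estimateSize() approximates the remaining line count as the remaining bytes
// divided by the average line length observed so far.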