/**
* diqube: Distributed Query Base.
*
* Copyright (C) 2015 Bastian Gloeckle
*
* This file is part of diqube.
*
* diqube is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package org.diqube.tool.transpose;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;

import org.diqube.context.Profiles;
import org.diqube.data.column.ColumnType;
import org.diqube.data.serialize.DataSerialization;
import org.diqube.data.serialize.DataSerializer.ObjectDoneConsumer;
import org.diqube.data.serialize.SerializationException;
import org.diqube.data.table.TableShard;
import org.diqube.file.DiqubeFileFactory;
import org.diqube.file.DiqubeFileWriter;
import org.diqube.loader.LoadException;
import org.diqube.loader.Loader;
import org.diqube.loader.LoaderColumnInfo;
import org.diqube.util.NullUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.context.annotation.AnnotationConfigApplicationContext;

import com.google.common.collect.Iterables;

/**
 * Implements transposing an input file into its .diqube representation.
*
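 * <p>
 * Example usage (a sketch; the file names are hypothetical, the loader is e.g. the JsonLoader):
 *
 * <pre>
 * new TransposeImplementation(new File("data.json"), new File("data.diqube"), null, JsonLoader.class).transpose();
 * </pre>
 *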
* @author Bastian Gloeckle
*/
public class TransposeImplementation {
private static final Logger logger = LoggerFactory.getLogger(TransposeImplementation.class);
  private static final String TABLE_NAME = "TransposeImportTable";

private File inputFile;
private File outputFile;
private File colInfoFile;
  private Class<? extends Loader> loaderClass;

  /**
   * @param inputFile
   *          the input file whose data should be transposed.
   * @param outputFile
   *          the .diqube file the transposed data is written to.
   * @param colInfoFile
   *          optional properties file containing column type information, can be <code>null</code>.
   * @param loaderClass
   *          the {@link Loader} implementation to use for reading the input file.
   */
public TransposeImplementation(File inputFile, File outputFile, File colInfoFile,
Class<? extends Loader> loaderClass) {
this.inputFile = inputFile;
this.outputFile = outputFile;
this.colInfoFile = colInfoFile;
this.loaderClass = loaderClass;
}
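
  /**
   * Executes the transposition: loads the input file into a single temporary in-memory {@link TableShard} and
   * serializes that shard into the output .diqube file.
   */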
public void transpose() {
logger.info("Starting diqube context...");
try (AnnotationConfigApplicationContext ctx = new AnnotationConfigApplicationContext()) {
ctx.getEnvironment().setActiveProfiles(Profiles.CONFIG, Profiles.TOOL);
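      // Scan all diqube packages for beans; we fetch the DiqubeFileFactory and the requested Loader from the context.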
ctx.scan("org.diqube");
ctx.refresh();
DiqubeFileFactory fileFactory = ctx.getBean(DiqubeFileFactory.class);
Loader loader = ctx.getBean(loaderClass);
      // For JSON it does not matter what we use here, as the JsonLoader will detect the column types automatically.
LoaderColumnInfo colInfo = new LoaderColumnInfo(ColumnType.LONG);
if (colInfoFile != null)
colInfo = loadColInfo(colInfoFile);
else
logger.info("Using column info with default column type Long.");
try (FileOutputStream outStream = new FileOutputStream(outputFile)) {
logger.info("Starting to load data into temporary in-memory table '{}'", TABLE_NAME);
        // The loader is either the CSV or the JSON one; both return a single TableShard.
TableShard tableShard =
Iterables.getOnlyElement(loader.load(0L, inputFile.getAbsolutePath(), TABLE_NAME, colInfo));
logger.info("Data loaded into in-memory table '{}', starting to serialize that data into output file '{}'",
TABLE_NAME, outputFile.getAbsolutePath());
try (DiqubeFileWriter writer = fileFactory.createDiqubeFileWriter(outStream)) {
writer.writeTableShard(tableShard, new ObjectDoneConsumer() {
@Override
public void accept(DataSerialization<?> t) {
            // Right after we are done serializing an object, we "null" all its properties to try to free up some
            // memory early.
            NullUtil.setAllPropertiesToNull(t,
                // Just log on exception; it is not too bad if a single field cannot be nulled.
                (fieldToNull, e) -> logger.trace("Could not null {} on {}", fieldToNull, e));
}
});
}
logger.info("Successfully serialized data to '{}'", outputFile.getAbsolutePath());
} catch (IOException | LoadException | SerializationException e) {
logger.error("Could not proceed.", e);
return;
}
}
logger.info("Done.");
}
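
  /**
   * Loads a {@link LoaderColumnInfo} from a Java properties file mapping column names to {@link ColumnType} names,
   * where the special key "*" sets the default column type. A sketch of such a file (the column names are
   * illustrative):
   *
   * <pre>
   * * = long
   * someStringCol = string
   * </pre>
   *
   * @throws RuntimeException
   *           if the file cannot be read or contains an invalid column type.
   */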
private LoaderColumnInfo loadColInfo(File colInfoFile) throws RuntimeException {
Properties prop = new Properties();
    try (InputStream is = new FileInputStream(colInfoFile)) {
      logger.info("Loading column type info from '{}'", colInfoFile.getAbsolutePath());
      prop.load(new InputStreamReader(is, StandardCharsets.UTF_8));
Map<String, ColumnType> colTypes = new HashMap<>();
ColumnType defaultColType = ColumnType.LONG;
      for (String colName : prop.stringPropertyNames()) {
        if ("*".equals(colName))
          defaultColType = resolveColumnType(prop.getProperty(colName));
        else
          colTypes.put(colName, resolveColumnType(prop.getProperty(colName)));
      }
LoaderColumnInfo res = new LoaderColumnInfo(defaultColType);
for (Entry<String, ColumnType> e : colTypes.entrySet())
res.registerColumnType(e.getKey(), e.getValue());
logger.info("Using column information with default column type '{}' and specific column types: {}",
defaultColType, colTypes);
return res;
} catch (IOException e) {
throw new RuntimeException("Could not read Column Info file", e);
}
}
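
  /**
   * Resolves a string from the column info file into a {@link ColumnType}, ignoring case (e.g. "long" resolves to
   * {@link ColumnType#LONG}).
   */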
private ColumnType resolveColumnType(String controlFileString) throws RuntimeException {
try {
return ColumnType.valueOf(controlFileString.toUpperCase());
} catch (RuntimeException e) {
      throw new RuntimeException(controlFileString + " is not a valid ColumnType.", e);
}
}
}