/*
* This file is part of ELKI:
* Environment for Developing KDD-Applications Supported by Index-Structures
*
* Copyright (C) 2017
* ELKI Development Team
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package de.lmu.ifi.dbs.elki.datasource;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta;
import de.lmu.ifi.dbs.elki.datasource.bundle.BundleStreamSource;
import de.lmu.ifi.dbs.elki.datasource.bundle.BundleStreamSource.Event;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
import de.lmu.ifi.dbs.elki.datasource.filter.ObjectFilter;
import de.lmu.ifi.dbs.elki.datasource.parser.NumberVectorLabelParser;
import de.lmu.ifi.dbs.elki.datasource.parser.Parser;
import de.lmu.ifi.dbs.elki.datasource.parser.StreamingParser;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.io.FileUtil;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.FileListParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.FileListParameter.FilesType;
/**
* Database that will loading multiple files, concatenating the results.
*
* @author Erich Schubert
* @since 0.5.0
*/
public class ConcatenateFilesDatabaseConnection extends AbstractDatabaseConnection {
/**
* Class logger.
*/
private static final Logging LOG = Logging.getLogger(ConcatenateFilesDatabaseConnection.class);
/**
* Input file list.
*/
private List<File> files;
/**
* The parser.
*/
private Parser parser;
/**
* Constructor.
*
* @param files Input files
* @param parser Parser
* @param filters Filters
*/
public ConcatenateFilesDatabaseConnection(List<File> files, Parser parser, List<ObjectFilter> filters) {
super(filters);
this.files = files;
this.parser = parser;
}
@Override
public MultipleObjectsBundle loadData() {
MultipleObjectsBundle objects = new MultipleObjectsBundle();
objects.appendColumn(TypeUtil.STRING, new ArrayList<>());
for(File file : files) {
String filestr = file.getPath();
try (InputStream inputStream = FileUtil.tryGzipInput(//
new BufferedInputStream(new FileInputStream(file)))) {
final BundleStreamSource source;
if(parser instanceof StreamingParser) {
final StreamingParser streamParser = (StreamingParser) parser;
streamParser.initStream(inputStream);
source = streamParser;
}
else {
MultipleObjectsBundle parsingResult = parser.parse(inputStream);
// normalize objects and transform labels
source = parsingResult.asStream();
}
BundleMeta meta = null; // NullPointerException on invalid streams
loop: for(Event e = source.nextEvent();; e = source.nextEvent()) {
switch(e){
case END_OF_STREAM:
break loop;
case META_CHANGED:
meta = source.getMeta();
for(int i = 0; i < meta.size(); i++) {
if(i + 1 >= objects.metaLength()) {
objects.appendColumn(meta.get(i), new ArrayList<>());
}
else {
// Ensure compatibility:
if(!objects.meta(i + 1).isAssignableFromType(meta.get(i))) {
throw new AbortException("Incompatible files loaded. Cannot concatenate with unaligned columns, please preprocess manually.");
}
}
}
break; // switch
case NEXT_OBJECT:
Object[] o = new Object[objects.metaLength()];
o[0] = filestr;
for(int i = 0; i < meta.size(); i++) {
o[i + 1] = source.data(i);
}
objects.appendSimple(o);
break; // switch
}
}
}
catch(IOException e) {
throw new AbortException("Loading file " + filestr + " failed: " + e.toString(), e);
}
}
parser.cleanup();
// Invoke filters
if(LOG.isDebugging()) {
LOG.debugFine("Invoking filters.");
}
return invokeBundleFilters(objects);
}
@Override
protected Logging getLogger() {
return LOG;
}
/**
* Parameterization class.
*
* @author Erich Schubert
*
* @apiviz.exclude
*/
public static class Parameterizer extends AbstractDatabaseConnection.Parameterizer {
/**
* The input files.
*/
private List<File> files;
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
FileListParameter filesP = new FileListParameter(FileBasedDatabaseConnection.Parameterizer.INPUT_ID, FilesType.INPUT_FILES);
if(config.grab(filesP)) {
files = filesP.getValue();
}
configFilters(config);
configParser(config, Parser.class, NumberVectorLabelParser.class);
}
@Override
protected ConcatenateFilesDatabaseConnection makeInstance() {
return new ConcatenateFilesDatabaseConnection(files, parser, filters);
}
}
}