/*
* Copyright 2004-2010 Information & Software Engineering Group (188/1)
* Institute of Software Technology and Interactive Systems
* Vienna University of Technology, Austria
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.ifs.tuwien.ac.at/dm/somtoolbox/license.html
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package at.tuwien.ifs.somtoolbox.apps.helper;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Arrays;
import java.util.Random;
import java.util.logging.Level;
import java.util.logging.Logger;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import at.tuwien.ifs.somtoolbox.apps.SOMToolboxApp;
import at.tuwien.ifs.somtoolbox.apps.config.AbstractOptionFactory;
import at.tuwien.ifs.somtoolbox.apps.config.OptionFactory;
import at.tuwien.ifs.somtoolbox.data.InputData;
import at.tuwien.ifs.somtoolbox.data.InputDataWriter;
import at.tuwien.ifs.somtoolbox.data.InputDatum;
import at.tuwien.ifs.somtoolbox.data.SOMLibSparseInputData;
import at.tuwien.ifs.somtoolbox.data.SOMLibTemplateVector;
import at.tuwien.ifs.somtoolbox.data.TemplateVector;
import at.tuwien.ifs.somtoolbox.util.ArrayUtils;
import at.tuwien.ifs.somtoolbox.util.FileUtils;
/**
* Randomises a given data set (consisting of an {@link InputData} vector, and optionally a {@link TemplateVector}). The
* data set is randomised by randomly swapping the order of columns (attributes), and/or rows (vectors).
*
* @author Rudolf Mayer
* @version $Id: DatasetRandomiser.java 3668 2010-07-15 09:03:06Z frank $
*/
public class DatasetRandomiser implements SOMToolboxApp {

    /** @see SOMToolboxApp */
    public static final String DESCRIPTION = "Randomises data sets";

    /** @see SOMToolboxApp */
    public static final String LONG_DESCRIPTION = "Randomises data sets by swapping the order of columns (features/attributes) and/or rows (vectors)";

    /** @see SOMToolboxApp */
    public static final Parameter[] OPTIONS = { OptionFactory.getOptInputVectorFile(true),
            OptionFactory.getOptTemplateVectorFile(false), OptionFactory.getOptNumberVariants(false, 1),
            OptionFactory.getOptInterleave(false, 1), OptionFactory.getOptStartIndex(false, 1),
            OptionFactory.getSwitchPreserveFeatureOrder(), OptionFactory.getSwitchPreserveVectorOrder(),
            OptionFactory.getOptGZip(false, true), OptionFactory.getOptOutputFileName(true) };

    public static final Type APPLICATION_TYPE = Type.Helper;

    /** Shared logger, looked up once instead of on every log statement. */
    private static final Logger LOGGER = Logger.getLogger("at.tuwien.ifs.somtoolbox");

    /**
     * Seed used for every call to {@link #randomise(int[])}. Kept constant so that repeated runs produce the same
     * output files.
     */
    private static final long RANDOM_SEED = 7;

    /**
     * Reads the input data, then writes the requested number of randomised variants of it.
     * <p>
     * The column and row orders are created once and carried over from one variant to the next, so each variant is
     * a further randomisation of the previous one. If exactly one variant is requested the output name is used
     * as-is; otherwise each variant gets the suffix <code>_(i * interleave + startIndex)</code>.
     *
     * @param args the command line arguments, parsed against {@link #OPTIONS}
     * @throws IOException if reading the input or writing one of the output files fails
     */
    public static void main(String[] args) throws IOException {
        JSAPResult options = AbstractOptionFactory.parseResults(args, OPTIONS);
        String inputVectorFile = options.getString("inputVectorFile");
        String templateVectorFile = options.getString("templateVectorFile");
        String outputFile = options.getString("output");
        boolean preserveFeatureOrder = options.getBoolean("preserveFeatureOrder", false);
        boolean preserveVectorOrder = options.getBoolean("preserveVectorOrder", false);
        boolean gzip = options.getBoolean("gzip", false);
        int numberVariations = options.getInt("variants");
        int interleave = options.getInt("interleave");
        int startIndex = options.getInt("startIndex");

        InputData inputData = new SOMLibSparseInputData(inputVectorFile, templateVectorFile);

        // both orders start out as the identity permutation and are only shuffled when not preserved
        int[] columnOrder = ArrayUtils.getLinearArray(inputData.dim());
        int[] rowOrder = ArrayUtils.getLinearArray(inputData.numVectors());

        for (int i = 0; i < numberVariations; i++) {
            if (!preserveFeatureOrder) {
                randomise(columnOrder);
            }
            if (!preserveVectorOrder) {
                randomise(rowOrder);
            }
            String fileName = numberVariations == 1 ? outputFile : outputFile + "_" + (i * interleave + startIndex);
            LOGGER.info("Writing dataset '" + fileName + "'.");
            writeToFile(fileName, inputData, columnOrder, rowOrder, gzip);
        }
    }

    /**
     * Randomises the given array in place by performing <code>array.length * 2 / 3</code> swaps of two randomly
     * chosen positions.
     * <p>
     * The random generator is re-created with the same fixed seed on every call, so two arrays of the same length
     * receive the same sequence of swaps. If both drawn positions coincide, the target is drawn once more (it may
     * still coincide, in which case the swap is a no-op).
     * <p>
     * NOTE(review): this is not a uniform shuffle (e.g. Fisher-Yates); the algorithm is left unchanged here so that
     * existing outputs stay reproducible.
     *
     * @param array the array to randomise; modified in place
     */
    private static void randomise(int[] array) {
        if (LOGGER.isLoggable(Level.FINE)) {
            LOGGER.fine("Original order: " + Arrays.toString(array));
        }
        final Random random = new Random(RANDOM_SEED);
        final int swaps = array.length * 2 / 3;
        // Building the full array string on every swap is quadratic; only do so when it will actually be logged.
        final boolean traceSwaps = LOGGER.isLoggable(Level.FINER);

        for (int i = 0; i < swaps; i++) {
            int source = random.nextInt(array.length);
            int target = random.nextInt(array.length);
            if (target == source) {
                target = random.nextInt(array.length);
            }
            if (traceSwaps) {
                LOGGER.finer("Swapping " + (i + 1) + "/" + swaps + ": " + source + " <==> " + target);
            }
            int temp = array[source];
            array[source] = array[target];
            array[target] = temp;
            if (traceSwaps) {
                LOGGER.finer("Intermediate randomised order: " + Arrays.toString(array));
            }
        }
        LOGGER.info("Randomised order: " + ArrayUtils.toString(array, 100));
    }

    /**
     * Writes one randomised variant of the data set: the input vector file (<code>.vec</code>), the matching
     * template vector file (<code>.tv</code>) and a template vector file with generic feature names
     * (<code>.generic.tv</code>).
     *
     * @param fileName the output file name without extension
     * @param inputData the data set to write
     * @param columnOrder for each output column, the index of the source column
     * @param rowOrder for each output row, the index of the source vector
     * @param gzip passed on when opening the input vector file; the template vector file is always opened with
     *            <code>false</code>
     * @throws IOException if one of the files cannot be opened or written
     */
    private static void writeToFile(String fileName, InputData inputData, int[] columnOrder, int[] rowOrder,
            boolean gzip) throws IOException {
        writeVectorFile(fileName, inputData, columnOrder, rowOrder, gzip);
        writeTemplateVectorFile(fileName, inputData, columnOrder);
        // write the randomised template vector file, with generic names for the features
        InputDataWriter.writeAsSOMLib(new SOMLibTemplateVector(inputData.numVectors(), inputData.dim()), fileName
                + ".generic.tv");
    }

    /** Writes the randomised input vector file, closing the writer even if writing fails. */
    private static void writeVectorFile(String fileName, InputData inputData, int[] columnOrder, int[] rowOrder,
            boolean gzip) throws IOException {
        PrintWriter writer = FileUtils.openFileForWriting("Input vector file", fileName + ".vec", gzip);
        try {
            InputDataWriter.writeHeaderToFile(writer, inputData.numVectors(), inputData.dim());
            for (int i = 0; i < inputData.numVectors(); i++) {
                InputDatum inputDatum = inputData.getInputDatum(rowOrder[i]);
                writeInputDatumToFile(writer, inputDatum, columnOrder);
            }
            failOnWriteError(writer, fileName + ".vec");
        } finally {
            writer.close();
        }
    }

    /** Writes the randomised template vector file, closing the writer even if writing fails. */
    private static void writeTemplateVectorFile(String fileName, InputData inputData, int[] columnOrder)
            throws IOException {
        PrintWriter writer = FileUtils.openFileForWriting("Template vector file", fileName + ".tv", false);
        try {
            // NOTE(review): assumes templateVector() is non-null even if no template vector file was given - confirm
            TemplateVector tv = inputData.templateVector();
            InputDataWriter.writeTempplateHeaderToFile(writer, fileName, tv.numVectors(), tv.dim(), tv.numinfo());
            for (int i = 0; i < columnOrder.length; i++) {
                // elements are re-numbered consecutively in their new order
                InputDataWriter.writeElementToFile(writer, i, tv.getElement(columnOrder[i]));
            }
            failOnWriteError(writer, fileName + ".tv");
        } finally {
            writer.close();
        }
    }

    /**
     * Flushes the writer and turns a pending write error into an exception. A {@link PrintWriter} never throws on
     * write, so without this check a failed write would go unnoticed and leave a truncated file behind.
     */
    private static void failOnWriteError(PrintWriter writer, String path) throws IOException {
        if (writer.checkError()) {
            throw new IOException("Error while writing '" + path + "'.");
        }
    }

    /**
     * Writes a single vector as one line: the values in the given column order, each followed by a space, then the
     * label. <code>NaN</code> values are written as <code>?</code>.
     *
     * @param writer the writer to append the line to
     * @param inputDatum the vector to write
     * @param columnOrder for each output column, the index of the source column
     */
    private static void writeInputDatumToFile(PrintWriter writer, InputDatum inputDatum, int[] columnOrder) {
        for (int element : columnOrder) {
            final double v = inputDatum.getVector().get(element);
            if (!Double.isNaN(v)) {
                writer.write(v + " ");
            } else {
                writer.write("? ");
            }
        }
        writer.println(inputDatum.getLabel());
    }
}