/* * Copyright 2004-2010 Information & Software Engineering Group (188/1) * Institute of Software Technology and Interactive Systems * Vienna University of Technology, Austria * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.ifs.tuwien.ac.at/dm/somtoolbox/license.html * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package at.tuwien.ifs.somtoolbox.apps.helper; import java.io.File; import java.io.IOException; import java.io.PrintWriter; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.LinkedHashSet; import java.util.Map; import cern.colt.matrix.DoubleMatrix1D; import cern.colt.matrix.impl.SparseDoubleMatrix1D; import com.martiansoftware.jsap.JSAPResult; import com.martiansoftware.jsap.Parameter; import at.tuwien.ifs.somtoolbox.apps.SOMToolboxApp; import at.tuwien.ifs.somtoolbox.apps.config.OptionFactory; import at.tuwien.ifs.somtoolbox.data.InputData; import at.tuwien.ifs.somtoolbox.data.InputDataWriter; import at.tuwien.ifs.somtoolbox.data.InputDatum; import at.tuwien.ifs.somtoolbox.data.SOMLibClassInformation; import at.tuwien.ifs.somtoolbox.data.SOMLibSparseInputData; import at.tuwien.ifs.somtoolbox.data.SOMLibTemplateVector; import at.tuwien.ifs.somtoolbox.data.TemplateVector; import at.tuwien.ifs.somtoolbox.input.SOMLibFileFormatException; import at.tuwien.ifs.somtoolbox.properties.PropertiesException; import at.tuwien.ifs.somtoolbox.properties.SOMProperties; import at.tuwien.ifs.somtoolbox.util.ElementCounter; import at.tuwien.ifs.somtoolbox.util.FileUtils; import at.tuwien.ifs.somtoolbox.util.VectorTools; /** * Merges two or more SOMLib Input files, i.e. vector and template files. Template vectors can be off different * dimensionality, and may contain different features, but some features may also be overlapping. Different merge * strategies are available: union of all features sets, intersection of features sets, and strategies in between, * retaining a feature if it appears in at least x sets. This class is a bit similar to {@link VectorFileMerger}, but * more generic in the number of inputs it can take, and in the merging strategies, though it doesn't support the * weighting strategies implemented in {@link VectorFileMerger}. * * @author Rudolf Mayer * @version $Id: SOMLibInputMerger.java 3956 2010-11-29 15:33:45Z frank $ */ // FIXME: merge this with VectorFileMerger at some point public class SOMLibInputMerger implements SOMToolboxApp { public static final Parameter[] OPTIONS = new Parameter[] { OptionFactory.getOptMergeMode(), OptionFactory.getOptInputDirectory(false), OptionFactory.getOptOutputFileName(true), OptionFactory.getOptSOMLibInputs(false) }; public static final String DESCRIPTION = "Merges two or more SOMLib Input files, i.e. vector and template files"; public static final String LONG_DESCRIPTION = DESCRIPTION + ". Template vectors can be off different dimensionality, and may contain different features, but some features may also be overlapping." + " Different merge strategies are available: union of all features sets, intersection of features sets, and strategies in between, retaining a feature if it appears in at least x sets"; public static final Type APPLICATION_TYPE = Type.Helper; enum mergeMode { Union, Intersection, MinOccurence, All } public static void main(String[] args) throws IOException, SOMLibFileFormatException { JSAPResult config = OptionFactory.parseResults(args, OPTIONS); String[] inputTvs = FileUtils.findAllSOMLibFiles(config, "inputs", "inputDir", ".tv", ".vec"); String[] inputVecs = inputTvs; mergeVectors(inputTvs, inputVecs, config.getFile("output").getAbsolutePath(), config.getString("mode"), null); } public static void mergeVectors(final String[] inputTvs, final String[] inputVecs, String outFile, String modeString, int[] size) throws IOException, SOMLibFileFormatException { int percentage = -1; mergeMode mode = mergeMode.All; if (mode != null) { if (modeString.equalsIgnoreCase(mergeMode.Intersection.toString())) { mode = mergeMode.Intersection; } else if (modeString.equalsIgnoreCase(mergeMode.Union.toString())) { mode = mergeMode.Union; } else if (modeString.equalsIgnoreCase(mergeMode.All.toString())) { } else { try { percentage = Integer.parseInt(modeString); mode = mergeMode.MinOccurence; } catch (NumberFormatException e) { System.out.println("Illegal mode '" + modeString + "'. Aborting"); throw new IllegalArgumentException("Illegal mode '" + modeString + "'."); } } } if (mode == mergeMode.All) { merge(inputTvs, inputVecs, outFile + mergeMode.Union, percentage, mergeMode.Union, size); for (int i = 2; i < inputVecs.length; i++) { merge(inputTvs, inputVecs, outFile + mergeMode.MinOccurence + i, i, mergeMode.MinOccurence, size); } merge(inputTvs, inputVecs, outFile + mergeMode.Intersection, percentage, mergeMode.Intersection, size); } else { merge(inputTvs, inputVecs, outFile, percentage, mode, size); } } private static void merge(final String[] inputTvs, final String[] inputVecs, String outFile, int percentage, mergeMode mode, int[] size) throws IOException, SOMLibFileFormatException { System.out.println("\n\nStarting feature merging, mode: " + mode + (mode == mergeMode.MinOccurence ? ", min occurrence: " + percentage : "") + "\n\n"); if (outFile.contains(File.separator)) { // create output dir new File(outFile).getParentFile().mkdirs(); } TemplateVector[] tvs = new TemplateVector[inputTvs.length]; int totalFeatureCount = 0; int totalVectorCount = 0; // process template vectors Collection<String> mergedFeatures = new LinkedHashSet<String>(); LinkedHashSet<String> allFeatures = new LinkedHashSet<String>(); ElementCounter<String> counter = new ElementCounter<String>(); for (int i = 0; i < inputTvs.length; i++) { System.out.println("processing: " + inputTvs[i]); if (!FileUtils.extractSOMLibDataPrefix(inputTvs[i]).equals(inputTvs[i])) { inputTvs[i] = FileUtils.extractSOMLibDataPrefix(inputTvs[i]); System.out.println("\t=> extracted prefix "); } System.out.println("processing: " + inputTvs[i] + ".tv"); tvs[i] = new SOMLibTemplateVector(inputTvs[i] + ".tv"); totalVectorCount += tvs[i].numVectors(); final ArrayList<String> labels = tvs[i].getLabelsAsList(); if (mode == mergeMode.Intersection) { if (i == 0) {// in the first iteration of an intersection, we have to add all features mergedFeatures.addAll(labels); } // System.out.println("\n\n\nintersection mode, retaining"); mergedFeatures.retainAll(labels); } else if (mode == mergeMode.Union) { // System.out.println("\n\n\nunion mode, adding"); mergedFeatures.addAll(labels); } else { for (String label : labels) { counter.incCount(label); } } // System.out.println("\n\nSize of mergedFeatures in iteration " + i + ": " + mergedFeatures.size()); allFeatures.addAll(labels); totalFeatureCount += labels.size(); } if (mode == mergeMode.MinOccurence) { mergedFeatures.addAll(counter.keyList(percentage)); } mergedFeatures = new ArrayList<String>(mergedFeatures); Collections.sort((ArrayList<String>) mergedFeatures); System.out.println("\n\n"); System.out.println("==================================================="); System.out.println("feature stats:"); System.out.println("Total features in TemplateVectors: " + totalFeatureCount); System.out.println("Unique features: " + allFeatures.size()); System.out.println("Merged features: " + mergedFeatures.size()); System.out.println("Number of vectors: " + totalVectorCount); System.out.println(); System.out.println("==================================================="); // If we merge files where the TVs don't have the same number of vectors as the input vectors, // e.g. if those were generated from transforming a SOM's weight-vectors to an input vector file we need to // re-calculate the number of vectors totalVectorCount = 0; for (String inputVec : inputVecs) { final Map<String, String> headers = FileUtils.readSOMLibFileHeaders( FileUtils.openFile("Input Vector File", inputVec + InputData.inputFileNameSuffix), "input vector"); totalVectorCount += Integer.parseInt(headers.get("$XDIM")); } TemplateVector mergedTV = new SOMLibTemplateVector(0, mergedFeatures.toArray(new String[mergedFeatures.size()])); for (TemplateVector templateVector : tvs) { for (int k = 0; k < templateVector.dim(); k++) { if (mergedTV.containsLabel(templateVector.getLabel(k))) { final int index = mergedTV.getIndexOfFeature(templateVector.getLabel(k)); mergedTV.getElement(index).mergeStatiscticsWithOtherElement(templateVector.getElement(k)); mergedTV.incNumVectors(templateVector.numVectors()); } } } // with the correct term statistics, we can write the template vector file InputDataWriter.writeAsSOMLib(mergedTV, outFile + ".tv"); // start writing the vector, first open a class-info file to add vectors too later SOMLibClassInformation classInfo = new SOMLibClassInformation(); PrintWriter writer = FileUtils.openFileForWriting("Input vector file", outFile + ".vec", true); InputDataWriter.writeHeaderToFile(writer, totalVectorCount, mergedFeatures.size()); for (int i = 0; i < inputVecs.length; i++) { System.out.println("processing: " + inputVecs[i] + InputData.inputFileNameSuffix); final InputData inputData = new SOMLibSparseInputData(inputVecs[i] + InputData.inputFileNameSuffix); inputData.setTemplateVector(tvs[i]); final TemplateVector templateVector = tvs[i]; final String classname = inputVecs[i]; for (int j = 0; j < inputData.numVectors(); j++) { final InputDatum inputDatum = inputData.getInputDatum(j); final DoubleMatrix1D originalVector = inputDatum.getVector(); SparseDoubleMatrix1D newVector = new SparseDoubleMatrix1D(mergedTV.dim()); boolean[] presentTerms = VectorTools.createBooleanArray(mergedTV.dim(), false); for (int k = 0; k < templateVector.dim(); k++) { if (mergedTV.containsLabel(templateVector.getLabel(k))) { final int index = mergedTV.getIndexOfFeature(templateVector.getLabel(k)); newVector.setQuick(index, originalVector.getQuick(k)); presentTerms[index] = true; } } for (int k = 0; k < presentTerms.length; k++) { if (!presentTerms[k]) { newVector.setQuick(k, Double.NaN); } } // newVector = VectorTools.normaliseByLength(newVector.toArray()); InputDataWriter.writeInputDatumToFile(writer, inputDatum.getLabel(), newVector); writer.flush(); classInfo.addItem(inputDatum.getLabel(), classname); } } writer.close(); // now we can write the class-info file too (vectors got still added in the loop) classInfo.processItems(true); InputDataWriter.writeAsSOMLib(classInfo, outFile + ".cls"); // finally, write a properties file if (size == null) { size = VectorTools.computeDefaultSize(totalVectorCount, 7); } try { new SOMProperties(size[0], size[1], SOMLibSparseInputData.DEFAULT_RANDOM_SEED, 20, 0, SOMProperties.defaultLearnRate, -1d, 1d, "MissingValueMetricWrapper", false).writeToFile(outFile, ".", true); } catch (PropertiesException e) { e.printStackTrace(); System.exit(-1); } } }