/** * Copyright (c) 2014, the Temporal Random Indexing AUTHORS. * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * Neither the name of the University of Bari nor the names of its contributors * may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 * */ package di.uniba.it.tri.script; import di.uniba.it.tri.space.TemporalSpaceUtils; import di.uniba.it.tri.api.Tri; import di.uniba.it.tri.api.TriResultObject; import di.uniba.it.tri.vectors.Vector; import di.uniba.it.tri.vectors.VectorFactory; import di.uniba.it.tri.vectors.VectorReader; import di.uniba.it.tri.vectors.VectorType; import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.logging.Level; import java.util.logging.Logger; import org.apache.commons.cli.BasicParser; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Options; /** * * @author pierpaolo */ public class BuildSimStatistics { static Options options; static CommandLineParser cmdParser = new BasicParser(); static { options = new Options(); options.addOption("i", true, "Input directory") .addOption("o", true, "Output file") .addOption("f", true, "Output format: plain or csv (default=plain)") .addOption("m", true, "Mode: pointwise (point) or cumulative (cum) (default=cum)"); } /** * * * @param args the command line arguments */ public static void main(String[] args) { try { CommandLine cmd = cmdParser.parse(options, args); if (cmd.hasOption("i") && cmd.hasOption("o")) { String format = cmd.getOptionValue("f", "plain"); String mode = cmd.getOptionValue("m", "cum"); if (!(format.equals("plain") || format.equals("csv"))) { throw new IllegalArgumentException("No valid format"); } if (!(mode.equals("point") || mode.equals("cum"))) { throw new IllegalArgumentException("No valid mode"); } Tri api = new Tri(); api.setMaindir(cmd.getOptionValue("i")); //load elemental vector api.load("file", null, "-1"); char sep = '\t'; List<String> years = api.year(0, Integer.MAX_VALUE); BufferedWriter writer = new BufferedWriter(new FileWriter(cmd.getOptionValue("o"))); if (format.equals("csv")) { writer.append(",word"); for (String year : years) { writer.append(","); writer.append(year); } writer.newLine(); sep = ','; } VectorReader evr = api.getStores().get(Tri.ELEMENTAL_NAME); Iterator<String> keys = evr.getKeys(); int c = 0; System.out.println(); Collections.sort(years); Map<String, VectorReader> vrmap = new HashMap<>(); for (String year : years) { System.out.println("Loading " + year); VectorReader vrd = TemporalSpaceUtils.getVectorReader(new File(cmd.getOptionValue("i")), year, true); vrd.init(); vrmap.put(year, vrd); } int dimension = evr.getDimension(); long time = System.currentTimeMillis(); int id = 0; while (keys.hasNext()) { String key = keys.next(); Vector precv = VectorFactory.createZeroVector(VectorType.REAL, dimension); List<TriResultObject> list = new ArrayList<>(); for (String ys : years) { VectorReader vr = vrmap.get(ys); Vector v = vr.getVector(key); if (v != null) { if (mode.equals("cum")) { Vector copy = precv.copy(); copy.superpose(v, 1, null); copy.normalize(); list.add(new TriResultObject(ys + "\t" + key, (float) copy.measureOverlap(precv))); precv.superpose(v, 1, null); precv.normalize(); } else { list.add(new TriResultObject(ys + "\t" + key, (float) v.measureOverlap(precv))); precv = v.copy(); } } else { list.add(new TriResultObject(ys + "\t" + key, 0)); } } if (format.equals("csv")) { writer.append(String.valueOf(id)); writer.append(sep); } writer.append(key); //list.remove(0); for (TriResultObject r : list) { if (r.getScore() >= 0) { writer.append(sep).append(String.valueOf(r.getScore())); } else { writer.append(sep).append(String.valueOf(0f)); } } writer.newLine(); c++; if (c % 10000 == 0) { System.out.println("Processed " + c + " words\t" + ((System.currentTimeMillis() - time) / 100) + " sec."); time = System.currentTimeMillis(); } id++; } writer.close(); } else { HelpFormatter helpFormatter = new HelpFormatter(); helpFormatter.printHelp("Build sim matrix", options, true); } } catch (Exception ex) { Logger.getLogger(BuildSimStatistics.class.getName()).log(Level.SEVERE, null, ex); } } }