/* * To change this license header, choose License Headers in Project Properties. * To change this template file, choose Tools | Templates * and open the template in the editor. */ package di.uniba.it.tri.script; import di.uniba.it.tri.api.Tri; import di.uniba.it.tri.vectors.ObjectVector; import di.uniba.it.tri.vectors.ReverseObjectVectorComparator; import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; import java.util.Collections; import java.util.List; import java.util.logging.Level; import java.util.logging.Logger; import org.apache.commons.cli.BasicParser; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Options; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.Term; import org.apache.lucene.store.FSDirectory; /** * * @author pierpaolo */ public class FindVariation { private static final Logger LOG = Logger.getLogger(FindVariation.class.getName()); static Options options; static CommandLineParser cmdParser = new BasicParser(); static { options = new Options(); options.addOption("d", true, "TIR directory") .addOption("min", true, "Min threshold (optional)") .addOption("max", true, "Max threshold (optional)") .addOption("idx", true, "Index (optional)") .addOption("f", true, "Index field name (optional, default value 'content')") .addOption("o", true, "Output file"); } /** * @param args the command line arguments */ public static void main(String[] args) { try { CommandLine cmd = cmdParser.parse(options, args); if (cmd.hasOption("d") && cmd.hasOption("o")) { Tri tri = new Tri(); tri.setMaindir(cmd.getOptionValue("d")); double min = Double.parseDouble(cmd.getOptionValue("min", "0")); double max = Double.parseDouble(cmd.getOptionValue("max", "1")); String fieldname = cmd.getOptionValue("f", "content"); DirectoryReader reader = null; if (cmd.hasOption("idx")) { LOG.info("Open index..."); reader = DirectoryReader.open(FSDirectory.open(new File(cmd.getOptionValue("idx")))); } LOG.info("Init TRI..."); List<String> years = tri.year(0, Integer.MAX_VALUE); for (String y : years) { tri.load("mem", y, y); } Collections.sort(years); int y = 0; while (y < years.size() - 1) { LOG.log(Level.INFO, "Computing variation {0}_{1}", new Object[]{years.get(y), years.get(y + 1)}); List<ObjectVector> results = tri.sims(years.get(y), years.get(y + 1), Integer.MAX_VALUE, min, max); if (reader != null) { for (ObjectVector ov : results) { double ds = (double) reader.docFreq(new Term(fieldname, ov.getKey())) / (double) reader.maxDoc(); ov.setScore(ov.getScore() * ds); } } Collections.sort(results, new ReverseObjectVectorComparator()); LOG.info("Store..."); BufferedWriter writer = new BufferedWriter(new FileWriter(cmd.getOptionValue("o") + "_" + years.get(y) + "_" + years.get(y + 1))); for (ObjectVector ov : results) { writer.append(ov.getKey()).append("\t").append(String.valueOf(ov.getScore())); writer.newLine(); } writer.close(); y++; } } else { HelpFormatter helpFormatter = new HelpFormatter(); helpFormatter.printHelp("Extract the list of words that change meaning", options, true); } } catch (Exception ex) { LOG.log(Level.SEVERE, null, ex); } } }