/******************************************************************************* * Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique) * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. * *******************************************************************************/ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. 
*/ package eu.project.ttc.tools.cli; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.io.PrintStream; import java.io.UnsupportedEncodingException; import java.nio.file.Paths; import java.text.SimpleDateFormat; import java.util.Date; import java.util.List; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.commons.cli.PosixParser; import org.apache.commons.io.FileUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.base.Optional; import com.google.common.base.Splitter; import com.google.common.base.Stopwatch; import com.google.common.collect.Lists; import eu.project.ttc.api.JsonOptions; import eu.project.ttc.engines.BilingualAligner; import eu.project.ttc.engines.BilingualAligner.TranslationCandidate; import eu.project.ttc.metrics.Cosine; import eu.project.ttc.metrics.Jaccard; import eu.project.ttc.metrics.SimilarityDistance; import eu.project.ttc.models.Term; import eu.project.ttc.models.TermIndex; import eu.project.ttc.models.index.JsonTermIndexIO; import eu.project.ttc.tools.TermSuiteAlignerBuilder; /** * Command line interface for the Terminology extraction (Spotter+Indexer) engines. 
*
 * <p>This CLI loads a source and a target terminology (JSON files) together with
 * an alignment dictionary, builds a {@link BilingualAligner} and prints, for
 * every requested source term, its translation candidates.
 *
 * @author Damien Cram
 */
public class TermSuiteAlignerCLI {

    private static final Logger LOGGER = LoggerFactory.getLogger(TermSuiteAlignerCLI.class);

    /** Short usage description of the CLI */
    private static final String USAGE = "java [-DconfigFile=<file>] -Xms1g -Xmx2g -cp termsuite-core-x.x.jar eu.project.ttc.tools.cli.TermSuiteAlignerCLI";

    // Parameter names
    private static final String TERM = "term";
    private static final String N = "n";
    private static final String TERM_LIST = "term-list";
    private static final String SOURCE_TERMINO = "source-termino";
    private static final String TARGET_TERMINO = "target-termino";
    private static final String DICTIONARY = "dictionary";
    private static final String DISTANCE = "distance";
    private static final String EXPLAIN = "explain";

    // Accepted values of the DISTANCE parameter
    private static final String DISTANCE_COSINE = "cosine";
    private static final String DISTANCE_JACCARD = "jaccard";

    // Values read from the command line (defaults apply when the option is absent)
    private Optional<TermIndex> sourceTermino = Optional.absent();
    private Optional<TermIndex> targetTermino = Optional.absent();
    private String dicoPath;
    private int n = 10;
    private List<String> terms = Lists.newArrayList();
    private SimilarityDistance distance = new Cosine();
    private boolean showExplanation = false;

    /**
     * Application entry point
     *
     * @param args
     *            Command line arguments
     * @throws UnsupportedEncodingException
     *             kept in the signature for backward compatibility; nothing
     *             in this method's body throws it
     */
    public static void main(String[] args) throws UnsupportedEncodingException {
        new TermSuiteAlignerCLI().run(args, System.out);
    }

    /**
     * Parses the command line, builds the aligner and prints the translation
     * candidates of every requested source term.
     *
     * <p>Command line parsing errors result in the usage being printed. Any
     * other exception is reported on {@code System.err} and in the log; it is
     * not propagated to the caller.
     *
     * @param args
     *            Command line arguments
     * @param out
     *            The stream the alignment results are written to
     */
    public void run(String[] args, PrintStream out) {
        File logDir = new File("logs");
        if (!logDir.exists()) {
            logDir.mkdir();
        }
        String logPath = Paths.get("logs", "termsuite-aligner-" + new SimpleDateFormat("yyyyMMdd-HHmmss").format(new Date()) + ".log").toAbsolutePath().toString();
        TermSuiteCLIUtils.logToFile(logPath);
        Stopwatch sw = Stopwatch.createStarted();
        LOGGER.info("Logging to {}", logPath);
        try {
            // create the command line parser
            PosixParser parser = new PosixParser();

            // create the Options
            Options options = declareOptions();

            try {
                // Parse and set CL options
                CommandLine line = parser.parse(options, args, false);
                readArguments(line, out);
                TermSuiteCLIUtils.setGlobalLogLevel("info");
                TermSuiteCLIUtils.logCommandLineOptions(line);

                BilingualAligner aligner = TermSuiteAlignerBuilder.start()
                        .setSourceTerminology(sourceTermino.get())
                        .setTargetTerminology(targetTermino.get())
                        .setDicoPath(dicoPath)
                        .setDistance(distance)
                        .create();

                for (String term : terms) {
                    Term sourceTerm = readSourceTerm(term);
                    if (sourceTerm == null) {
                        LOGGER.error("Cannot find term \"{}\" in {}", term, line.getOptionValue(SOURCE_TERMINO));
                    } else {
                        printCandidates(aligner, sourceTerm, out);
                    }
                }

                LOGGER.info("Script executed in {}", sw);
            } catch (ParseException e) {
                TermSuiteCLIUtils.printUsage(e, USAGE, options);
            }
        } catch (Exception e) {
            // The stack trace stays on stderr for the CLI user; the throwable is
            // also handed to the logger so that the log file keeps the cause.
            e.printStackTrace(System.err);
            LOGGER.error(e.getMessage(), e);
        }
    }

    /**
     * Prints the translation candidates of one source term, preceded by a
     * header identifying the term when several terms are being aligned.
     */
    private void printCandidates(BilingualAligner aligner, Term sourceTerm, PrintStream out) {
        if (terms.size() > 1) {
            out.println("---");
            out.println(sourceTerm);
            out.println("-");
        }
        for (TranslationCandidate candidate : aligner.align(sourceTerm, n, 1)) {
            if (showExplanation) {
                out.format("%s\t%.3f\t%s\t%s\n",
                        candidate.getTerm(),
                        candidate.getScore(),
                        candidate.getMethod(),
                        candidate.getExplanation().getText());
            } else {
                out.format("%s\t%.3f\t%s\n",
                        candidate.getTerm(),
                        candidate.getScore(),
                        candidate.getMethod());
            }
        }
    }

    /**
     * Looks up a term of the source terminology by its grouping key, pilot or
     * lemma. The pilot and the lemma are additionally compared against the
     * lower-cased form of the requested string.
     *
     * @param term
     *            The string identifying the term to find
     * @return the first matching term, or {@code null} if there is none
     */
    private Term readSourceTerm(String term) {
        // Computed once instead of up to twice per indexed term.
        String lowerCased = term.toLowerCase();
        for (Term t : sourceTermino.get().getTerms()) {
            if (t.getGroupingKey().equals(term)
                    || t.getPilot().equals(term)
                    || t.getLemma().equals(term)
                    || t.getPilot().equals(lowerCased)
                    || t.getLemma().equals(lowerCased)) {
                return t;
            }
        }
        return null;
    }

    /**
     * Declares the command line options understood by this CLI.
     */
    private Options declareOptions() {
        Options options = new Options();

        options.addOption(TermSuiteCLIUtils.createOption(
                null,
                SOURCE_TERMINO,
                true,
                "Source terminology (json file)",
                true));

        options.addOption(TermSuiteCLIUtils.createOption(
                null,
                TARGET_TERMINO,
                true,
                "Target terminology (json file)",
                true));

        options.addOption(TermSuiteCLIUtils.createOption(
                null,
                TERM,
                true,
                "Source term to align",
                false));

        options.addOption(TermSuiteCLIUtils.createOption(
                null,
                TERM_LIST,
                true,
                "File containing a list of source terms to align (one per line)",
                false));

        options.addOption(TermSuiteCLIUtils.createOption(
                null,
                DICTIONARY,
                true,
                "Alignment dictionaries",
                true));

        options.addOption(TermSuiteCLIUtils.createOption(
                N,
                N,
                true,
                "The number of translation candidates to show in the output",
                false));

        options.addOption(TermSuiteCLIUtils.createOption(
                null,
                DISTANCE,
                true,
                "Similarity measure to compute the distance between two vectors [" + DISTANCE_COSINE + "," + DISTANCE_JACCARD + "]",
                false));

        options.addOption(TermSuiteCLIUtils.createOption(
                null,
                EXPLAIN,
                false,
                "Shows for each aligned term the most influencial co-terms",
                false));

        return options;
    }

    /**
     * Reads the parsed command line into the fields of this instance and loads
     * both terminologies.
     *
     * <p>The JVM is terminated with status 1 when neither a term nor a term
     * list is given. An unknown distance or an invalid candidate count is
     * reported through {@code TermSuiteCLIUtils.exitWithErrorMessage}.
     *
     * @param line
     *            The parsed command line
     * @param out
     *            The result stream; only flushed here, before a fatal error
     *            message is written to {@code System.err}
     * @throws IOException
     *             if the term list or one of the terminologies cannot be read
     */
    public void readArguments(CommandLine line, PrintStream out) throws IOException {
        if (!line.hasOption(TERM) && !line.hasOption(TERM_LIST)) {
            String msg = String.format("ERROR: One option of --%s or --%s must be provided.", TERM, TERM_LIST);
            LOGGER.error(msg);
            System.err.flush();
            out.flush();
            System.err.println(msg);
            System.exit(1);
        }

        if (line.hasOption(TERM)) {
            terms.add(line.getOptionValue(TERM));
        }

        if (line.hasOption(TERM_LIST)) {
            File file = new File(line.getOptionValue(TERM_LIST));
            for (String row : FileUtils.readLines(file, "UTF-8")) {
                // Only the first tab-separated column holds the term.
                String term = Splitter.on("\t").splitToList(row).get(0).trim();
                // Blank rows would otherwise become empty terms that can never be found.
                if (!term.isEmpty()) {
                    terms.add(term);
                }
            }
        }

        if (line.hasOption(N)) {
            String rawN = line.getOptionValue(N);
            Integer parsedN = null;
            try {
                parsedN = Integer.valueOf(Integer.parseInt(rawN));
            } catch (NumberFormatException e) {
                // reported below together with out-of-range values
            }
            if (parsedN == null || parsedN.intValue() < 1) {
                TermSuiteCLIUtils.exitWithErrorMessage(String.format(
                        "Invalid value for option -%s: \"%s\". Expected a positive integer.",
                        N,
                        rawN));
            } else {
                n = parsedN.intValue();
            }
        }

        if (line.hasOption(DISTANCE)) {
            String distanceName = line.getOptionValue(DISTANCE);
            if (DISTANCE_COSINE.equals(distanceName)) {
                distance = new Cosine();
            } else if (DISTANCE_JACCARD.equals(distanceName)) {
                distance = new Jaccard();
            } else {
                TermSuiteCLIUtils.exitWithErrorMessage(String.format("Unknown distance: %s. Allowed values: %s;%s",
                        distanceName,
                        DISTANCE_COSINE,
                        DISTANCE_JACCARD));
            }
        }

        LOGGER.info("loading source termino {}", line.getOptionValue(SOURCE_TERMINO));
        JsonOptions loadOptions = new JsonOptions().withContexts(true);
        sourceTermino = Optional.of(loadTermino(line.getOptionValue(SOURCE_TERMINO), loadOptions));

        LOGGER.info("loading target termino {}", line.getOptionValue(TARGET_TERMINO));
        targetTermino = Optional.of(loadTermino(line.getOptionValue(TARGET_TERMINO), loadOptions));

        dicoPath = line.getOptionValue(DICTIONARY);
        showExplanation = line.hasOption(EXPLAIN);
    }

    /**
     * Loads a terminology from a JSON file and closes the file afterwards,
     * whether or not loading succeeds.
     */
    private static TermIndex loadTermino(String path, JsonOptions loadOptions) throws IOException {
        try (FileReader reader = new FileReader(path)) {
            return JsonTermIndexIO.load(reader, loadOptions);
        }
    }
}