/******************************************************************************* * Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique) * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. * *******************************************************************************/ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. 
*/ package eu.project.ttc.tools.cli; import java.io.File; import java.io.IOException; import java.io.PrintStream; import java.io.UnsupportedEncodingException; import java.nio.charset.Charset; import java.nio.file.Paths; import java.text.SimpleDateFormat; import java.util.Date; import java.util.List; import java.util.regex.Pattern; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.commons.cli.PosixParser; import org.apache.uima.UIMAException; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.base.Joiner; import com.google.common.base.Optional; import com.google.common.base.Preconditions; import com.google.common.base.Splitter; import com.google.common.base.Stopwatch; import com.google.common.collect.Lists; import eu.project.ttc.engines.cleaner.TermProperty; import eu.project.ttc.engines.desc.Lang; import eu.project.ttc.engines.desc.TermSuiteCollection; import eu.project.ttc.models.OccurrenceType; import eu.project.ttc.models.TermIndex; import eu.project.ttc.tools.TermSuitePipeline; import eu.project.ttc.tools.TermSuiteResourceManager; import eu.project.ttc.utils.FileUtils; import eu.project.ttc.utils.TermUtils; /** * Command line interface for the Terminology extraction (Spotter+Indexer) engines. 
* * @author Damien Cram */ public class TermSuiteTerminoCLI { private enum CollectionMode {ISTEX_API, FILESYSTEM, INLINE_TEXT} private static final Logger LOGGER = LoggerFactory.getLogger(TermSuiteTerminoCLI.class); /** Short usage description of the CLI */ private static final String USAGE = "java [-DconfigFile=<file>] -Xms1g -Xmx2g -cp termsuite-core-x.x.jar eu.project.ttc.tools.cli.TermSuiteTerminoCLI"; /// Parameter names /** Name of the example limit parament */ private static final String TEXT = "text"; /** Name of the watch parameter */ private static final String WATCH = "watch"; /** Name of the corpus parameter */ private static final String PATH_TO_CORPUS = "corpus-home"; /** Name of the resource path parameter */ private static final String PATH_TO_RESOURCE_PACK = "resource-pack"; /** Name of the corpus format parameter */ private static final String CORPUS_FORMAT = "corpus-format"; /** Name of the parameter that must be set to the tt dir */ public static final String P_TAGGER_HOME_DIRECTORY = "tagger-home"; /** Name of the parameter that must be set to disable graphical variants */ private static final String GRAPHICAL_SIMILARITY = "graphical-similarity-th"; /** Name of the paramter that shows tree tagger tags**/ private static final String SHOW_TAGGER_TAGS = "tags"; /** Compost configuration parameters **/ private static final String COMPOST_COEFF = "compost-coeff"; private static final String COMPOST_MIN_COMPONENT_SIZE = "compost-min-component-size"; private static final String COMPOST_MAX_COMPONENT_NUM = "compost-max-component-num"; private static final String COMPOST_SIMILARITY_THRESHOLD = "compost-similarity-threshold"; private static final String COMPOST_SCORE_THRESHOLD = "compost-score-threshold"; /** deactivate the occurrences saving in memory while indexing **/ private static final String NO_OCCURRENCE = "no-occurrence"; /** MongoDB parameters **/ private static final String MONGODB_STORE = "mongodb-store"; private static final String 
MONGODB_SOFT_LINK = "json-mongodb-soft-link"; /** ISTEX API Parameter **/ private static final String ISTEX_API_URL = "istex-api"; private static final String ISTEX_ID_FILE = "istex-id-file"; private static final String ISTEX_ID = "istex-id"; /* * Collection mode */ private CollectionMode collectionMode = CollectionMode.FILESYSTEM; /* * The mongo db options */ private Optional<String> mongoStoreDBURL = Optional.absent(); private boolean mongoStoreSoftLinked = false; /** Mate tagger parameter **/ private static final String MATE = "mate"; /* * With Mate */ private static enum Tagger{Mate, TreeTagger}; /* * Logging arguments */ private static final String DEBUG = "debug"; private static final String TRACE = "trace"; private static final String NO_LOGGING = "no-logging"; /* * Contextualizer */ private static final String CONTEXTUALIZE = "contextualize"; private static final String CONTEXTUALIZE_ALL_TERMS = "contextualize-all-terms"; private static final String CONTEXT_SCOPE = "context-scope"; private static final String ALLOW_MWT_IN_CONTEXTS = "allow-mwts-in-contexts"; /* * Cleaning arguments */ private static final String CLEAN_THRESHOLD = "filter-th"; private static final String CLEAN_TOP_N = "filter-top-n"; private static final String CLEAN_PROPERTY = "filter-property"; private static final String CLEAN_FILTER_VARIANTS = "filter-variants"; /* * Max size filtering */ private static final String PERIODIC_FILTER_PROPERTY = "periodic-filter-property"; private static final String PERIODIC_FILTER_MAX_SIZE = "periodic-filter-max-size"; // the tsv file path argument private static final String TSV = "tsv"; private static final String TSV_PROPERTIES = "tsv-properties"; private static final String TSV_VARIANT_SCORES = "tsv-show-scores"; // the json file path argument private static final String JSON = "json"; // the tbx file path argument private static final String TBX = "tbx"; // the jsonCAS file path argument private static final String JSCASFILE = "jsonCasFile"; // 
tagger argument private Tagger tagger = Tagger.TreeTagger; private Optional<String> resourcePack = Optional.absent(); private String corpusPath = null; private Lang language = null; private String encoding = "UTF-8"; // private static String pipelineCRInputDirectory = null; private String taggerHome = ""; private String inlineText = null; private TermSuiteCollection corpusType = TermSuiteCollection.TXT; private float graphicalSimilarityThreshold = 0.9f; /* * Istex parameters */ private Optional<String> istexAPIUrl = Optional.absent(); private Optional<List<String>> istexIds = Optional.absent(); /* * contetxualizer */ private boolean contextualize = false; private boolean contextualizeAllTerms = false; private boolean allowMWTInContexts = false; private int contextScope = 3; /* * Cleaning parameters */ private Optional<Float> cleaningThreshold = Optional.of(2f); private Optional<Integer> cleaningTopN = Optional.absent(); private Optional<TermProperty> cleaningProperty = Optional.of(TermProperty.WR_LOG); private boolean keepVariantsWhileCleaning = true; /* * Max size periodic filtering */ private Optional<TermProperty> periodicFilteringProperty = Optional.absent(); private int maxSizeFilteringMaxSize = 20000; /* * Spotter params */ private boolean spotWithOccurrences = true; /* * Export params */ private Optional<String> tsvFile = Optional.absent(); private Optional<TermProperty[]> tsvProperties = Optional.absent(); private boolean tsvShowVariantScores = false; private Optional<String> jsonFile = Optional.absent(); private Optional<String> tbxFile = Optional.absent(); private Optional<String> jsonCasFile = Optional.absent(); /* * compost params */ private Optional<Float> compostAlpha = Optional.absent(); private Optional<Float> compostBeta = Optional.absent(); private Optional<Float> compostGamma = Optional.absent(); private Optional<Float> compostDelta = Optional.absent(); private Optional<Integer> compostMinComponentSize = Optional.absent(); private 
Optional<Integer> compostMaxComponentNum = Optional.absent(); private Optional<Float> compostSimilarityThreshold = Optional.of(1f); private Optional<Float> compostScoreThreshold = Optional.absent(); /* * Ouput and display params */ private static Optional<Pattern> watch = Optional.absent(); /** * Application entry point * * @param args * Command line arguments * @throws UnsupportedEncodingException */ public static void main(String[] args) throws Exception { String logPath = Paths.get("logs", "termsuite-" + new SimpleDateFormat("yyyyMMdd-HHmmss").format(new Date()) +".log").toAbsolutePath().toString(); TermSuiteCLIUtils.logToFile(logPath); File logDir = new File("logs"); if(!logDir.exists()) logDir.mkdir(); LOGGER.info("Logging to {}", logPath); TermSuiteTerminoCLI cli = new TermSuiteTerminoCLI(); cli.run(args); } private void run(String[] args) throws IOException, UIMAException, UnsupportedEncodingException { Stopwatch sw = Stopwatch.createStarted(); // create the Options Options options = declareOptions(); try { // Parse and set CL options CommandLine line = new PosixParser().parse(options, args, false); readArguments(line); if(line.hasOption(NO_LOGGING)) TermSuiteCLIUtils.disableLogging(); else if(line.hasOption(DEBUG)) TermSuiteCLIUtils.setGlobalLogLevel("debug"); else if(line.hasOption(TRACE)) TermSuiteCLIUtils.setGlobalLogLevel("trace"); else TermSuiteCLIUtils.setGlobalLogLevel("info"); TermSuiteCLIUtils.logCommandLineOptions(line); TermSuitePipeline pipeline = TermSuitePipeline.create(language.getCode()); switch(collectionMode) { case INLINE_TEXT: pipeline.setInlineString(inlineText); break; case FILESYSTEM: pipeline.setCollection(corpusType, corpusPath, encoding); break; case ISTEX_API: pipeline.setIstexCollection(istexAPIUrl.get(), istexIds.get()); break; } // resource if(resourcePack.isPresent()) { if(resourcePack.get().endsWith(".jar")) pipeline.setResourceJar(resourcePack.get()); else pipeline.setResourceDir(resourcePack.get()); } // mongodb 
if(mongoStoreDBURL.isPresent()) pipeline.setMongoDBOccurrenceStore(mongoStoreDBURL.get()); // tokenizer pipeline.aeWordTokenizer(); // tagger if(tagger == Tagger.TreeTagger) pipeline.setTreeTaggerHome(taggerHome) .aeTreeTagger(); else if(tagger == Tagger.Mate) pipeline.setMateModelPath(taggerHome) .aeMateTaggerLemmatizer(); // Filter urlsFilter pipeline.aeUrlFilter(); // stemmer pipeline.aeStemmer(); // regex spotter pipeline.setSpotWithOccurrences(spotWithOccurrences); pipeline.aeRegexSpotter(); //export Json CAS spotter if(jsonCasFile.isPresent()) pipeline.haeJsonCasExporter(jsonCasFile.get()); // filter stop words pipeline.aeStopWordsFilter(); // specificity computer pipeline.aeSpecificityComputer(); // compost (morphology) if(compostAlpha.isPresent()) pipeline.setCompostCoeffs(compostAlpha.get(), compostBeta.get(), compostGamma.get(), compostDelta.get()); if(compostMinComponentSize.isPresent()) pipeline.setCompostMinComponentSize(compostMinComponentSize.get()); if(compostMaxComponentNum.isPresent()) pipeline.setCompostMaxComponentNum(compostMaxComponentNum.get()); if(compostScoreThreshold.isPresent()) pipeline.setCompostScoreThreshold(compostScoreThreshold.get()); if(compostSimilarityThreshold.isPresent()) pipeline.setCompostSegmentSimilarityThreshold(compostSimilarityThreshold.get()); pipeline.aeCompostSplitter(); // syntactic variant gathering pipeline.aeSyntacticVariantGatherer(); // graphical variant gathering pipeline.setGraphicalVariantSimilarityThreshold(graphicalSimilarityThreshold); pipeline.aeGraphicalVariantGatherer(); if(periodicFilteringProperty.isPresent()) pipeline.aeMaxSizeThresholdCleaner(periodicFilteringProperty.get(), maxSizeFilteringMaxSize); // contextualize if(contextualize) { pipeline .setContextualizeCoTermsType(allowMWTInContexts ? 
OccurrenceType.ALL : OccurrenceType.SINGLE_WORD) .aeContextualizer(contextScope, contextualizeAllTerms); } pipeline.aeExtensionDetector() .aeScorer() .aeRanker(TermProperty.SPECIFICITY, true); // filtering if(cleaningThreshold.isPresent()) { pipeline.setKeepVariantsWhileCleaning(keepVariantsWhileCleaning); pipeline.aeThresholdCleaner( cleaningProperty.get(), cleaningThreshold.get()); } else if(cleaningTopN.isPresent()) { pipeline.setKeepVariantsWhileCleaning(keepVariantsWhileCleaning); pipeline.aeTopNCleaner(cleaningProperty.get(), cleaningTopN.get()); } // stats pipeline.haeCasStatCounter("at end of pipeline"); // Export if(tsvFile.isPresent()) { if(tsvProperties.isPresent()) { pipeline.setTsvExportProperties(tsvProperties.get()); pipeline.setTsvShowScores(tsvShowVariantScores); } else pipeline.setTsvExportProperties( TermProperty.PILOT, TermProperty.FREQUENCY ); pipeline.haeTsvExporter(tsvFile.get()); } if(tbxFile.isPresent()) pipeline.haeTbxExporter(tbxFile.get()); if(jsonFile.isPresent()) { pipeline.setExportJsonWithContext(contextualize); pipeline.setExportJsonWithOccurrences(true); if(mongoStoreSoftLinked) pipeline.linkMongoStore(); pipeline.haeJsonExporter(jsonFile.get()); } // run the pipeline final String termIndexName = "ScriptTermIndex_" + System.currentTimeMillis(); if(collectionMode == CollectionMode.INLINE_TEXT) { LOGGER.info("Running TermSuite pipeline (inline mode)"); JCas cas = JCasFactory.createJCas(); cas.setDocumentText(inlineText); cas.setDocumentLanguage(language.getCode()); pipeline.run(cas); System.err.flush(); System.out.println("Term index: "); TermIndex index = (TermIndex)TermSuiteResourceManager.getInstance().get(termIndexName); TermUtils.showIndex(index, System.out, watch); } else { LOGGER.info("Running TermSuite pipeline in corpus mode"); pipeline.run(); if(watch.isPresent()) TermUtils.showIndex( (TermIndex)TermSuiteResourceManager.getInstance().get(termIndexName), new PrintStream(System.err, true, "UTF-8"), watch); } 
LOGGER.info("Script executed in " + sw.toString()); } catch (ParseException e) { TermSuiteCLIUtils.printUsage(e, USAGE, options); } } public Options declareOptions() { Options options = new Options(); options.addOption(TermSuiteCLIUtils.createOption( null, ISTEX_API_URL, true, "URL to the istex API", false)); options.addOption(TermSuiteCLIUtils.createOption( null, ISTEX_ID_FILE, true, "File containing the list of Istex document ids (one per line).", false)); options.addOption(TermSuiteCLIUtils.createOption( null, ISTEX_ID, true, "List of comma-separated Istex docuement ids", false)); options.addOption(TermSuiteCLIUtils.createOption( null, NO_OCCURRENCE, false, "Deactivate the occurrence store in memory (recommended for big corpus).", false)); options.addOption(TermSuiteCLIUtils.createOption( null, PERIODIC_FILTER_PROPERTY, true, "Activate a periodic cleaning of the on-going terminology by a given property.", false)); options.addOption(TermSuiteCLIUtils.createOption( null, PERIODIC_FILTER_MAX_SIZE, true, "The maximum allowed size of the on-going terminology in memory.", false)); options.addOption(TermSuiteCLIUtils.createOption( null, MATE, false, "Use Mate tagger instead of TreeTagger.", false)); options.addOption(TermSuiteCLIUtils.createOption( null, TEXT, true, "The text to analyze", false)); options.addOption(TermSuiteCLIUtils.createOption( null, COMPOST_MAX_COMPONENT_NUM, true, "The maximum number of components that a compound can have", false)); options.addOption(TermSuiteCLIUtils.createOption( null, COMPOST_MIN_COMPONENT_SIZE, true, "The minimum size allowed in a component", false)); options.addOption(TermSuiteCLIUtils.createOption( null, COMPOST_SCORE_THRESHOLD, true, "The segmentation score threshold of COMPOST algo.", false)); options.addOption(TermSuiteCLIUtils.createOption( null, COMPOST_SIMILARITY_THRESHOLD, true, "The segment similarity threshold above which an existing string in COMPOST index is considered as recognized.", false)); 
options.addOption(TermSuiteCLIUtils.createOption( null, COMPOST_COEFF, true, "COMPOST alpha, beta, gamma and delta parameters, separated with a hyphen \"-\". Sum must be 1", false)); options.addOption(TermSuiteCLIUtils.createOption( null, NO_LOGGING, false, "Disable logging", false)); options.addOption(TermSuiteCLIUtils.createOption( null, DEBUG, false, "fine-grained logging", false)); options.addOption(TermSuiteCLIUtils.createOption( null, TRACE, false, "very fine grained logging", false)); options.addOption(TermSuiteCLIUtils.createOption( null, CONTEXTUALIZE, false, "Enable the contextualizer. Compute a context vector for each SWT term.", false)); options.addOption(TermSuiteCLIUtils.createOption( null, CONTEXTUALIZE_ALL_TERMS, false, "Compute a context vector for MWTs too.", false)); options.addOption(TermSuiteCLIUtils.createOption( null, ALLOW_MWT_IN_CONTEXTS, false, "Allow to set MWTs as cooccurrences in context vectors.", false)); options.addOption(TermSuiteCLIUtils.createOption( null, CONTEXT_SCOPE, true, "The window size for term contexts capture", false)); options.addOption(TermSuiteCLIUtils.createOption( null, CORPUS_FORMAT, true, "The file format in the input corpus. 
txt and tei supported", false)); options.addOption(TermSuiteCLIUtils.createOption( "c", PATH_TO_CORPUS, true, "Path to the corpus", false)); options.addOption(TermSuiteCLIUtils.createOption( "r", PATH_TO_RESOURCE_PACK, true, "Path to the TermSuite resource pack", false)); options.addOption(TermSuiteCLIUtils.createOption( "l", TermSuiteCLIUtils.P_LANGUAGE, true, "language of the input files: fr/en/etc.", true)); options.addOption(TermSuiteCLIUtils.createOption( null, TermSuiteCLIUtils.P_ENCODING, true, "encoding of the input files", false)); options.addOption(TermSuiteCLIUtils.createOption( "t", P_TAGGER_HOME_DIRECTORY, true, "TreeTagger home directory or Mate model directory", true)); options.addOption(TermSuiteCLIUtils.createOption( null, GRAPHICAL_SIMILARITY, false, "The similarity threshold (a value between 0 and 1, 0.9 advised) for graphical variant gathering.", false)); options.addOption(TermSuiteCLIUtils.createOption( null, SHOW_TAGGER_TAGS, false, "Show tree tagger tags", false)); options.addOption( null, WATCH, true, "Show infos about terms matching this string"); options.addOption( null, CLEAN_PROPERTY, true, "The name of the term property used for cleaning filtering the term index"); options.addOption( null, CLEAN_FILTER_VARIANTS, false, "Also filter variants with terms."); options.addOption( null, CLEAN_THRESHOLD, true, "The filtering threshold"); options.addOption( null, CLEAN_TOP_N, true, "The number of terms to keep after filtering"); options.addOption( null, TSV, true, "The tsv file path where to export the term index"); options.addOption( null, TSV_PROPERTIES, true, "comma-separated list of term properties to export as a column in TSV file"); options.addOption( null, TSV_VARIANT_SCORES, false, "shows variant scores next to the \"V\" label"); options.addOption( null, TBX, true, "The tbx file path where to export the term index"); options.addOption( null, JSON, true, "The json file path where to export the term index"); options.addOption( null, 
JSCASFILE, true, "The directory path where to export the TreeTagger token of each files give in entry of TermSuite in " + "Json Format"); options.addOption( null, MONGODB_STORE, true, "The mongo db url of the database where to store the occurrences"); options.addOption( null, MONGODB_SOFT_LINK, false, "shows variant scores next to the \"V\" label"); return options; } public void readArguments(CommandLine line) throws IOException { /* * Collection Reader arguments */ if(line.hasOption(ISTEX_API_URL)) { collectionMode = CollectionMode.ISTEX_API; istexAPIUrl = Optional.of(line.getOptionValue(ISTEX_API_URL)); List<String> ids = Lists.newLinkedList(); if(line.hasOption(ISTEX_ID_FILE)) { ids = FileUtils.getUncommentedLines( new File(line.getOptionValue(ISTEX_ID_FILE)), Charset.forName("UTF-8")); } else if(line.hasOption(ISTEX_ID)) { ids = Splitter.on(",").splitToList(line.getOptionValue(ISTEX_ID)); } else TermSuiteCLIUtils.exitWithErrorMessage("On argument of --" + ISTEX_ID_FILE + ", --" + ISTEX_ID + " must be set."); istexIds = Optional.of(ids); } else if(line.hasOption(TEXT)) { inlineText = line.getOptionValue(TEXT); if(inlineText == null) inlineText = TermSuiteCLIUtils.readIn(encoding); collectionMode = CollectionMode.INLINE_TEXT; } else if(line.hasOption(PATH_TO_CORPUS)) { corpusPath = line.getOptionValue(PATH_TO_CORPUS); collectionMode = CollectionMode.FILESYSTEM; } else TermSuiteCLIUtils.exitWithErrorMessage("On argument of --" + TEXT + ", --" + PATH_TO_CORPUS + ", --" + ISTEX_API_URL + " must be set."); if(line.hasOption(PATH_TO_RESOURCE_PACK)) resourcePack = Optional.of(line.getOptionValue(PATH_TO_RESOURCE_PACK)); if(line.hasOption(NO_OCCURRENCE)) spotWithOccurrences = false; language = Lang.forName(line.getOptionValue(TermSuiteCLIUtils.P_LANGUAGE)); encoding = line.getOptionValue(TermSuiteCLIUtils.P_ENCODING, "UTF-8"); taggerHome = line.getOptionValue(P_TAGGER_HOME_DIRECTORY); if(line.hasOption(CORPUS_FORMAT)) { 
if(line.getOptionValue(CORPUS_FORMAT).equals(TermSuiteCollection.TEI.name())) { corpusType = TermSuiteCollection.TEI; } else if(line.getOptionValue(CORPUS_FORMAT).equals(TermSuiteCollection.TXT.name())) { corpusType = TermSuiteCollection.TXT; } else TermSuiteCLIUtils.exitWithErrorMessage("Unknown corpus format: " + line.getOptionValue(CORPUS_FORMAT) + ". Supported formats: " + Joiner.on(',').join(TermSuiteCollection.values())); } // pipelineCRInputDirectory = TermSuiteCLIUtils.getCorpusLanguagePath(corpusPath, language, corpusType.name().toLowerCase()); if(line.hasOption(GRAPHICAL_SIMILARITY)) graphicalSimilarityThreshold = Float.parseFloat(line.getOptionValue(GRAPHICAL_SIMILARITY)); if(line.hasOption(COMPOST_MIN_COMPONENT_SIZE)) compostMinComponentSize = Optional.of(Integer.parseInt(line.getOptionValue(COMPOST_MIN_COMPONENT_SIZE))); if(line.hasOption(COMPOST_MAX_COMPONENT_NUM)) compostMaxComponentNum = Optional.of(Integer.parseInt(line.getOptionValue(COMPOST_MAX_COMPONENT_NUM))); if(line.hasOption(COMPOST_SCORE_THRESHOLD)) compostScoreThreshold = Optional.of(Float.parseFloat(line.getOptionValue(COMPOST_SCORE_THRESHOLD))); if(line.hasOption(WATCH)) watch = Optional.of(Pattern.compile(line.getOptionValue(WATCH))); if(line.hasOption(COMPOST_SIMILARITY_THRESHOLD)) compostSimilarityThreshold = Optional.of(Float.parseFloat(line.getOptionValue(COMPOST_SIMILARITY_THRESHOLD))); if(line.hasOption(COMPOST_COEFF)) { List<String> strings = Splitter.on('-').splitToList(line.getOptionValue(COMPOST_COEFF)); compostAlpha = Optional.of(Float.parseFloat(strings.get(0))); compostBeta = Optional.of(Float.parseFloat(strings.get(1))); compostGamma = Optional.of(Float.parseFloat(strings.get(2))); compostDelta = Optional.of(Float.parseFloat(strings.get(3))); Preconditions.checkArgument( 1.0f == compostAlpha.get() + compostBeta.get() + compostGamma.get() + compostDelta.get(), String.format("The sum of Compost coeffs must be 1 (%3.2f+%3.2f+%3.2f+%3.2f=%3.2f)", compostAlpha.get(), 
compostBeta.get(), compostGamma.get(), compostDelta.get(), compostAlpha.get() + compostBeta.get() + compostGamma.get() + compostDelta.get() ) ); } /* * Contextualizer */ contextualize = line.hasOption(CONTEXTUALIZE); allowMWTInContexts = line.hasOption(ALLOW_MWT_IN_CONTEXTS); contextualizeAllTerms = line.hasOption(CONTEXTUALIZE_ALL_TERMS); if(line.hasOption(CONTEXT_SCOPE)) { contextScope = Integer.parseInt(line.getOptionValue(CONTEXT_SCOPE)); } Preconditions.checkArgument( !(line.hasOption(CLEAN_TOP_N) && line.hasOption(CLEAN_THRESHOLD)), "%s and %s cannot be set together", CLEAN_TOP_N, CLEAN_THRESHOLD); if(line.hasOption(CLEAN_THRESHOLD)) { cleaningThreshold = Optional.of(Float.parseFloat(line.getOptionValue(CLEAN_THRESHOLD))); cleaningTopN = Optional.absent(); } if(line.hasOption(CLEAN_TOP_N)) { cleaningTopN = Optional.of(Integer.parseInt(line.getOptionValue(CLEAN_TOP_N))); cleaningThreshold = Optional.absent(); } if(line.hasOption(CLEAN_PROPERTY)) { Preconditions.checkArgument( line.hasOption(CLEAN_TOP_N) || line.hasOption(CLEAN_THRESHOLD), "One of %s or %s must be set together with %s", CLEAN_TOP_N, CLEAN_THRESHOLD, CLEAN_PROPERTY); cleaningProperty = Optional.of(TermProperty.forName(line.getOptionValue(CLEAN_PROPERTY))); } if(line.hasOption(CLEAN_FILTER_VARIANTS)) keepVariantsWhileCleaning = false; if(line.hasOption(PERIODIC_FILTER_PROPERTY)) { periodicFilteringProperty = Optional.of(TermProperty.forName(line.getOptionValue(PERIODIC_FILTER_PROPERTY))); if(line.hasOption(PERIODIC_FILTER_MAX_SIZE)) maxSizeFilteringMaxSize = Integer.parseInt(line.getOptionValue(PERIODIC_FILTER_MAX_SIZE).trim()); } if(line.hasOption(TSV)) tsvFile = Optional.of(line.getOptionValue(TSV)); if(line.hasOption(TSV_PROPERTIES)) { List<TermProperty> list = Lists.newArrayList(); for(String pName:Splitter.on(",").split(line.getOptionValue(TSV_PROPERTIES))) { list.add(TermProperty.forName(pName)); } TermProperty[] ary = new TermProperty[list.size()]; tsvProperties = 
Optional.of(list.toArray(ary)); } if(line.hasOption(TSV_VARIANT_SCORES)) tsvShowVariantScores = true; if(line.hasOption(TBX)) tbxFile = Optional.of(line.getOptionValue(TBX)); if(line.hasOption(JSON)) jsonFile = Optional.of(line.getOptionValue(JSON)); if(line.hasOption(JSCASFILE)) jsonCasFile = Optional.of(line.getOptionValue(JSCASFILE)); if(line.hasOption(MATE)) tagger = Tagger.Mate; if(line.hasOption(MONGODB_STORE)) mongoStoreDBURL = Optional.of(line.getOptionValue(MONGODB_STORE)); if(line.hasOption(MONGODB_SOFT_LINK)) { Preconditions.checkArgument(line.hasOption(MONGODB_STORE), "The option %s requires the option %s", MONGODB_SOFT_LINK, MONGODB_STORE); mongoStoreSoftLinked = true; } } }