/******************************************************************************* * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. *******************************************************************************/ import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.text.SimpleDateFormat; import java.util.Date; import java.util.List; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.commons.cli.PosixParser; import org.apache.log4j.Logger; import org.apache.log4j.PropertyConfigurator; import tml.Configuration; import tml.annotators.Annotator; import tml.corpus.CorpusParameters; import tml.corpus.SearchResultsCorpus; import tml.corpus.TextDocument; import tml.corpus.CorpusParameters.DimensionalityReduction; import tml.corpus.CorpusParameters.TermSelection; import tml.storage.Repository; import tml.vectorspace.TermWeighting.GlobalWeight; import tml.vectorspace.TermWeighting.LocalWeight; import tml.vectorspace.operations.Operation; /** * Command line interface for TML, this is probably the easiest way to access it. * * Intended use should be: * usage: tml <options> [parameters] operation * -I Insert documents into repository. 
 *  --iannotators <arg>       List of annotators to use when inserting
 *                            the documents. (e.g. PennTreeAnnotator).
 *  --iclean                  Empties the repository before inserting
 *                            new ones.
 *  --idocs <folder>          The folder that contains the documents to
 *                            insert.
 *  --imaxdocs <number>       Maximum number of documents to index or
 *                            use in an operation.
 *  -O                        Performs an operation on a corpus.
 *  --oalldocs <type>         Use all documents in repository as single
 *                            document corpora, it can be sentence or
 *                            paragraph based. (e.g. sentence).
 *  --obk <query>             Lucene query that defines a background
 *                            knowledge on which the corpus will be
 *                            projected. (e.g. "type:sentences AND
 *                            reference:Document*").
 *  --obkpar <parameter file> Properties file with the background
 *                            knowledge corpus parameters, if not set it
 *                            will use the same as the corpus.
 *  --ocorpus <query>         Lucene query that defines the corpus to
 *                            operate with. (e.g. "type:sentence AND
 *                            reference:Document01").
 *  --ocpar <parameter file>  Properties file with the corpus parameters
 *                            (optional).
 *  --odim <list>             Name of the Dimensionality Reduction
 *                            criteria. (e.g. VARPCT,NUM,PCT,NO).
 *  --odimth <list>           Threshold for the dim options. (e.g.
 *                            0,1,2).
 *  --olanczos                Use Lanczos for SVD decomposition.
 *  --operations <list>       The list of operations you want to execute
 *                            on the corpus. (e.g.
 *                            PassageDistances,PassageSimilarity).
 *  --oresults <folder>       Folder where to store the results. (e.g.
 *                            results/run01/).
 *  --otsel <name>            Name of the Term selection criteria
 *                            (TF,AVG_TF,DF).
 *  --otselth <number>        Threshold for the tsel criteria option.
 *  --otwg <list>             Name of the Global Weight to apply. (e.g.
 *                            None,Normal,GfIdf,Idf,Entropy).
 *  --otwl <list>             Name of the Local Weight to apply. (e.g.
 *                            Binary,TF,TFn,LOGTF).
 *  -repo <folder>            Full path of the repository folder, where
 *                            TML will retrieve (or insert) documents.
 *                            (e.g. /home/user/lucene).
* * @author Jorge Villalon * */ public class TmlCommandLine { private static Logger logger = Logger.getLogger(TmlCommandLine.class); private static Repository repository = null; private static CommandLine line = null; private static Options options = null; private static String repositoryFolder = null; @SuppressWarnings("static-access") public static void main(String[] args) { long time = System.nanoTime(); options = new Options(); // Repository options.addOption(OptionBuilder .withDescription("Full path of the repository folder, where TML will retrieve (or insert) documents. (e.g. /home/user/lucene).") .hasArg() .withArgName("folder") .isRequired() .create("repo")); // Verbosity options.addOption(OptionBuilder .withDescription("Verbose output in the console (it goes verbose to the log file).") .hasArg(false) .isRequired(false) .create("v")); // Operation on corpus options.addOption(OptionBuilder .hasArg(false) .withDescription("Performs an operation on a corpus.") .isRequired(false) .create("O")); // The list of operations options.addOption(OptionBuilder .withDescription("The list of operations you want to execute on the corpus. (e.g. PassageDistances,PassageSimilarity .") .hasArgs() .withValueSeparator(',') .withArgName("list") .isRequired(false) .withLongOpt("operations") .create()); // The file to store the results options.addOption(OptionBuilder .withDescription("Folder where to store the results. (e.g. results/run01/).") .hasArg() .withArgName("folder") .isRequired(false) .withLongOpt("oresults") .create()); // The corpus on which operate options.addOption(OptionBuilder .withDescription("Lucene query that defines the corpus to operate with. (e.g. \"type:sentence AND reference:Document01\").") .hasArg() .withArgName("query") .isRequired(false) .withLongOpt("ocorpus") .create()); // The corpus on which operate options.addOption(OptionBuilder .withDescription("Use all documents in repository as single document corpora, it can be sentence or paragraph based. 
(e.g. sentence).") .hasArgs() .withArgName("type") .isRequired(false) .withLongOpt("oalldocs") .create()); // The properties file for the corpus options.addOption(OptionBuilder .withDescription("Properties file with the corpus parameters (optional).") .hasArg() .withArgName("parameter file") .isRequired(false) .withLongOpt("ocpar") .create()); // Background knowledge corpus options.addOption(OptionBuilder .withDescription("Lucene query that defines a background knowledge on which the corpus will be projected. (e.g. \"type:sentences AND reference:Document*\").") .hasArg() .withArgName("query") .isRequired(false) .withLongOpt("obk") .create()); // Background knowledge parameters options.addOption(OptionBuilder .withDescription("Properties file with the background knowledge corpus parameters, if not set it will use the same as the corpus.") .hasArg() .withArgName("parameter file") .isRequired(false) .withLongOpt("obkpar") .create()); // Term selection String criteria = ""; for(TermSelection tsel : TermSelection.values()) { criteria += "," + tsel.name(); } criteria = criteria.substring(1); options.addOption(OptionBuilder .hasArgs() .withArgName("name") .withDescription("Name of the Term selection criteria (" + criteria + ").") .isRequired(false) .withValueSeparator(',') .withLongOpt("otsel") .create()); // Term selection threshold options.addOption(OptionBuilder .hasArgs() .withArgName("number") .withDescription("Threshold for the tsel criteria option.") .withType(Integer.TYPE) .isRequired(false) .withValueSeparator(',') .withLongOpt("otselth") .create()); // Dimensionality reduction criteria = ""; for(DimensionalityReduction dim : DimensionalityReduction.values()) { criteria += "," + dim.name(); } criteria = criteria.substring(1); options.addOption(OptionBuilder .hasArgs() .withArgName("list") .withDescription("Name of the Dimensionality Reduction criteria. (e.g. 
" + criteria + ").") .isRequired(false) .withValueSeparator(',') .withLongOpt("odim") .create()); // Dimensionality reduction threshold options.addOption(OptionBuilder .hasArgs() .withArgName("list") .withDescription("Threshold for the dim options. (e.g. 0,1,2).") .isRequired(false) .withValueSeparator(',') .withLongOpt("odimth") .create()); // Local weight criteria = ""; for(LocalWeight weight : LocalWeight.values()) { criteria += "," + weight.name(); } criteria = criteria.substring(1); options.addOption(OptionBuilder .hasArgs() .withArgName("list") .withDescription("Name of the Local Weight to apply. (e.g." + criteria + ").") .isRequired(false) .withValueSeparator(',') .withLongOpt("otwl") .create()); // Global weight criteria = ""; for(GlobalWeight weight : GlobalWeight.values()) { criteria += "," + weight.name(); } criteria = criteria.substring(1); options.addOption(OptionBuilder .hasArgs() .withArgName("list") .withDescription("Name of the Global Weight to apply. (e.g. " + criteria + ").") .isRequired(false) .withValueSeparator(',') .withLongOpt("otwg") .create()); // Use Lanczos options.addOption(OptionBuilder .hasArg(false) .withDescription("Use Lanczos for SVD decomposition.") .isRequired(false) .withLongOpt("olanczos") .create()); // Inserting documents in repository options.addOption(OptionBuilder .hasArg(false) .withDescription("Insert documents into repository.") .isRequired(false) .create("I")); // Max documents to insert options.addOption(OptionBuilder .hasArg() .withArgName("number") .withDescription("Maximum number of documents to index or use in an operation.") .withType(Integer.TYPE) .isRequired(false) .withLongOpt("imaxdocs") .create()); // Clean repository options.addOption(OptionBuilder .hasArg(false) .withDescription("Empties the repository before inserting new ones.") .isRequired(false) .withLongOpt("iclean") .create()); // Use annotator options.addOption(OptionBuilder .hasArgs() .withDescription("List of annotators to use when inserting the 
documents. (e.g. PennTreeAnnotator).") .isRequired(false) .withValueSeparator(',') .withLongOpt("iannotators") .create()); // Documents folder options.addOption(OptionBuilder .hasArg() .withArgName("folder") .withDescription("The folder that contains the documens to insert.") .isRequired(false) .withLongOpt("idocs") .create()); // Initializing the line parser CommandLineParser parser = new PosixParser(); try { line = parser.parse(options, args); } catch (ParseException e) { printHelp(options); return; } // Validate that either inserting or an operation are given if(!line.hasOption("I") && !line.hasOption("O")) { System.out.println("One of the options -I or -O must be present."); printHelp(options); return; } repositoryFolder = line.getOptionValue("repo"); try { if(line.hasOption("I")) { indexing(); } else if(line.hasOption("O")){ operation(); } } catch (ParseException e) { System.out.println(e.getMessage()); printHelp(options); return; } System.out.println("TML finished successfully in " + (System.nanoTime() - time)*10E-9 + " seconds."); return; } private static void printHelp(Options options) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp("tml <options> [parameters] operation", options); } @SuppressWarnings("rawtypes") private static void indexing() throws ParseException { if(!line.hasOption("idocs")) { throw new ParseException("Indexing requires the idocs option."); } if(!startTML()) { throw new ParseException("Fatal error initializing TML."); } if(line.hasOption("iclean")) { try { Repository.cleanStorage(repositoryFolder); } catch (Exception e) { logger.error(e); return; } } try { repository = new Repository(repositoryFolder); } catch (Exception e) { e.printStackTrace(); logger.error(e); return; } // Remove all annotators because in command line mode they must be added one by one for(int i=repository.getAnnotators().size()-1; i>=0; i--) { Annotator annotator = repository.getAnnotators().get(i); repository.removeAnnotator(annotator); } 
String[] annotatorsList = line.getOptionValues("iannotators"); if(annotatorsList != null && annotatorsList.length > 0) { for(String annotatorName : annotatorsList) { Class classDefinition = null; Annotator annotator = null; try { classDefinition = Class.forName("tml.annotators." + annotatorName); annotator = (Annotator) classDefinition.newInstance(); } catch (Exception e) { logger.error("The annotator wasn't found! " + annotatorName); logger.error(e); continue; } repository.addAnnotator(annotator); } } String documentsFolder = line.getOptionValue("idocs"); try { if(line.hasOption("imaxdocs")) { int maxDocs = Integer.parseInt(line.getOptionValue("imaxdocs")); repository.addDocumentsInFolder(documentsFolder, maxDocs); } else { repository.addDocumentsInFolder(documentsFolder); } } catch (IOException e) { logger.error(e); return; } } @SuppressWarnings("rawtypes") private static void operation() throws ParseException { if(line.hasOption("ocorpus") && (line.getOptionValue("ocorpus") == null || line.getOptionValue("ocorpus").trim().length() == 0)) { throw new ParseException("Invalid ocorpus option argument value."); } String allDocsCorpusType = line.getOptionValue("oalldocs"); if(line.hasOption("oalldocs") && !allDocsCorpusType.equals("sentence") && !allDocsCorpusType.equals("paragraph")) { throw new ParseException("Invalid oalldocs option argument value."); } String[] operations = line.getOptionValues("operations"); if(operations == null || operations.length == 0) { throw new ParseException("You must specify at least one operation!"); } if(!startTML()) { throw new ParseException("Fatal error initializing TML."); } try { repository = new Repository(repositoryFolder); } catch (Exception e) { logger.error(e); return; } String[] corpusQueries = null; if(line.hasOption("ocorpus")) { corpusQueries = new String[1]; corpusQueries[0] = line.getOptionValue("ocorpus"); } else if(line.hasOption("oalldocs")) { List<TextDocument> docs = null; try { docs = 
repository.getAllTextDocuments(); } catch (Exception e) { logger.fatal("Couldn't get list of documents from repository."); throw new ParseException(e.getMessage()); } corpusQueries = new String[docs.size()]; for(int i=0;i<docs.size();i++) { TextDocument doc = docs.get(i); String referenceId = null; if(line.getOptionValue("oalldocs").equals("sentence")) referenceId = "p*d" + doc.getExternalId(); else referenceId = doc.getExternalId(); corpusQueries[i] = "type:" + line.getOptionValue("oalldocs") + " AND reference:" + referenceId; } } String corpusLine = "NoCorpus"; if(line.hasOption("ocorpus")) corpusLine = line.getOptionValue("ocorpus").replaceAll("\\W", ""); else if(line.hasOption("oalldocs")) corpusLine = "AllDocuments"; String resultsFilename = repository.getIndexPath().substring(1).replaceAll("[/\\\\]", "_") + "." + corpusLine + "." + (new SimpleDateFormat("yyyy-MM-dd-hh-mm")).format(new Date()) + ".txt"; // Initialize arrays and set default parameters DimensionalityReduction[] dims = new DimensionalityReduction[1]; double[] dimths = new double[1]; boolean lanczos = false; TermSelection[] tsels = new TermSelection[1]; double[] tselths = new double[1]; LocalWeight[] twlocals = new LocalWeight[1]; GlobalWeight[] twglobals = new GlobalWeight[1]; CorpusParameters parameters = new CorpusParameters(); dims[0] = parameters.getDimensionalityReduction(); dimths[0] = parameters.getDimensionalityReductionThreshold(); lanczos = parameters.isLanczosSVD(); tsels[0] = parameters.getTermSelectionCriterion(); tselths[0] = parameters.getTermSelectionThreshold(); twlocals[0] = parameters.getTermWeightLocal(); twglobals[0] = parameters.getTermWeightGlobal(); // If the ocpar option is given, load the parameters file and // override the default parameters if(line.hasOption("ocpar")) { parameters.loadFromFile(new File(line.getOptionValue("ocpar"))); dims[0] = parameters.getDimensionalityReduction(); dimths[0] = parameters.getDimensionalityReductionThreshold(); lanczos = 
parameters.isLanczosSVD(); tsels[0] = parameters.getTermSelectionCriterion(); tselths[0] = parameters.getTermSelectionThreshold(); twlocals[0] = parameters.getTermWeightLocal(); twglobals[0] = parameters.getTermWeightGlobal(); } else { // Check for every possible parameter if(line.hasOption("odim")) { dims = new DimensionalityReduction[line.getOptionValues("odim").length]; for(int i=0;i<dims.length;i++) dims[i] = DimensionalityReduction.valueOf(line.getOptionValues("odim")[i]); } if(line.hasOption("odimth")) { dimths = new double[line.getOptionValues("odimth").length]; for(int i=0;i<dimths.length;i++) dimths[i] = Double.parseDouble(line.getOptionValues("odimth")[i]); } if(line.hasOption("olanczos")) lanczos = true; else lanczos = false; if(line.hasOption("otsel")) { tsels = new TermSelection[line.getOptionValues("otsel").length]; for(int i=0;i<tsels.length;i++) tsels[i] = TermSelection.valueOf(line.getOptionValues("otsel")[i]); } if(line.hasOption("otselth")) { tselths = new double[line.getOptionValues("otselth").length]; for(int i=0;i<tselths.length;i++) tselths[i] = Double.parseDouble(line.getOptionValues("otselth")[i]); } if(line.hasOption("otwl")) { twlocals = new LocalWeight[line.getOptionValues("otwl").length]; for(int i=0;i<twlocals.length;i++) twlocals[i] = LocalWeight.valueOf(line.getOptionValues("otwl")[i]); } if(line.hasOption("otwg")) { twglobals = new GlobalWeight[line.getOptionValues("otwg").length]; for(int i=0;i<twglobals.length;i++) twglobals[i] = GlobalWeight.valueOf(line.getOptionValues("otwg")[i]); } } String resultsFolder = line.getOptionValue("oresults"); FileWriter writer = null; if(resultsFolder != null) { File resultsFold = new File(resultsFolder); if(resultsFold.exists() && resultsFold.isDirectory()) { try { File results = new File(resultsFolder + "/" + resultsFilename); writer = new FileWriter(results); } catch (IOException e) { logger.error(e); writer = null; } } } // Create the whole combination of parameters for(TermSelection tsel : 
tsels) for(double tselth : tselths) for(LocalWeight lw : twlocals) for(GlobalWeight gw : twglobals) { CorpusParameters p = new CorpusParameters(); p.setTermSelectionCriterion(tsel); p.setLanczosSVD(lanczos); p.setTermSelectionCriterion(tsel); p.setTermSelectionThreshold(tselth); p.setTermWeightLocal(lw); p.setTermWeightGlobal(gw); logger.debug("Parameters to execute: " + p.toString()); SearchResultsCorpus backgroundKnowledgeCorpus = null; // If we have background knowledge, load it if(line.hasOption("obk")) { backgroundKnowledgeCorpus = new SearchResultsCorpus(line.getOptionValue("obk")); if(line.hasOption("obkpar")) { CorpusParameters bkParameters = new CorpusParameters(); bkParameters.loadFromFile(new File(line.getOptionValue("obkpar"))); backgroundKnowledgeCorpus.setParameters(bkParameters); } try { backgroundKnowledgeCorpus.load(repository); } catch (Exception e) { logger.error("Couldn't load background knowledge corpus."); logger.error(e); e.printStackTrace(); continue; } } // Create the corpus with the query for(String corpusQuery : corpusQueries) { SearchResultsCorpus corpus = new SearchResultsCorpus(corpusQuery); // Loading the corpus try { corpus.setParameters(p); corpus.load(repository); } catch (Exception e) { logger.error("Couldn't load corpus. " + corpus.getLuceneQuery()); logger.error(e); continue; } for(DimensionalityReduction dred : dims) for(double dimth : dimths) { p.setDimensionalityReduction(dred); p.setDimensionalityReductionThreshold(dimth); try { corpus.getParameters().setDimensionalityReduction(dred); corpus.getParameters().setDimensionalityReductionThreshold(dimth); if(backgroundKnowledgeCorpus == null) corpus.getSemanticSpace().calculate(); } catch (Exception e) { logger.error("Couldn't calculate corpus' semantic space"); logger.error(e); e.printStackTrace(); continue; } for(String operation : operations) { Class classDefinition = null; Operation op = null; try { classDefinition = Class.forName("tml.vectorspace.operations." 
+ operation); op = (Operation) classDefinition.newInstance(); } catch (Exception e) { logger.error("The operation wasn't found"); e.printStackTrace(); logger.error(e); continue; } op.setCorpus(corpus); if(backgroundKnowledgeCorpus != null) op.setBackgroundKnowledgeCorpus(backgroundKnowledgeCorpus); try { op.start(); } catch (Exception e) { logger.error("Error while performing the operation"); e.printStackTrace(); logger.error(e); continue; } String backgroundline = "None"; String parametersline = corpus.getParameters().toString(); if(backgroundKnowledgeCorpus != null) { backgroundline = backgroundKnowledgeCorpus.getLuceneQuery() + " [" + backgroundKnowledgeCorpus.getSemanticSpace().getDimensionsKept() + "]"; parametersline = backgroundKnowledgeCorpus.getParameters().toString(); } else { backgroundline += " [" + corpus.getSemanticSpace().getDimensionsKept() + "]"; } String corpusline = "Corpus:" + corpus.getLuceneQuery() + "\n" + "Operation:" + op.getClass().getName() + "\n" + "Background:" + backgroundline + "\n" + "Parameters:" + parametersline + "\n"; if(writer != null) { try { writer.append(corpusline); writer.append(op.getResultsCSVString()); } catch (IOException e) { logger.error("Error writing file " + corpusline); logger.error(e); } } else { System.out.println(corpusline); System.out.println(op.getResultsCSVString()); } }}} } if(writer != null) { try { writer.close(); } catch (IOException e) { logger.error(e); } } } private static boolean startTML() { try { if(line.hasOption("v")) { PropertyConfigurator.configure(Configuration.getTmlProperties(true)); } else { PropertyConfigurator.configure(Configuration.getTmlProperties()); } } catch (IOException e1) { System.out.println("TML jar file is corrupt, please contact the author."); return false; } return true; } }