/*
* Copyright (C) 2015 Artificial Intelligence
* Laboratory @ University of Udine.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
package it.uniud.ailab.dcore.launchers;
import it.uniud.ailab.dcore.Distiller;
import it.uniud.ailab.dcore.DistillerFactory;
import it.uniud.ailab.dcore.eval.GenericDataset;
import it.uniud.ailab.dcore.eval.datasets.SemEval2010;
import it.uniud.ailab.dcore.eval.kp.KeyphraseEvaluatorAll;
import it.uniud.ailab.dcore.eval.training.KeyphraseTrainingSetGenerator;
import it.uniud.ailab.dcore.io.CsvPrinter;
import it.uniud.ailab.dcore.io.GenericSheetPrinter;
import it.uniud.ailab.dcore.io.IOBlackboard;
import it.uniud.ailab.dcore.utils.FileSystem;
import it.uniud.ailab.dcore.utils.Pair;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.List;
import java.util.Locale;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionGroup;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
/**
* The class is responsible for the usage of the Distiller via command-line.
* It's able to:
* <ul>
* <li>Select a pre-defined pipeline and start it, or</li>
* <li>Load a custom pipeline</li>
* <li>Select an input document or folder</li>
* <li>Process the pipeline over the document or the documents contained in the
* folder</li>
* <li>Print the result of the computation.</li>
*
* The input files should be saved in UTF-8 or UTF-16 format.
* </ul>
*
* @author Marco Basaldella
*
* Add a new KE configuration for linguistic feature calculation, named
* stanfordKE
* @modify Giorgia Chiaradia
*/
public class Launcher {
/**
* A shared distiller instance.
*/
private static Distiller distiller;
private enum Mode {
DEFAULT,
EVALUATION,
TRAINING_GENERATION;
}
private static Mode mode = Mode.DEFAULT;
/**
* The file or directory to analyze.
*/
private static File inputPath;
/**
* The output directory.
*/
private static File outputPath;
/**
* The path of the pipeline to use.
*/
private static File configPath;
/**
* Which of the default pipelines has been selected by the user.
*/
private static String defaultConfig = null;
/**
* Which of the packaged pipelines has been selected by the user.
*/
private static String packagedConfig = null;
/**
* The command-line options.
*/
private static final Options options = new Options();
/**
* The language to use to distill.
*/
private static Locale language = null;
/**
* Verbose mode flag.
*/
private static boolean verbose = false;
/**
* The dataset that will be used to perform evaluation or training.
*/
private static String dataset = "";
/**
* Starts the Distiller using the specified configuration, analyzing the
* specified file, writing the output in the specified folder.
*
* @param args the command-line parameters.
*/
public static void main(String[] args) {
CommandLineParser parser = new DefaultParser();
createOptions();
CommandLine cmd;
try {
// parse the command line arguments
cmd = parser.parse(options, args);
} catch (ParseException exp) {
// oops, something went wrong
printError("Error while parsing command line options: "
+ exp.getLocalizedMessage());
return;
}
// if no options has been selected, just return.
if (cmd.getOptions().length == 0) {
printHelp();
return;
}
// read the options.
if (readOptions(cmd)) {
// everything's good! proceed
doWork();
} else {
printError("Unexpected error while parsing command line options\n"
+ "Please contact the developers of the framwork to get "
+ "additional help.");
return;
}
}
/**
* Reads the command line options.
*
* @param cmd the command line options.
* @return true if everything have been parsed right; false otherwise.
*/
private static boolean readOptions(CommandLine cmd) {
// if the user wants help, display that and close
if (cmd.hasOption("h")) {
printHelp();
return true;
}
// read mode
if (cmd.hasOption("e")) {
mode = Mode.EVALUATION;
dataset = cmd.getOptionValue("e");
}
// read mode
if (cmd.hasOption("t")) {
mode = Mode.TRAINING_GENERATION;
dataset = cmd.getOptionValue("t");
}
// set the input file/dir
inputPath = null;
if (cmd.hasOption("f") && cmd.hasOption("d")) {
printError("You can set either -f or -d options, not both.");
return false;
}
if (cmd.hasOption("f")) {
inputPath = new File(cmd.getOptionValue("f"));
if (!inputPath.exists() || !inputPath.isFile()) {
printError("Invalid path: " + inputPath.getAbsolutePath());
return false;
}
} else if (cmd.hasOption("d")) {
inputPath = new File(cmd.getOptionValue("d"));
if (!inputPath.exists() || !inputPath.isDirectory()) {
printError("Invalid path: " + inputPath.getAbsolutePath());
return false;
}
}
if (inputPath == null) {
printError("No input file or directory detected.");
return false;
}
if (cmd.hasOption("o")) {
outputPath = new File(cmd.getOptionValue("o"));
if (!outputPath.exists() && !outputPath.mkdir()) {
printError("Cannot create output directory.");
return false;
}
} else {
outputPath = new File(System.getProperty("user.dir"));
}
int optionCount
= (cmd.hasOption("c") ? 1 : 0)
+ (cmd.hasOption("cd") ? 1 : 0)
+ (cmd.hasOption("cp") ? 1 : 0);
if (optionCount > 1) {
printError("You should specify only one pipeline!");
return false;
} else if (optionCount < 1) {
printError("You should specify a pipeline!");
return false;
} else if (cmd.hasOption("c")) {
configPath = new File(cmd.getOptionValue("c"));
if (!configPath.exists() || !configPath.isFile()) {
printError("Invalid path: " + configPath.getAbsolutePath());
return false;
}
} else if (cmd.hasOption("cd")) {
defaultConfig = cmd.getOptionValue("cd");
} else if (cmd.hasOption("cp")) {
packagedConfig = cmd.getOptionValue("cp");
}
if (cmd.hasOption("v")) {
verbose = true;
}
if (cmd.hasOption("l")) {
language = new Locale(cmd.getOptionValue("l"));
}
return true;
}
/**
* Generates the command-line options.
*/
private static void createOptions() {
// help message
options.addOption(Option.builder("h")
.longOpt("help")
.desc("Display this message")
.hasArg(false)
.build()
);
// work modes: evaluation
options.addOption(Option.builder("e")
.longOpt("evaluate")
.desc("Evaluate the pipeline using the DATASET dataset")
.hasArg(true)
.argName("DATASET")
.build()
);
// work modes: training
options.addOption(Option.builder("t")
.longOpt("training-generation")
.desc("Generate a training set for machine learning "
+ "using the DATASET dataset")
.hasArg(true)
.argName("DATASET")
.build()
);
// load the pipeline
options.addOption(Option.builder("c")
.longOpt("config-file")
.desc("Use the configuration located in PATH")
.hasArg(true)
.argName("FILE")
.build()
);
// load the pipeline-2
options.addOption(Option.builder("cd")
.longOpt("config-default")
.desc("Use one of the default configurations (deprecated)")
.hasArg(true)
.argName("PIPELINE")
.build()
);
// load the pipeline-3
options.addOption(Option.builder("cp")
.longOpt("config-packaged")
.desc("Use one of the pre-packaged configurations")
.hasArg(true)
.argName("PIPELINE")
.build()
);
OptionGroup inputGroup = new OptionGroup();
//inputGroup.setRequired(true);
// load the input file
inputGroup.addOption(Option.builder("f")
.longOpt("file")
.desc("Analyze the input file FILE")
.hasArg(true)
.argName("FILE")
.build()
);
// load the input directory
inputGroup.addOption(Option.builder("d")
.longOpt("dir")
.desc("Analyze all files contained in DIR (not recursive)")
.hasArg(true)
.argName("DIR")
.build()
);
options.addOptionGroup(inputGroup);
// set the output file prefix
options.addOption(Option.builder("o")
.longOpt("output-folder")
.desc("Write the output in PATH")
.hasArg(true)
.argName("PATH")
.build()
);
// verbose distillation
options.addOption(Option.builder("v")
.longOpt("verbose")
.desc("Print details while extracting")
.hasArg(false)
.build());
options.addOption(Option.builder("l")
.longOpt("language")
.desc("LANGUAGE of the input document (optional)")
.hasArg(true)
.argName("LANGUAGE")
.build()
);
}
/**
* Displays an error message followed by the instructions on how to use the
* Launcher.
*
* @param message the error message.
*/
private static void printError(String message) {
System.out.println("Error: " + message);
System.out.println();
printHelp();
}
/**
* Displays the instructions.
*/
private static void printHelp() {
System.out.println("Distiller-CORE library - http://ailab.uniud.it");
System.out.println();
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp("dcore-"
+ Launcher.class.getPackage().getImplementationVersion()
+ ".jar", options);
}
/**
* Decide what Distillation (single or directory) execute and run it.
*/
private static void doWork() {
switch (mode) {
case EVALUATION:
evaluate();
break;
case TRAINING_GENERATION:
generateTrainingSet();
break;
default:
try {
if (inputPath.isFile()) {
analyzeFile(inputPath);
} else {
analyzeDir(inputPath);
}
} catch (IOException ioe) {
System.err.println(ioe.getLocalizedMessage());
System.err.println(ioe.toString());
}
}
}
/**
* Performs the evaluation of a Distiller pipeline on the specified dataset,
* deferring the work to the appropriate class.
*/
private static void evaluate() {
System.out.println("Launching evaluation...");
if (!inputPath.isDirectory()) {
System.err.println(
"You should set the folder containing the evaluation files as input.");
}
setupDistiller();
GenericDataset kpDataset;
switch (dataset) {
case "semeval":
kpDataset = new SemEval2010(inputPath.getAbsolutePath());
break;
default:
kpDataset = null;
}
if (kpDataset == null) {
throw new UnsupportedOperationException(
"Unknown dataset:" + dataset);
}
(new KeyphraseEvaluatorAll(kpDataset)).
evaluate(distiller);
}
/**
* Generates the training set of a Distiller pipeline on the specified
* dataset, deferring the work to the appropriate class.
*/
private static void generateTrainingSet() {
System.out.println("Launching training set generation...");
if (!inputPath.isDirectory()) {
printError(
"You should set the folder containing the dataset files as input.");
return;
}
if (outputPath == null || !outputPath.isDirectory()) {
printError(
"You should set an output folder for the training set files.");
}
setupDistiller();
GenericDataset kpDataset;
switch (dataset) {
case "semeval":
kpDataset = new SemEval2010(inputPath.getAbsolutePath());
break;
default:
kpDataset = null;
}
if (kpDataset == null) {
throw new UnsupportedOperationException(
"Unknown dataset:" + dataset);
}
KeyphraseTrainingSetGenerator trainingGenerator
= new KeyphraseTrainingSetGenerator(kpDataset);
IOBlackboard.setDocumentsFolder(kpDataset.getTrainingFolder());
List<Pair<String, GenericSheetPrinter>> trainingDocuments
= trainingGenerator.generateTrainingSet(distiller);
GenericSheetPrinter trainingSet = new CsvPrinter(CsvPrinter.DEFAULT_DELIMITER, true, true);
for (Pair<String, GenericSheetPrinter> tr : trainingDocuments) {
GenericSheetPrinter p = tr.getRight();
trainingSet.addPrinter(p);
}
String filePath = outputPath.getAbsolutePath()
+ FileSystem.getSeparator()
+ dataset + ".training.txt";
trainingSet.writeFile(filePath);
System.out.println(
"Saved training file in " + filePath);
IOBlackboard.setDocumentsFolder(kpDataset.getTestFolder());
List<Pair<String, GenericSheetPrinter>> testDocuments
= trainingGenerator.generateTestSet(distiller);
GenericSheetPrinter testSet = new CsvPrinter(CsvPrinter.DEFAULT_DELIMITER, true, true);
for (Pair<String, GenericSheetPrinter> tr : testDocuments) {
GenericSheetPrinter p = tr.getRight();
testSet.addPrinter(p);
}
filePath = outputPath.getAbsolutePath()
+ FileSystem.getSeparator()
+ dataset + ".test.txt";
testSet.writeFile(filePath);
System.out.println(
"Saved training file in " + filePath);
}
/**
* Distill the content of a file.
*
* @param filePath the file to analyze.
*
* @throws IOException if there's an error reading the file.
*/
private static void analyzeFile(File filePath) throws IOException {
setupDistiller();
String fileName = filePath.toPath().getFileName().toString();
IOBlackboard.setCurrentDocument(filePath.getAbsolutePath());
String document = loadDocument(filePath);
IOBlackboard.setOutputPathPrefix(outputPath.getAbsolutePath()
+ FileSystem.getSeparator()
+ fileName);
distiller.distill(document);
}
/**
* Load the document trying different charsets. The charset tried, are, in
* order:
* <ul>
* <li>UTF-16;</li>
* <li>UTF-8;</li>
* <li>US-ASCII.</li>
* </ul>
*
* @param filePath the path of the document
* @return the text of the document
* @throws IOException if the charset is not supported
*/
private static String loadDocument(File filePath) throws IOException {
String document = "";
IOException exception = null;
// try different charsets. if none is recognized, throw the
// exception detected when reading.
try {
document = String.join(" ", Files.readAllLines(
filePath.toPath(), StandardCharsets.UTF_8));
} catch (java.nio.charset.MalformedInputException e) {
exception = e;
}
if (exception != null) {
try {
exception = null;
document = String.join(" ", Files.readAllLines(
filePath.toPath(), StandardCharsets.UTF_16));
} catch (java.nio.charset.MalformedInputException e) {
exception = e;
}
}
if (exception != null) {
try {
exception = null;
document = String.join(" ", Files.readAllLines(
filePath.toPath(), StandardCharsets.US_ASCII));
} catch (java.nio.charset.MalformedInputException e) {
exception = e;
}
}
// no charset has been recognized
if (exception != null) {
throw exception;
}
return document;
}
/**
* Distill the content of a directory.
*
* @param inputPath the directory analyze.
*
* @throws IOException if there's an error reading the file.
*/
private static void analyzeDir(File inputPath) throws IOException {
File folderPath = inputPath;
IOBlackboard.setDocumentsFolder(inputPath.getAbsolutePath());
for (File f : folderPath.listFiles()) {
System.out.println("Analyzing " + f.getAbsolutePath() + "...");
analyzeFile(f);
}
}
/**
* Configures the shared Distiller instance.
*/
private static void setupDistiller() {
distiller = null;
if (defaultConfig == null && packagedConfig == null) {
distiller = DistillerFactory.loadFromXML(configPath);
} else if (defaultConfig == null) {
distiller = DistillerFactory.loadFromPackagedXML(
"pipelines"
+ FileSystem.getSeparator()
+ packagedConfig
+ ".xml");
} else if (defaultConfig.equals("simpleKE")) {
distiller = DistillerFactory.getDefaultCode();
//use this configuration to use stanford coreNLP parser and add linguistis
//features to distiller
} else if (defaultConfig.equals("stanfordKE")) {
distiller = DistillerFactory.getStanfordCode();
}// add other default pipelines HERE
// please remeber to document the new pipeline in the help message
// that is printed below
else {
System.out.println("Unrecognized configuration. Supported parameters:");
System.out.println("- simpleKE : simple, offline keyphrase extraction");
System.out.println();
printError("Please select a valid configuration.");
return;
}
if (language != null) {
distiller.setLocale(language);
}
distiller.setVerbose(verbose);
}
}