Launcher.java example

Explorer

distiller-CORE-master
- src
  - main
    - java
      - it
        uniud
        ailab
        dcore
        Blackboard.java
        DistilledOutput.java
        Distiller.java
        DistillerException.java
        DistillerFactory.java
        Pipeline.java
        Stage.java
        annotation
        Annotable.java
        Annotation.java
        AnnotationException.java
        Annotator.java
        DefaultAnnotations.java
        annotations
        CoreferenceChainAnnotation.java
        FeatureAnnotation.java
        InferenceAnnotation.java
        NERAnnotation.java
        ScoredAnnotation.java
        TextAnnotation.java
        UriAnnotation.java
        annotators
        ChunkingNerAnnotator.java
        CoreferenceResolverAnnotator.java
        DocumentPhraseMaximalityAnnotator.java
        GenericEvaluatorAnnotator.java
        GenericNGramGeneratorAnnotator.java
        GenericWikipediaAnnotator.java
        GramMergerAnnotator.java
        ItalianLemmatizerAnnotator.java
        LinearEvaluatorAnnotator.java
        PorterStemmerAnnotator.java
        RawTdidfAnnotator.java
        RegexNGramGeneratorAnnotator.java
        SimpleAnnotationFilterAnnotator.java
        SimpleCutFilterAnnotator.java
        SimpleNGramGeneratorAnnotator.java
        SkylineGramFilterAnnotator.java
        StatisticalAnnotator.java
        StopwordSimpleFilterAnnotator.java
        SyuzhetAnnotator.java
        TagMeGramAnnotator.java
        TagMeTokenAnnotator.java
        WikipediaInferenceAnnotator.java
        eval
        Evaluator.java
        GenericDataset.java
        TrainingSetGenerator.java
        datasets
        SemEval2010.java
        kp
        KeyphraseEvaluator15.java
        KeyphraseEvaluatorAll.java
        training
        KeyphraseTrainingSetGenerator.java
        io
        CsvPrinter.java
        FileWriterStage.java
        GenericSheetPrinter.java
        GramPrinter.java
        IOBlackboard.java
        SentencePrinter.java
        TokenPrinter.java
        launchers
        Launcher.java
        SampleInference.java
        SimpleKE.java
        StanfordKE.java
        persistence
        DocumentComponent.java
        DocumentComposite.java
        Gram.java
        Keyphrase.java
        Mention.java
        Sentence.java
        Token.java
        utils
        BlackboardUtils.java
        DocumentUtils.java
        Either.java
        FileSystem.java
        GramUtils.java
        ListUtils.java
        Pair.java
        SnowballStemmerSelector.java
        StageUtils.java
        WikipediaUtils.java
        wrappers
        external
        CybozuLanguageDetectorAnnotator.java
        OpenNlpBootstrapperAnnotator.java
        RCallerEvaluator.java
        StanfordBootstrapperAnnotator.java
        StanfordFastBootstrapperAnnotator.java
  - test
    - java
      - test.java

/*
 * Copyright (C) 2015 Artificial Intelligence
 * Laboratory @ University of Udine.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
package it.uniud.ailab.dcore.launchers;

import it.uniud.ailab.dcore.Distiller;
import it.uniud.ailab.dcore.DistillerFactory;
import it.uniud.ailab.dcore.eval.GenericDataset;
import it.uniud.ailab.dcore.eval.datasets.SemEval2010;
import it.uniud.ailab.dcore.eval.kp.KeyphraseEvaluatorAll;
import it.uniud.ailab.dcore.eval.training.KeyphraseTrainingSetGenerator;
import it.uniud.ailab.dcore.io.CsvPrinter;
import it.uniud.ailab.dcore.io.GenericSheetPrinter;
import it.uniud.ailab.dcore.io.IOBlackboard;
import it.uniud.ailab.dcore.utils.FileSystem;
import it.uniud.ailab.dcore.utils.Pair;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.List;
import java.util.Locale;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionGroup;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;

/**
 * The class is responsible for the usage of the Distiller via command-line.
 * It's able to:
 * <ul>
 * <li>Select a pre-defined pipeline and start it, or</li>
 * <li>Load a custom pipeline</li>
 * <li>Select an input document or folder</li>
 * <li>Process the pipeline over the document or the documents contained in the
 * folder</li>
 * <li>Print the result of the computation.</li>
 * </ul>
 *
 * <p>The input files should be saved in UTF-8 or UTF-16 format.</p>
 *
 * <p>Modified by Giorgia Chiaradia: added a new KE configuration for
 * linguistic feature calculation, named stanfordKE.</p>
 *
 * @author Marco Basaldella
 */
public class Launcher {

    /**
     * A shared distiller instance.
     */
    private static Distiller distiller;

    /**
     * The work modes supported by the launcher. The selected mode decides
     * which operation is run once the command line has been read.
     */
    private enum Mode {

        // plain distillation of the input file or directory (the initial mode)
        DEFAULT,
        // evaluation of the pipeline on a dataset; selected by the -e option
        EVALUATION,
        // generation of a training set from a dataset; selected by the -t option
        TRAINING_GENERATION;
    }

    private static Mode mode = Mode.DEFAULT;

    /**
     * The file or directory to analyze.
     */
    private static File inputPath;

    /**
     * The output directory.
     */
    private static File outputPath;

    /**
     * The path of the pipeline to use.
     */
    private static File configPath;

    /**
     * Which of the default pipelines has been selected by the user.
     */
    private static String defaultConfig = null;

    /**
     * Which of the packaged pipelines has been selected by the user.
     */
    private static String packagedConfig = null;

    /**
     * The command-line options.
     */
    private static final Options options = new Options();

    /**
     * The language to use to distill.
     */
    private static Locale language = null;

    /**
     * Verbose mode flag.
     */
    private static boolean verbose = false;

    /**
     * The dataset that will be used to perform evaluation or training.
     */
    private static String dataset = "";

    /**
     * Starts the Distiller using the specified configuration, analyzing the
     * specified file, writing the output in the specified folder.
     *
     * <p>The method builds the option set, parses {@code args}, prints the
     * usage when no option (or the help option) is given, and otherwise
     * validates the options and runs the selected work mode.</p>
     *
     * @param args the command-line parameters.
     */
    public static void main(String[] args) {

        createOptions();

        CommandLineParser parser = new DefaultParser();
        CommandLine cmd;

        try {
            // parse the command line arguments
            cmd = parser.parse(options, args);
        } catch (ParseException exp) {
            // the arguments do not match the declared options
            printError("Error while parsing command line options: "
                    + exp.getLocalizedMessage());
            return;
        }

        // No option at all, or an explicit help request: show the usage and
        // stop. Help has to be handled here because readOptions() reports
        // success for -h without setting an input path, so continuing to
        // doWork() would dereference a null inputPath.
        if (cmd.getOptions().length == 0 || cmd.hasOption("h")) {
            printHelp();
            return;
        }

        // read the options.
        if (readOptions(cmd)) {
            // everything's good! proceed
            doWork();
        } else {
            printError("Unexpected error while parsing command line options\n"
                    + "Please contact the developers of the framwork to get "
                    + "additional help.");
        }
    }

    /**
     * Reads the command line options and stores them in the static fields of
     * the launcher (work mode, dataset, input/output paths, pipeline selection,
     * verbosity and language).
     *
     * <p>Every validation failure is reported through {@code printError}
     * before returning {@code false}. When the help option is present the
     * usage is printed and {@code true} is returned without reading any other
     * option.</p>
     *
     * @param cmd the command line options.
     * @return true if everything has been parsed right; false otherwise.
     */
    private static boolean readOptions(CommandLine cmd) {

        // if the user wants help, display that and close
        if (cmd.hasOption("h")) {
            printHelp();
            return true;
        }

        // work mode: evaluation and training generation exclude each other;
        // accepting both would silently discard the evaluation request
        if (cmd.hasOption("e") && cmd.hasOption("t")) {
            printError("You can set either -e or -t options, not both.");
            return false;
        }

        if (cmd.hasOption("e")) {
            mode = Mode.EVALUATION;
            dataset = cmd.getOptionValue("e");
        } else if (cmd.hasOption("t")) {
            mode = Mode.TRAINING_GENERATION;
            dataset = cmd.getOptionValue("t");
        }

        // set the input file/dir
        inputPath = null;
        if (cmd.hasOption("f") && cmd.hasOption("d")) {
            printError("You can set either -f or -d options, not both.");
            return false;
        }

        if (cmd.hasOption("f")) {
            inputPath = new File(cmd.getOptionValue("f"));
            if (!inputPath.exists() || !inputPath.isFile()) {
                printError("Invalid path: " + inputPath.getAbsolutePath());
                return false;
            }
        } else if (cmd.hasOption("d")) {
            inputPath = new File(cmd.getOptionValue("d"));
            if (!inputPath.exists() || !inputPath.isDirectory()) {
                printError("Invalid path: " + inputPath.getAbsolutePath());
                return false;
            }
        }
        if (inputPath == null) {
            printError("No input file or directory detected.");
            return false;
        }

        // set the output directory (defaults to the working directory)
        if (cmd.hasOption("o")) {
            outputPath = new File(cmd.getOptionValue("o"));
            if (outputPath.exists()) {
                // an existing regular file cannot receive the output files
                if (!outputPath.isDirectory()) {
                    printError("Invalid output directory: "
                            + outputPath.getAbsolutePath());
                    return false;
                }
            } else if (!outputPath.mkdirs()) {
                // mkdirs() also creates the missing parent directories
                printError("Cannot create output directory.");
                return false;
            }
        } else {
            outputPath = new File(System.getProperty("user.dir"));
        }

        // exactly one way of selecting the pipeline must be used
        int optionCount
                = (cmd.hasOption("c") ? 1 : 0)
                + (cmd.hasOption("cd") ? 1 : 0)
                + (cmd.hasOption("cp") ? 1 : 0);

        if (optionCount > 1) {
            printError("You should specify only one pipeline!");
            return false;
        } else if (optionCount < 1) {
            printError("You should specify a pipeline!");
            return false;
        } else if (cmd.hasOption("c")) {
            configPath = new File(cmd.getOptionValue("c"));
            if (!configPath.exists() || !configPath.isFile()) {
                printError("Invalid path: " + configPath.getAbsolutePath());
                return false;
            }
        } else if (cmd.hasOption("cd")) {
            defaultConfig = cmd.getOptionValue("cd");
        } else if (cmd.hasOption("cp")) {
            packagedConfig = cmd.getOptionValue("cp");
        }

        if (cmd.hasOption("v")) {
            verbose = true;
        }

        if (cmd.hasOption("l")) {
            language = new Locale(cmd.getOptionValue("l"));
        }

        return true;
    }

    /**
     * Generates the command-line options and registers them in the shared
     * {@code options} set.
     *
     * <p>The input file and input directory options are registered as one
     * option group; every other option is registered on its own.</p>
     */
    private static void createOptions() {
        // help message
        options.addOption(Option.builder("h")
                .longOpt("help")
                .desc("Display this message")
                .hasArg(false)
                .build()
        );

        // work modes: evaluation
        options.addOption(Option.builder("e")
                .longOpt("evaluate")
                .desc("Evaluate the pipeline using the DATASET dataset")
                .hasArg(true)
                .argName("DATASET")
                .build()
        );

        // work modes: training
        options.addOption(Option.builder("t")
                .longOpt("training-generation")
                .desc("Generate a training set for machine learning "
                        + "using the DATASET dataset")
                .hasArg(true)
                .argName("DATASET")
                .build()
        );

        // pipeline selection: custom configuration file.
        // The description names the same placeholder (FILE) shown as the
        // argument name, so the usage text is self-consistent.
        options.addOption(Option.builder("c")
                .longOpt("config-file")
                .desc("Use the configuration located in FILE")
                .hasArg(true)
                .argName("FILE")
                .build()
        );

        // pipeline selection: default (hard-coded) configuration
        options.addOption(Option.builder("cd")
                .longOpt("config-default")
                .desc("Use one of the default configurations (deprecated)")
                .hasArg(true)
                .argName("PIPELINE")
                .build()
        );

        // pipeline selection: configuration packaged with the library
        options.addOption(Option.builder("cp")
                .longOpt("config-packaged")
                .desc("Use one of the pre-packaged configurations")
                .hasArg(true)
                .argName("PIPELINE")
                .build()
        );

        // input selection: file and directory belong to the same group
        OptionGroup inputGroup = new OptionGroup();

        // load the input file
        inputGroup.addOption(Option.builder("f")
                .longOpt("file")
                .desc("Analyze the input file FILE")
                .hasArg(true)
                .argName("FILE")
                .build()
        );

        // load the input directory
        inputGroup.addOption(Option.builder("d")
                .longOpt("dir")
                .desc("Analyze all files contained in DIR (not recursive)")
                .hasArg(true)
                .argName("DIR")
                .build()
        );

        options.addOptionGroup(inputGroup);

        // set the output folder
        options.addOption(Option.builder("o")
                .longOpt("output-folder")
                .desc("Write the output in PATH")
                .hasArg(true)
                .argName("PATH")
                .build()
        );

        // verbose distillation
        options.addOption(Option.builder("v")
                .longOpt("verbose")
                .desc("Print details while extracting")
                .hasArg(false)
                .build()
        );

        // language of the input
        options.addOption(Option.builder("l")
                .longOpt("language")
                .desc("LANGUAGE of the input document (optional)")
                .hasArg(true)
                .argName("LANGUAGE")
                .build()
        );
    }

    /**
     * Reports a problem to the user: writes the message, prefixed by
     * "Error: ", then an empty line, then the usage instructions of the
     * Launcher.
     *
     * @param message the error message.
     */
    private static void printError(String message) {
        String errorLine = "Error: " + message;
        // the extra separator produces the blank line before the usage text
        System.out.println(errorLine + System.lineSeparator());
        printHelp();
    }

    /**
     * Displays the instructions: a banner line followed by the usage text
     * generated from the registered command-line options.
     *
     * <p>The jar name shown in the usage includes the implementation version
     * of the package when it is known, and is a plain {@code dcore.jar}
     * otherwise.</p>
     */
    private static void printHelp() {

        System.out.println("Distiller-CORE library - http://ailab.uniud.it");
        System.out.println();

        // getImplementationVersion() returns null when the version is not
        // known; without this check the usage line would read "dcore-null.jar"
        String version
                = Launcher.class.getPackage().getImplementationVersion();
        String jarName = (version == null)
                ? "dcore.jar"
                : "dcore-" + version + ".jar";

        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(jarName, options);
    }

    /**
     * Dispatches to the operation matching the selected work mode: dataset
     * evaluation, training set generation or, for any other mode, the
     * distillation of the input file or directory.
     */
    private static void doWork() {

        if (mode == Mode.EVALUATION) {
            evaluate();
            return;
        }

        if (mode == Mode.TRAINING_GENERATION) {
            generateTrainingSet();
            return;
        }

        // plain distillation: a single file or every entry of a directory
        try {
            if (!inputPath.isFile()) {
                analyzeDir(inputPath);
            } else {
                analyzeFile(inputPath);
            }
        } catch (IOException readFailure) {
            // report both the message and the exception type
            System.err.println(readFailure.getLocalizedMessage());
            System.err.println(readFailure);
        }
    }

    /**
     * Performs the evaluation of a Distiller pipeline on the specified dataset,
     * deferring the work to the appropriate class.
     *
     * <p>The input path must be the folder that contains the evaluation
     * files; when it is not, or when no pipeline could be configured, the
     * problem is reported and nothing is evaluated.</p>
     *
     * @throws UnsupportedOperationException if the requested dataset is not
     * one of the known ones.
     */
    private static void evaluate() {

        System.out.println("Launching evaluation...");

        if (!inputPath.isDirectory()) {
            System.err.println(
                    "You should set the folder containing the evaluation files as input.");
            // without a dataset folder there is nothing to evaluate
            return;
        }

        setupDistiller();

        if (distiller == null) {
            // setupDistiller() leaves the instance null when the requested
            // configuration is not recognized; stop instead of handing a
            // null pipeline to the evaluator
            return;
        }

        GenericDataset kpDataset;

        switch (dataset) {
            case "semeval":
                kpDataset = new SemEval2010(inputPath.getAbsolutePath());
                break;
            default:
                kpDataset = null;
        }

        if (kpDataset == null) {
            throw new UnsupportedOperationException(
                    "Unknown dataset:" + dataset);
        }

        new KeyphraseEvaluatorAll(kpDataset).evaluate(distiller);
    }

    /**
     * Generates the training set of a Distiller pipeline on the specified
     * dataset, deferring the work to the appropriate class.
     *
     * <p>Two files are written in the output folder:
     * {@code <dataset>.training.txt} with the sheets generated from the
     * training documents and {@code <dataset>.test.txt} with the sheets
     * generated from the test documents.</p>
     *
     * @throws UnsupportedOperationException if the requested dataset is not
     * one of the known ones.
     */
    private static void generateTrainingSet() {

        System.out.println("Launching training set generation...");

        if (!inputPath.isDirectory()) {
            printError(
                    "You should set the folder containing the dataset files as input.");
            return;
        }

        if (outputPath == null || !outputPath.isDirectory()) {
            printError(
                    "You should set an output folder for the training set files.");
            // there is nowhere to write the generated files
            return;
        }

        setupDistiller();

        if (distiller == null) {
            // setupDistiller() leaves the instance null when the requested
            // configuration is not recognized
            return;
        }

        GenericDataset kpDataset;

        switch (dataset) {
            case "semeval":
                kpDataset = new SemEval2010(inputPath.getAbsolutePath());
                break;
            default:
                kpDataset = null;
        }

        if (kpDataset == null) {
            throw new UnsupportedOperationException(
                    "Unknown dataset:" + dataset);
        }

        KeyphraseTrainingSetGenerator trainingGenerator
                = new KeyphraseTrainingSetGenerator(kpDataset);

        String outputPrefix = outputPath.getAbsolutePath()
                + FileSystem.getSeparator()
                + dataset;

        // training documents
        IOBlackboard.setDocumentsFolder(kpDataset.getTrainingFolder());
        String trainingFile = outputPrefix + ".training.txt";
        writeMergedSheet(
                trainingGenerator.generateTrainingSet(distiller), trainingFile);
        System.out.println(
                "Saved training file in " + trainingFile);

        // test documents
        IOBlackboard.setDocumentsFolder(kpDataset.getTestFolder());
        String testFile = outputPrefix + ".test.txt";
        writeMergedSheet(
                trainingGenerator.generateTestSet(distiller), testFile);
        System.out.println(
                "Saved test file in " + testFile);
    }

    /**
     * Collects the sheets of all the documents into a single CSV sheet and
     * writes it to the given file.
     *
     * @param documents the pairs whose right element is the sheet of a
     * document.
     * @param filePath the path of the file to write.
     */
    private static void writeMergedSheet(
            List<Pair<String, GenericSheetPrinter>> documents,
            String filePath) {

        GenericSheetPrinter merged
                = new CsvPrinter(CsvPrinter.DEFAULT_DELIMITER, true, true);

        for (Pair<String, GenericSheetPrinter> document : documents) {
            merged.addPrinter(document.getRight());
        }

        merged.writeFile(filePath);
    }

    /**
     * Distill the content of a file.
     *
     * <p>The pipeline is configured, the document is registered as the current
     * one and loaded, the output prefix is set to the output folder followed
     * by the name of the file, and the text is then distilled. Nothing is
     * analyzed when no pipeline could be configured.</p>
     *
     * @param filePath the file to analyze.
     *
     * @throws IOException if there's an error reading the file.
     */
    private static void analyzeFile(File filePath) throws IOException {

        setupDistiller();

        if (distiller == null) {
            // setupDistiller() leaves the instance null when the requested
            // configuration is not recognized; calling distill() on it would
            // fail with a NullPointerException
            return;
        }

        String fileName = filePath.toPath().getFileName().toString();

        IOBlackboard.setCurrentDocument(filePath.getAbsolutePath());
        String document = loadDocument(filePath);

        IOBlackboard.setOutputPathPrefix(outputPath.getAbsolutePath()
                + FileSystem.getSeparator()
                + fileName);

        distiller.distill(document);

    }

    /**
     * Reads the whole document as text, joining its lines with a single
     * space. The file is decoded with the first charset that accepts its
     * content; the charsets are tried in this order:
     * <ul>
     * <li>UTF-8;</li>
     * <li>UTF-16;</li>
     * <li>US-ASCII.</li>
     * </ul>
     *
     * @param filePath the path of the document
     * @return the text of the document
     * @throws IOException if the file cannot be read, or if none of the
     * charsets can decode it (in that case the failure of the last attempt
     * is thrown)
     */
    private static String loadDocument(File filePath) throws IOException {

        final java.nio.charset.Charset[] candidates = {
            StandardCharsets.UTF_8,
            StandardCharsets.UTF_16,
            StandardCharsets.US_ASCII
        };

        // remembers why the most recent attempt was rejected
        java.nio.charset.MalformedInputException lastFailure = null;

        for (java.nio.charset.Charset candidate : candidates) {
            try {
                List<String> lines
                        = Files.readAllLines(filePath.toPath(), candidate);
                return String.join(" ", lines);
            } catch (java.nio.charset.MalformedInputException rejected) {
                // not this charset: move on to the next candidate
                lastFailure = rejected;
            }
        }

        // no charset has been recognized
        throw lastFailure;
    }

    /**
     * Distill the content of a directory.
     *
     * <p>Only the regular files directly contained in the directory are
     * analyzed; sub-directories are skipped, since the analysis is not
     * recursive.</p>
     *
     * @param inputPath the directory to analyze.
     *
     * @throws IOException if the directory cannot be listed or if there's an
     * error reading one of its files.
     */
    private static void analyzeDir(File inputPath) throws IOException {

        IOBlackboard.setDocumentsFolder(inputPath.getAbsolutePath());

        // listFiles() returns null when the path is not a directory or when
        // an I/O error occurs; iterating over it would fail with a
        // NullPointerException
        File[] entries = inputPath.listFiles();
        if (entries == null) {
            throw new IOException("Cannot list the content of "
                    + inputPath.getAbsolutePath());
        }

        for (File entry : entries) {

            if (!entry.isFile()) {
                // a sub-directory cannot be read as a document
                continue;
            }

            System.out.println("Analyzing " + entry.getAbsolutePath() + "...");
            analyzeFile(entry);

        }

    }

    /**
     * Configures the shared Distiller instance.
     *
     * <p>The pipeline is loaded from the custom configuration file when
     * neither a default nor a packaged configuration has been selected, from
     * the packaged XML when a packaged configuration has been selected, and
     * from the hard-coded pipelines otherwise. When the name of the default
     * configuration is not recognized an error is printed and the shared
     * instance is left {@code null}.</p>
     */
    private static void setupDistiller() {
        distiller = null;

        if (defaultConfig == null && packagedConfig == null) {
            distiller = DistillerFactory.loadFromXML(configPath);
        } else if (defaultConfig == null) {
            // NOTE(review): the path is built with the file system separator;
            // confirm that loadFromPackagedXML accepts it on every platform
            distiller = DistillerFactory.loadFromPackagedXML(
                    "pipelines"
                    + FileSystem.getSeparator()
                    + packagedConfig
                    + ".xml");
        } else if (defaultConfig.equals("simpleKE")) {
            distiller = DistillerFactory.getDefaultCode();
        } else if (defaultConfig.equals("stanfordKE")) {
            // this configuration uses the Stanford CoreNLP parser and adds
            // linguistic features to the distiller
            distiller = DistillerFactory.getStanfordCode();
        } // add other default pipelines HERE
        // please remember to document the new pipeline in the help message
        // that is printed below
        else {

            System.out.println("Unrecognized configuration. Supported parameters:");
            System.out.println("- simpleKE : simple, offline keyphrase extraction");
            System.out.println("- stanfordKE : keyphrase extraction with "
                    + "linguistic features (Stanford CoreNLP)");
            System.out.println();

            printError("Please select a valid configuration.");
            return;
        }

        if (language != null) {
            distiller.setLocale(language);
        }

        distiller.setVerbose(verbose);
    }
}