GenericDataset.java example

Explorer

distiller-CORE-master
- src
  - main
    - java
      - it
        uniud
        ailab
        dcore
        Blackboard.java
        DistilledOutput.java
        Distiller.java
        DistillerException.java
        DistillerFactory.java
        Pipeline.java
        Stage.java
        annotation
        Annotable.java
        Annotation.java
        AnnotationException.java
        Annotator.java
        DefaultAnnotations.java
        annotations
        CoreferenceChainAnnotation.java
        FeatureAnnotation.java
        InferenceAnnotation.java
        NERAnnotation.java
        ScoredAnnotation.java
        TextAnnotation.java
        UriAnnotation.java
        annotators
        ChunkingNerAnnotator.java
        CoreferenceResolverAnnotator.java
        DocumentPhraseMaximalityAnnotator.java
        GenericEvaluatorAnnotator.java
        GenericNGramGeneratorAnnotator.java
        GenericWikipediaAnnotator.java
        GramMergerAnnotator.java
        ItalianLemmatizerAnnotator.java
        LinearEvaluatorAnnotator.java
        PorterStemmerAnnotator.java
        RawTdidfAnnotator.java
        RegexNGramGeneratorAnnotator.java
        SimpleAnnotationFilterAnnotator.java
        SimpleCutFilterAnnotator.java
        SimpleNGramGeneratorAnnotator.java
        SkylineGramFilterAnnotator.java
        StatisticalAnnotator.java
        StopwordSimpleFilterAnnotator.java
        SyuzhetAnnotator.java
        TagMeGramAnnotator.java
        TagMeTokenAnnotator.java
        WikipediaInferenceAnnotator.java
        eval
        Evaluator.java
        GenericDataset.java
        TrainingSetGenerator.java
        datasets
        SemEval2010.java
        kp
        KeyphraseEvaluator15.java
        KeyphraseEvaluatorAll.java
        training
        KeyphraseTrainingSetGenerator.java
        io
        CsvPrinter.java
        FileWriterStage.java
        GenericSheetPrinter.java
        GramPrinter.java
        IOBlackboard.java
        SentencePrinter.java
        TokenPrinter.java
        launchers
        Launcher.java
        SampleInference.java
        SimpleKE.java
        StanfordKE.java
        persistence
        DocumentComponent.java
        DocumentComposite.java
        Gram.java
        Keyphrase.java
        Mention.java
        Sentence.java
        Token.java
        utils
        BlackboardUtils.java
        DocumentUtils.java
        Either.java
        FileSystem.java
        GramUtils.java
        ListUtils.java
        Pair.java
        SnowballStemmerSelector.java
        StageUtils.java
        WikipediaUtils.java
        wrappers
        external
        CybozuLanguageDetectorAnnotator.java
        OpenNlpBootstrapperAnnotator.java
        RCallerEvaluator.java
        StanfordBootstrapperAnnotator.java
        StanfordFastBootstrapperAnnotator.java
  - test
    - java
      - test.java

/*
 * Copyright (C) 2015 Artificial Intelligence
 * Laboratory @ University of Udine.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
package it.uniud.ailab.dcore.eval;

import java.util.Comparator;
import java.util.Map;

/**
 * A generic dataset loader. The concrete implementation is responsible for
 * loading the results into the appropriate structure and for providing a
 * Comparator implementation that follows the logic of the dataset.
 * 
 * For example, keyphrase evaluation may be performed both on "simple" keyphrases,
 * or on lemmatized keyphrases, or on stemmed keyphrases.
 * 
 * @author Marco Basaldella
 */
public abstract class GenericDataset implements Comparator<String> {

    /**
     * The path where the evaluator will find the input documents and the
     * gold standard results.
     */
    protected final String datasetPath;

    /**
     * The training documents. Each document has an identifier and a content.
     * Not assigned until {@link #load()} runs.
     */
    private Map<String, String> trainingDocuments;

    /**
     * The expected answers for the training documents. Each list of training
     * answers is paired to the identifier of the corresponding document.
     * Not assigned until {@link #load()} runs.
     */
    private Map<String, String[]> trainingAnswers;

    /**
     * The test documents. Each document has an identifier and a content.
     * Not assigned until {@link #load()} runs.
     */
    private Map<String, String> testDocuments;

    /**
     * The expected answers for the test documents. Each list of test answers
     * is paired to the identifier of the corresponding document.
     * Not assigned until {@link #load()} runs.
     */
    private Map<String, String[]> testAnswers;

    /**
     * A value that indicates whether the documents have already been loaded
     * or not.
     */
    private boolean isLoaded;

    /**
     * An output-friendly string that identifies the dataset.
     */
    private final String identifier;

    /**
     * Create a concrete dataset that will contain the data contained in the
     * specified path. The data itself is not read here; call {@link #load()}
     * to populate the dataset.
     *
     * @param datasetPath The folder where the Dataset will look for the
     * documents.
     * @param identifier An output-friendly string that identifies the dataset.
     */
    public GenericDataset(String datasetPath, String identifier) {
        this.datasetPath = datasetPath;
        this.isLoaded = false;
        this.identifier = identifier;
    }

    /**
     * Get an output-friendly string that identifies the dataset.
     *
     * @return an output-friendly string that identifies the dataset.
     */
    public String getIdentifier() {
        return identifier;
    }

    /**
     * Compares a <b>candidate</b> item with a <b>dataset provided</b> item.
     * Please note that the object to test <b>must</b> be passed as first
     * parameter, while the object to test against <b>must</b> be passed as
     * second parameter.
     *
     * @param o1 the object to test, generated by the Distiller
     * @param o2 the reference object, provided by the dataset.
     * @return 0 if o1 and o2 are equal, another (unspecified) number
     * otherwise.
     */
    @Override
    public abstract int compare(String o1, String o2);

    /**
     * Loads the input documents and returns them.
     *
     * @return the input documents mapped with their identifier.
     */
    protected abstract Map<String, String> loadTestSet();

    /**
     * Loads the test set answers and returns them.
     *
     * @return the test set answers mapped with the identifier of the
     * document they belong to.
     */
    protected abstract Map<String, String[]> loadTestAnswers();

    /**
     * Loads the training documents and returns them.
     *
     * @return the training documents mapped with their identifier.
     */
    protected abstract Map<String, String> loadTrainingSet();

    /**
     * Loads the training set expected answers and returns them.
     *
     * @return the training set answers mapped with the identifier of the
     * document they belong to.
     */
    protected abstract Map<String, String[]> loadTrainingAnswers();

    /**
     * Get the test set documents for the dataset. The returned map is the
     * one held by this object, not a copy.
     *
     * @return the test set for the dataset, or {@code null} if the dataset
     * has never been loaded successfully.
     */
    public Map<String, String> getTestSet() {
        return testDocuments;
    }

    /**
     * Get the test set results for the dataset. The returned map is the
     * one held by this object, not a copy.
     *
     * @return the expected results on the test set of the dataset, or
     * {@code null} if the dataset has never been loaded successfully.
     */
    public Map<String, String[]> getTestAnswers() {
        return testAnswers;
    }

    /**
     * Get the training set documents for the dataset. The returned map is
     * the one held by this object, not a copy.
     *
     * @return the training set for the dataset, or {@code null} if the
     * dataset has never been loaded successfully.
     */
    public Map<String, String> getTrainingSet() {
        return trainingDocuments;
    }

    /**
     * Get the training set results for the dataset. The returned map is
     * the one held by this object, not a copy.
     *
     * @return the expected results on the training set of the dataset, or
     * {@code null} if the dataset has never been loaded successfully.
     */
    public Map<String, String[]> getTrainingAnswers() {
        return trainingAnswers;
    }

    /**
     * Gets the dataset status: true if the data has already been loaded,
     * false otherwise.
     *
     * @return the dataset status.
     */
    public boolean isLoaded() {
        return isLoaded;
    }

    /**
     * Load the dataset.
     *
     * The loaders are invoked in this order: training documents, test
     * documents, training answers, test answers. Each result is stored as
     * soon as it is available, so a loader may read the data produced by the
     * loaders that ran before it.
     *
     * If any loader fails, the documents and answers held before this call
     * are restored and the loaded status is left unchanged, so the dataset
     * is never observed in a partially loaded state. The failure itself is
     * propagated to the caller.
     */
    public void load() {
        // Snapshot of the current state, used to roll back a failed load.
        final Map<String, String> previousTrainingDocuments = trainingDocuments;
        final Map<String, String> previousTestDocuments = testDocuments;
        final Map<String, String[]> previousTrainingAnswers = trainingAnswers;
        final Map<String, String[]> previousTestAnswers = testAnswers;

        boolean completed = false;
        try {
            trainingDocuments = loadTrainingSet();
            testDocuments = loadTestSet();
            trainingAnswers = loadTrainingAnswers();
            testAnswers = loadTestAnswers();

            isLoaded = true;
            completed = true;
        } finally {
            // A finally block (rather than a catch) rolls back on any kind
            // of failure without altering what is thrown to the caller.
            if (!completed) {
                trainingDocuments = previousTrainingDocuments;
                testDocuments = previousTestDocuments;
                trainingAnswers = previousTrainingAnswers;
                testAnswers = previousTestAnswers;
            }
        }
    }

    /**
     * Get the folder that contains the training set.
     *
     * @return the folder that contains the training set.
     */
    public abstract String getTrainingFolder();

    /**
     * Get the folder that contains the test set.
     *
     * @return the folder that contains the test set.
     */
    public abstract String getTestFolder();

}