/*
* Copyright (C) 2015 Artificial Intelligence
* Laboratory @ University of Udine.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
package it.uniud.ailab.dcore;

import it.uniud.ailab.dcore.DistilledOutput.DetectedGram;
import it.uniud.ailab.dcore.DistilledOutput.InferredConcept;
import it.uniud.ailab.dcore.annotation.Annotator;
import it.uniud.ailab.dcore.annotation.annotations.InferenceAnnotation;
import it.uniud.ailab.dcore.annotation.annotations.UriAnnotation;
import it.uniud.ailab.dcore.annotation.annotators.GenericEvaluatorAnnotator;
import it.uniud.ailab.dcore.annotation.annotators.WikipediaInferenceAnnotator;
import it.uniud.ailab.dcore.persistence.Gram;
import it.uniud.ailab.dcore.persistence.Keyphrase;
import static it.uniud.ailab.dcore.annotation.annotators.GenericWikipediaAnnotator.WIKIURI;
import static it.uniud.ailab.dcore.utils.StageUtils.getStageName;

import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.stream.Collectors;

import org.springframework.beans.factory.annotation.Required;

/**
* The information extractor object. This is the class that runs the different
* annotation pipelines.
*
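* <p>A minimal usage sketch. In practice the language detector and the
* pipelines are usually injected via Spring; the {@code myDetector} and
* {@code englishPipeline} instances below are placeholders:</p>
* <pre>{@code
* Distiller distiller = new Distiller();
* distiller.setLanguageDetector(myDetector);
* distiller.addPipeline(Locale.ENGLISH, englishPipeline);
* distiller.setVerbose(true);
* DistilledOutput output = distiller.distill("The text to analyze...");
* }</pre>
*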
* @author Marco Basaldella
* @author Dario De Nart
*/
public class Distiller {
// All of these fields will be injected via setter methods.
/**
* The first step of the actual pipeline: this annotator should decide in
* which language the document is written.
*/
private Annotator languageDetector;
/**
* The annotation pipelines. There should be one pipeline per language.
*/
private Map<Locale, Pipeline> pipelines = new HashMap<>();
/**
* The language of the document.
*/
private Locale documentLocale = null;
/**
* The object that will contain the text and all its annotations.
*/
private Blackboard blackboard;
/**
* The verbose mode flag. If the flag is set to true, the Distiller will
* print information about the work it is doing on stdout. Default is false.
*/
private boolean verbose = false;
/**
* Sets the language detector.
*
* @param languageDetector the language detector.
*/
@Required
public void setLanguageDetector(Annotator languageDetector) {
this.languageDetector = languageDetector;
}
/**
* Sets the annotation pipelines.
*
* @param pipelines the annotation pipelines.
*/
@Required
public void setPipelines(HashMap<Locale, Pipeline> pipelines) {
this.pipelines = pipelines;
}
/**
* Adds a pipeline to the pipelines map.
*
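* <p>For example (the {@code italianPipeline} instance is a placeholder):</p>
* <pre>{@code
* distiller.addPipeline(Locale.ITALIAN, italianPipeline);
* }</pre>
*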
* @param locale the language that the pipeline will process
* @param pipeline the pipeline to add
*/
public void addPipeline(Locale locale, Pipeline pipeline) {
pipelines.put(locale, pipeline);
}
/**
* Sets the locale in which the text extraction will be performed. Pass null
* to auto-detect the language of the document, or a locale built from an
* IETF language tag to set the language manually. For example, a locale
* built from the tag "en-US" sets the document language to English. A null
* locale is equivalent to automatic detection.
*
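* <p>For example:</p>
* <pre>{@code
* distiller.setLocale(Locale.forLanguageTag("en-US")); // force English
* distiller.setLocale(null);                           // auto-detect the language
* }</pre>
*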
* @param locale the locale to use while processing the text.
*/
public void setLocale(Locale locale) {
this.documentLocale = locale;
}
/**
* Gets the blackboard.
*
* @return the blackboard.
*/
public Blackboard getBlackboard() {
return blackboard;
}
/**
* Sets the verbose mode of the Distiller.
*
* @param verbose true to display information about the distillation process;
* false for silent distillation.
*/
public void setVerbose(boolean verbose) {
this.verbose = verbose;
}
/**
* Gets the verbose mode of the Distiller.
*
* @return true if the Distiller is in verbose mode.
*/
public boolean getVerbose() {
return verbose;
}
/**
* Performs keyphrase extraction on the specified string and returns the
* blackboard filled with the document and its annotations.
*
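* <p>A usage sketch, reading the results back through the blackboard
* accessors that this class itself relies on:</p>
* <pre>{@code
* Blackboard b = distiller.distillToBlackboard(text);
* Locale detectedLanguage = b.getStructure().getLanguage();
* Collection<Gram> keyphrases = b.getKeyphrases();
* }</pre>
*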
* @param text the text to distill.
* @return the blackboard filled with the processed text
*/
public Blackboard distillToBlackboard(String text) {
blackboard = new Blackboard();
blackboard.createDocument(text);
if (documentLocale == null) {
    // If no language has been set, detect it automatically.
    if (languageDetector != null) {
        languageDetector.annotate(blackboard, blackboard.getStructure());
    } else {
        // No language specified and no language detector available:
        // there is nothing more we can do.
        throw new DistillerException(
                "I can't decide the language of the document: no language is specified and no language detector is set.");
    }
} else {
    // Use the pre-determined language.
    blackboard.getStructure().setLanguage(documentLocale);
}
Pipeline pipeline = pipelines.get(blackboard.getStructure().getLanguage());
if (pipeline == null) {
throw new DistillerException("No pipeline for the language "
+ blackboard.getStructure().getLanguage().getLanguage());
}
for (Stage stage : pipeline.getStages()) {
if (verbose) {
System.out.println(String.format("Running %s...",
getStageName(stage)));
}
stage.run(blackboard);
}
if (verbose) {
System.out.println("Extraction complete!");
System.out.println();
}
return blackboard;
}
/**
* Performs keyphrase extraction on the specified string and returns a
* developer-friendly object that allows quick access to the extracted
* information.
*
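* <p>A usage sketch; the getters on {@link DetectedGram} are assumed to
* mirror the setters used in this method (e.g. {@code getSurface()}):</p>
* <pre>{@code
* DistilledOutput out = distiller.distill(text);
* for (DistilledOutput.DetectedGram gram : out.getGrams()) {
*     System.out.println(gram.getSurface() + "\t" + gram.getKeyphraseness());
* }
* }</pre>
*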
* @param text the text to distill.
* @return the distilled output
*/
public DistilledOutput distill(String text) {
DistilledOutput output = new DistilledOutput();
output.setOriginalText(text);
distillToBlackboard(text);
output.setDetectedLanguage(blackboard.getStructure().
getLanguage().getLanguage());
// Copy the grams, sorted by descending score
output.initializeGrams(blackboard.getKeyphrases().size());
Collection<Gram> grams = blackboard.getKeyphrases();
Map<Keyphrase, Double> scoredGrams = new HashMap<>();
for (Gram g : grams) {
Keyphrase k = (Keyphrase)g;
scoredGrams.put(k, k.getFeature(GenericEvaluatorAnnotator.SCORE));
}
List<Map.Entry<Keyphrase, Double>> sortedGrams
= scoredGrams.entrySet().stream().sorted(
Collections.reverseOrder(Map.Entry.comparingByValue()))
.collect(Collectors.toList());
for (int i = 0; i < output.getGrams().length; i++) {
DetectedGram gram = output.getGrams()[i];
Keyphrase originalGram = sortedGrams.get(i).getKey();
gram.setSurface(originalGram.getSurface());
gram.setKeyphraseness(originalGram.getFeature(
        GenericEvaluatorAnnotator.SCORE));
UriAnnotation wikiAnn = (UriAnnotation) originalGram.getAnnotation(WIKIURI);
if (wikiAnn != null) {
gram.setConceptName(wikiAnn.getSurface());
gram.setConceptPath(wikiAnn.getUri().toASCIIString());
}
}
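// Copy the related concepts inferred from Wikipedia (see WikipediaInferenceAnnotator.RELATED).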
output.initializeRelatedConcepts(blackboard.getAnnotations(
WikipediaInferenceAnnotator.RELATED).size());
for (int i = 0; i < output.getRelatedConcepts().length; i++) {
InferredConcept related = output.getRelatedConcepts()[i];
InferenceAnnotation originalRelatedConcept
= (InferenceAnnotation) blackboard.getAnnotations(
WikipediaInferenceAnnotator.RELATED).get(i);
related.setConcept(originalRelatedConcept.getConcept());
related.setConceptPath(originalRelatedConcept.getUri().toASCIIString());
related.setScore(originalRelatedConcept.getScore());
}
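// Copy the inferred hypernyms (see WikipediaInferenceAnnotator.HYPERNYMS).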
output.initializeHypernyms(blackboard.getAnnotations(
WikipediaInferenceAnnotator.HYPERNYMS).size());
for (int i = 0; i < output.getHypernyms().length; i++) {
InferredConcept hypernym = output.getHypernyms()[i];
InferenceAnnotation originalHypernym = (InferenceAnnotation) blackboard.getAnnotations(
WikipediaInferenceAnnotator.HYPERNYMS).get(i);
hypernym.setConcept(originalHypernym.getConcept());
hypernym.setConceptPath(originalHypernym.getUri().toASCIIString());
hypernym.setScore(originalHypernym.getScore());
}
output.setExtractionCompleted(true);
return output;
}
}