Gram.java example

Explorer

distiller-CORE-master
- src
  - main
    - java
      - it
        uniud
        ailab
        dcore
        Blackboard.java
        DistilledOutput.java
        Distiller.java
        DistillerException.java
        DistillerFactory.java
        Pipeline.java
        Stage.java
        annotation
        Annotable.java
        Annotation.java
        AnnotationException.java
        Annotator.java
        DefaultAnnotations.java
        annotations
        CoreferenceChainAnnotation.java
        FeatureAnnotation.java
        InferenceAnnotation.java
        NERAnnotation.java
        ScoredAnnotation.java
        TextAnnotation.java
        UriAnnotation.java
        annotators
        ChunkingNerAnnotator.java
        CoreferenceResolverAnnotator.java
        DocumentPhraseMaximalityAnnotator.java
        GenericEvaluatorAnnotator.java
        GenericNGramGeneratorAnnotator.java
        GenericWikipediaAnnotator.java
        GramMergerAnnotator.java
        ItalianLemmatizerAnnotator.java
        LinearEvaluatorAnnotator.java
        PorterStemmerAnnotator.java
        RawTdidfAnnotator.java
        RegexNGramGeneratorAnnotator.java
        SimpleAnnotationFilterAnnotator.java
        SimpleCutFilterAnnotator.java
        SimpleNGramGeneratorAnnotator.java
        SkylineGramFilterAnnotator.java
        StatisticalAnnotator.java
        StopwordSimpleFilterAnnotator.java
        SyuzhetAnnotator.java
        TagMeGramAnnotator.java
        TagMeTokenAnnotator.java
        WikipediaInferenceAnnotator.java
        eval
        Evaluator.java
        GenericDataset.java
        TrainingSetGenerator.java
        datasets
        SemEval2010.java
        kp
        KeyphraseEvaluator15.java
        KeyphraseEvaluatorAll.java
        training
        KeyphraseTrainingSetGenerator.java
        io
        CsvPrinter.java
        FileWriterStage.java
        GenericSheetPrinter.java
        GramPrinter.java
        IOBlackboard.java
        SentencePrinter.java
        TokenPrinter.java
        launchers
        Launcher.java
        SampleInference.java
        SimpleKE.java
        StanfordKE.java
        persistence
        DocumentComponent.java
        DocumentComposite.java
        Gram.java
        Keyphrase.java
        Mention.java
        Sentence.java
        Token.java
        utils
        BlackboardUtils.java
        DocumentUtils.java
        Either.java
        FileSystem.java
        GramUtils.java
        ListUtils.java
        Pair.java
        SnowballStemmerSelector.java
        StageUtils.java
        WikipediaUtils.java
        wrappers
        external
        CybozuLanguageDetectorAnnotator.java
        OpenNlpBootstrapperAnnotator.java
        RCallerEvaluator.java
        StanfordBootstrapperAnnotator.java
        StanfordFastBootstrapperAnnotator.java
  - test
    - java
      - test.java

/*
 * Copyright (C) 2015 Artificial Intelligence
 * Laboratory @ University of Udine.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
package it.uniud.ailab.dcore.persistence;

import com.fasterxml.jackson.annotation.JsonIgnore;
import it.uniud.ailab.dcore.annotation.Annotable;
import it.uniud.ailab.dcore.utils.ListUtils;
import java.util.ArrayList;
import java.util.List;

/**
 * A generic n-gram, a simple list of n words.
 *
 * <p>A gram keeps two parallel lists: the surfaces (string forms) under which
 * it has been observed and, at the same index, the tokens that form each
 * surface. It also records the document components in which it appears.</p>
 *
 * @author Marco Basaldella
 * @author Giorgia Chiaradia
 */
public abstract class Gram extends Annotable {

    /**
     * The identifier for a GRAM object.
     */
    public static final String GRAM = "GRAM";

    /**
     * The type of n-gram: it can be a concept, a keyphrase, a mention...
     */
    private final String type;

    /**
     * The different lists of words forming the surfaces of the gram.
     * Entry {@code i} holds the tokens of {@code surfaces.get(i)}.
     */
    private final List<List<Token>> tokenLists;

    /**
     * The different string representations of the gram.
     */
    private final List<String> surfaces;

    /**
     * The document components in which the gram appears.
     */
    private final List<DocumentComponent> appearances;

    /**
     * Instantiates an n-gram. Usually, the surface should be simply the
     * concatenation of the text of the tokens. The identifier can be used for
     * comparison, so be sure to generate different identifiers for n-grams
     * that are different in your domain. For example, you can use the sequence
     * of the stems of the tokens, so that n-grams with the same stemmed form
     * are considered equal.
     *
     * @param identifier unique identifier of the gram.
     * @param sequence the tokens that form the gram
     * @param surface the pretty-printed string representation of the gram
     * @param type the type of gram that will be generated.
     */
    public Gram(String identifier, List<Token> sequence, String surface,
            String type) {

        super(identifier);

        this.type = type;

        tokenLists = new ArrayList<>();
        tokenLists.add(sequence);

        surfaces = new ArrayList<>();
        surfaces.add(surface);

        // The appearance list must exist before addAppaerance is called;
        // leaving it unassigned made every call fail with a
        // NullPointerException and made getAppaerances return null.
        appearances = new ArrayList<>();
    }

    /**
     * Adds a surface to the n-gram. Duplicates are permitted.
     *
     * @param surface the surface to add
     * @param tokens the tokens that form the surface
     */
    public void addSurface(String surface, List<Token> tokens) {
        // Both lists grow together so that indices stay aligned.
        surfaces.add(surface);
        tokenLists.add(tokens);
    }

    /**
     * Adds a group of surfaces to the n-gram. Duplicates are permitted.
     * Each token list is copied before being stored.
     *
     * @param surfaces the surfaces to add
     * @param tokenLists the tokens that form each surface, in the same order
     * @throws IllegalArgumentException if the two lists have different sizes
     */
    public void addSurfaces(List<String> surfaces, List<List<Token>> tokenLists) {

        // Validate before mutating, so a mismatch leaves the gram untouched.
        if (surfaces.size() != tokenLists.size()) {
            throw new IllegalArgumentException(
                "Mismatching size of surfaces and token lists.");
        }

        this.surfaces.addAll(surfaces);

        // note: do not use addAll. Every inner list is copied so that this
        // gram owns its own list instances.
        for (List<Token> tokens : tokenLists) {
            this.tokenLists.add(new ArrayList<>(tokens));
        }
    }

    /**
     * Get the type of the Gram that depends on the type of Gram implementation.
     *
     * @return the type of gram.
     */
    public String getType() {
        return type;
    }

    /**
     * The tokens that form the most common surface of the gram, i.e. the token
     * list stored at the first position where the surface selected by
     * {@link ListUtils#mostCommon} occurs.
     *
     * @return the tokens of the surface of the gram.
     */
    public List<Token> getTokens() {
        String mostCommonSurface = ListUtils.mostCommon(surfaces);
        return tokenLists.get(surfaces.indexOf(mostCommonSurface));
    }

    /**
     * Returns all the possible lists of tokens that form the gram.
     * The returned list is the internal one, not a copy.
     *
     * @return all the possible lists of tokens that form the gram.
     */
    @JsonIgnore
    public List<List<Token>> getTokenLists() {
        return tokenLists;
    }

    /**
     * The human-readable form of the gram. This is the most common surface
     * between all the surfaces associated with the gram, as selected by
     * {@link ListUtils#mostCommon}. The intended tie-breaking rule is that the
     * first surface added to the gram wins; NOTE(review): this depends on the
     * implementation of {@code ListUtils.mostCommon} - confirm.
     *
     * @return the human-readable form of the gram.
     */
    public String getSurface() {
        return ListUtils.mostCommon(surfaces);
    }

    /**
     * Returns all the surfaces of the gram. Note: may contain
     * duplicates. The returned list is the internal one, not a copy.
     *
     * @return all the surfaces of the gram.
     */
    @JsonIgnore
    public List<String> getSurfaces() {
        return surfaces;
    }

    /**
     * Adds an appearance of the gram; in other words, adds the component in
     * which the gram appears to the list of the appearances.
     *
     * @param component the component in which the gram appears
     */
    public void addAppaerance(DocumentComponent component) {
        appearances.add(component);
    }

    /**
     * Gets all the components in which the gram appears. The returned list is
     * the internal one, not a copy; it is empty if no appearance has been
     * added yet.
     *
     * @return all the components in which the gram appears.
     */
    @JsonIgnore
    public List<DocumentComponent> getAppaerances() {
        return appearances;
    }

    /**
     * The identifier of the gram. Please note that it is possible that two
     * grams with different surface or tokens may have the same identifier,
     * based on the policy of the class that generated the gram.
     *
     * For example, "italian" and "Italy" may have the same identifier, because
     * the identifier has been generated using the same stem "ital". Otherwise,
     * the identifier may be the same link on an external ontology: in this
     * case, both words may have been associated with the entity "Italy".
     *
     * @return the identifier of the gram.
     */
    @Override
    public String getIdentifier() {
        return super.getIdentifier();
    }

}