/* * Copyright (C) 2015 Artificial Intelligence * Laboratory @ University of Udine. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ package it.uniud.ailab.dcore.persistence; import com.fasterxml.jackson.annotation.JsonIgnore; import it.uniud.ailab.dcore.annotation.Annotable; import it.uniud.ailab.dcore.utils.ListUtils; import java.util.ArrayList; import java.util.List; /** * A generic n-gram, a simple list of n words. * * @author Marco Basaldella * @author Giorgia Chiaradia */ //@JsonIgnoreProperties({"surfaces,tokenLists,appaerances"}) public abstract class Gram extends Annotable { /** * The type of n-gram:it can be a concept, a keyphrase, a mention... */ private final String type; /** * The different list of words forming the surface of the gram. */ private List<List<Token>> tokenLists; /** * The different string representation of the gram. */ private final List<String> surfaces; /** * The concept Units in which the gram appears. */ private List<DocumentComponent> appareances; /** * The identifier for a GRAM object. */ public static final String GRAM = "GRAM"; /** * Instantiates an n-gram. Usually, the surface should be simply the the * concatenation of the text of the tokens. The signature can be used for * comparison, so be sure to generate different signatures for n-grams * that are different in your domain. For example, you can use the sequence * of the stems of the tokens, so that n-grams with the same stemmed form * are considered equal. * * * @param sequence the tokens that form the gram * @param identifier unique identifier of the gram. * @param surface the pretty-printed string representation of the gram * @param type the type of gram that will be generated. */ public Gram(String identifier, List<Token> sequence, String surface, String type) { super(identifier); this.type = type; tokenLists = new ArrayList<>(); tokenLists.add(sequence); surfaces = new ArrayList<>(); surfaces.add(surface); } /** * Adds a surface to the n-gram. Duplicates are permitted. * * @param surface the surface to add * @param tokens the tokens that form the surface */ public void addSurface(String surface,List<Token> tokens) { surfaces.add(surface); tokenLists.add(tokens); } /** * Adds a group of surfaces to the n-gram. Duplicates are permitted. * * @param surfaces the surface to add * @param tokenLists the tokens that form the surface */ public void addSurfaces(List<String> surfaces,List<List<Token>> tokenLists) { if (surfaces.size() != tokenLists.size()) throw new IllegalArgumentException( "Mismatching size of surfaces and token lists."); this.surfaces.addAll(surfaces); // note: do not use addAll. The references are lost if you don't copy for (List<Token> t : tokenLists) { this.tokenLists.add(new ArrayList<Token>(t)); } } /** * Get the type of the Gram that depends on the type of Gram implementation. * * @return the type of gram. */ public String getType(){ return type; } /** * The tokens that form the most common surface of the gram. * * @return the tokens of the surface of the gram. */ public List<Token> getTokens() { return tokenLists.get(surfaces.indexOf(ListUtils.mostCommon(surfaces))); } /** * Returns all the possible lists of tokens that form the gram. * * @return all the possible lists of tokens that form the gram. */ @JsonIgnore public List<List<Token>> getTokenLists() { return tokenLists; } /** * The human-readable form of the gram. This is the most common surface * between all the surfaces associated with the gram; if there are more than * one, the first one that has been added to the gram is selected. * * @return the human-readable form of the gram. */ public String getSurface() { return ListUtils.mostCommon(surfaces); } /** * Returns all the surfaces of the gram. Note: may contain * duplicates. * * @return all the surfaces of the gram. */ @JsonIgnore public List<String> getSurfaces() { return surfaces; } /** * Adds an appearance of the gram; in other words, adds the component in * which the gram appears to the list of the appearances. * * @param component the component in which the gram appears */ public void addAppaerance(DocumentComponent component) { appareances.add(component); } /** * Gets all the components in which the gram appears. * * @return all the components in which the gram appears. */ @JsonIgnore public List<DocumentComponent> getAppaerances() { return appareances; } /** * The identifier of the gram. Please note that it is possible that two * grams with different surface or tokens may have the same identifier, * based on the policy of the class that generated the gram. * * For example, "italian" and "Italy" may have the same identifier, because * the identifier has been generated using the same stem "ital". Otherwise, * the identifier may be the same link on an external ontology: in this * case, both words may have been associated with the entity "Italy". * * * @return the signature of the gram. */ @Override public String getIdentifier() { return super.getIdentifier(); } }