/* * Copyright (C) 2015 Artificial Intelligence * Laboratory @ University of Udine. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ package it.uniud.ailab.dcore; import com.fasterxml.jackson.annotation.JsonIgnore; import com.rits.cloning.Cloner; import java.util.HashMap; import java.util.Map; import it.uniud.ailab.dcore.persistence.DocumentComponent; import it.uniud.ailab.dcore.persistence.DocumentComposite; import it.uniud.ailab.dcore.annotation.Annotation; import it.uniud.ailab.dcore.persistence.Gram; import it.uniud.ailab.dcore.persistence.Keyphrase; import it.uniud.ailab.dcore.persistence.Sentence; import it.uniud.ailab.dcore.utils.DocumentUtils; import java.util.ArrayList; import java.util.Collection; import java.util.List; import java.util.stream.Collectors; /** * The BlackBoard that holds the document and all its annotations. In every part * of the extraction pipeline, every annotator will receive a piece of the * document contained in the blackboard and will write new information on it. * * The Blackboard has two main parts: one is the document root, which allows to * navigate the entire document; the other is the gram container, which contains * all the grams found in the document and their locations. * * @author Marco Basaldella * @author Dario De Nart */ public class Blackboard { /** * The default document identifier. */ private static final String DEFAULT_DOCUMENT_ID = "DocumentRoot"; /** * The full raw text of the document. */ private String rawText; /** * The root block of the document. */ private DocumentComponent document; /** * Container for the n-grams of the document. Every n-gram is part of a * specific list identifying the type of n-gram. The types of n-grams are * the key for searching in the main n-gram list. */ private Map<String, Map<String, Gram>> generalNGramsContainer; /** * Document-wide annotations. This space can be used to add annotations that * belong to the whole document, as for example extracted concepts, or * tagset used by a POS-tagger, or the overall sentiment. */ private List<Annotation> annotations; /** * Instantiates an empty blackboard. */ public Blackboard() { createDocument(""); } /** * Initializes the blackboard with a new document. This will destroy any * information previously held by the blackboard. * * @param rawText the text of the new document. * @param documentId the output-friendly identifier of the document */ public final void createDocument(String rawText, String documentId) { this.rawText = rawText; this.document = new DocumentComposite(rawText, documentId); this.generalNGramsContainer = new HashMap<>(); this.annotations = new ArrayList<>(); } /** * Initializes the blackboard with a new document. This will destroy any * information previously held by the blackboard. * * @param rawText the text of the new document. */ public final void createDocument(String rawText) { this.rawText = rawText; this.document = new DocumentComposite(rawText, DEFAULT_DOCUMENT_ID); this.generalNGramsContainer = new HashMap<>(); this.annotations = new ArrayList<>(); } /** * Gets the root of the document. * * @return the {@link it.uniud.ailab.dcore.persistence.DocumentComponent} * root object. */ @JsonIgnore public DocumentComponent getStructure() { return document; } /** * Gets the raw text (i.e. unprocessed) of the document. * * @return the original document string. */ public String getText() { return rawText; } /** * Adds a Gram in the Gram Container, merging grams with the same * identifier. If the gram is already present, the method updates it adding * the new occurrence. Annotations of the new gram are <b>not</b> merged * into the old gram. This is because it's good practice to annotate grams * only when they've <b>all</b> been added into the blackboard. * * @param unit the concept unit where the gram appears * @param newGram the gram to add */ public void addGram(DocumentComponent unit, Gram newGram) { Map<String, Gram> grams = generalNGramsContainer.get(newGram.getType()); if (grams == null) { grams = new HashMap<>(); } Gram gram = grams.get(newGram.getIdentifier()); // Deep clone the object instead of referencing the found one. // this way, we're free to modify it by adding annotations without // modifying the old object. if (gram == null) { Gram cloned = (new Cloner()).deepClone(newGram); grams.put(cloned.getIdentifier(), cloned); gram = cloned; } else { // add the surface of the new gram gram.addSurfaces(newGram.getSurfaces(), newGram.getTokenLists()); } gram.addAppaerance(unit); unit.addGram(gram); generalNGramsContainer.put(newGram.getType(), grams); } public void addGram(Gram newGram) { Map<String, Gram> grams = generalNGramsContainer.get(newGram.getType()); if (grams == null) { grams = new HashMap<>(); } Gram gram = grams.get(newGram.getIdentifier()); // Deep clone the object instead of referencing the found one. // this way, we're free to modify it by adding annotations without // modifying the old object. if (gram == null) { Gram cloned = (new Cloner()).deepClone(newGram); grams.put(cloned.getIdentifier(), cloned); } else { // add the surface of the new gram gram.addSurfaces(newGram.getSurfaces(), newGram.getTokenLists()); } generalNGramsContainer.put(newGram.getType(), grams); } /** * Get the all the different kind of grams found in the document. This * grams are divided by type, stored in a Map using their identifier as * key. * * @return all the maps found in the document. */ public Map<String,Map<String,Gram>> getGrams() { return generalNGramsContainer; } /** * Get all the grams of a given type found in the blackboard. * * @param <T> the type of the grams to achieve * @param gramType the identifier of the gram type * @return a collection with all the grams with match type and identifier, * null if there is no match. */ public <T> Collection<T> getGramsByType(String gramType) { return generalNGramsContainer.containsKey(gramType) ? (Collection<T>)generalNGramsContainer.get(gramType).values() : null; } /** * Retrieves the keyphrases found in the document. * * @return a collection of * {@link it.uniud.ailab.dcore.persistence.Keyphrase}s. */ @Deprecated @JsonIgnore public List<Gram> getKeyphrases() { Map<String, Gram> kps = generalNGramsContainer.get(Keyphrase.KEYPHRASE); return kps != null ? new ArrayList(kps.values()) : new ArrayList(); } /** * Removes a keyphrase from the document because it's no more relevant, or * useful, or for whatever reason an annotator thinks so. * * @param g the gram to remove. */ @Deprecated public void removeKeyphrase(Keyphrase g) { removeGram(Keyphrase.KEYPHRASE,g); } /** * Removes a gram from the document because it's no more relevant, or * useful, or for whatever reason an annotator thinks so. * * @param type the type of the gram to remove * @param g the gram to remove. */ public void removeGram(String type,Gram g) { generalNGramsContainer.get(type) .remove(g.getIdentifier()); for (Sentence s : DocumentUtils.getSentences(document)) { s.removeGram(g); } } /** * Adds an annotation in the blackboard. * * @param a the annotation to add */ public void addAnnotation(Annotation a) { annotations.add(a); } /** * Get all the annotations. * * @return the annotations stored in the blackboard */ public List<Annotation> getAnnotations() { return annotations; } /** * Gets the annotations produced by a specific annotator. * * @param annotator the annotator identifier * @return the annotations produced by the specified annotator */ public List<Annotation> getAnnotations(String annotator) { return annotations.stream().filter((a) -> (a.getAnnotator().equals(annotator))). collect(Collectors.toList()); } /** * Remove an annotation from the blackboard. * * @param ann the annotation to remove. */ public void removeAnnotation(Annotation ann) { annotations.remove(ann); } /** * Get the language of the document root. * * @return the language of the document root. */ public String getDocumentLanguage() { return getStructure().getLanguage().getLanguage(); } }