// Document.java example
//
// Explorer
// CoreNLP-master
//
// StanfordCoreNLP -- a suite of NLP tools
// Copyright (c) 2009-2010 The Board of Trustees of
// The Leland Stanford Junior University. All Rights Reserved.
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//
// For more information, bug reports, fixes, contact:
//    Christopher Manning
//    Dept of Computer Science, Gates 1A
//    Stanford CA 94305-9010
//    USA
//

package edu.stanford.nlp.coref.data;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;

import edu.stanford.nlp.coref.docreader.CoNLLDocumentReader;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.IntTuple;
import edu.stanford.nlp.util.Pair;

public class Document implements Serializable {

  private static final long serialVersionUID = -4139866807494603953L;

  public enum DocType { CONVERSATION, ARTICLE }

  /** The type of document: conversational or article */
  public DocType docType;

  /** Document annotation */
  public Annotation annotation;

  /** for conll shared task 2011  */
  public CoNLLDocumentReader.CoNLLDocument conllDoc;

  /** The list of gold mentions */
  public List<List<Mention>> goldMentions;
  /** The list of predicted mentions */
  public List<List<Mention>> predictedMentions;

  /** return the list of predicted mentions */
  public List<List<Mention>> getOrderedMentions() {
    return predictedMentions;
  }

  /** Clusters for coreferent mentions */
  public Map<Integer, CorefCluster> corefClusters;

  /** Gold Clusters for coreferent mentions */
  public Map<Integer, CorefCluster> goldCorefClusters;

  /** All mentions in a document {@literal mentionID -> mention} */
  public Map<Integer, Mention> predictedMentionsByID;
  public Map<Integer, Mention> goldMentionsByID;

  /** Set of roles (in role apposition) in a document  */
  public Set<Mention> roleSet;

  /**
   * Position of each mention in the input matrix
   * Each mention occurrence with sentence # and position within sentence
   * (Nth mention, not Nth token)
   */
  public Map<Mention, IntTuple> positions;              // mentions may be removed from this due to post processing
  public Map<Mention, IntTuple> allPositions;           // all mentions (mentions will not be removed from this)

  public final Map<IntTuple, Mention> mentionheadPositions;

  /** List of gold links in a document by positions */
  private List<Pair<IntTuple,IntTuple>> goldLinks;

  /** UtteranceAnnotation {@literal ->} String (speaker): mention ID or speaker string
   *   e.g., the value can be "34" (mentionID), "Larry" (speaker string), or "PER3" (autoassigned speaker string)
   */
  public Map<Integer, String> speakers;

  /** Pair of mention id, and the mention's speaker id
   *  the second value is the "speaker mention"'s id.
   *  e.g., Larry said, "San Francisco is a city.": (id(Larry), id(San Francisco))
   */
  public Set<Pair<Integer, Integer>> speakerPairs;

  public boolean speakerInfoGiven;

  public int maxUtter;
  public int numParagraph;
  public int numSentences;

  /** Set of incompatible clusters pairs */
  private final Set<Pair<Integer, Integer>> incompatibles;
  private final Set<Pair<Integer, Integer>> incompatibleClusters;

  public Map<Pair<Integer, Integer>, Boolean> acronymCache;

  /** Map of speaker name/id to speaker info
   *  the key is the value of the variable 'speakers'
   */
  public Map<String, SpeakerInfo> speakerInfoMap = Generics.newHashMap();

  // public Counter<String> properNouns = new ClassicCounter<>();
  // public Counter<String> phraseCounter = new ClassicCounter<>();
  // public Counter<String> headwordCounter = new ClassicCounter<>();

  /** Additional information about the document. Can be used as features */
  public Map<String, String> docInfo;

  public Document() {
    positions = Generics.newHashMap();
    mentionheadPositions = Generics.newHashMap();
    roleSet = Generics.newHashSet();
    corefClusters = Generics.newHashMap();
    goldCorefClusters = null;
    predictedMentionsByID = Generics.newHashMap();
//    goldMentionsByID = Generics.newHashMap();
    speakers = Generics.newHashMap();
    speakerPairs = Generics.newHashSet();
    incompatibles = Generics.newHashSet();
    incompatibleClusters = Generics.newHashSet();
    acronymCache = Generics.newHashMap();
  }

  public Document(Annotation anno, List<List<Mention>> predictedMentions, List<List<Mention>> goldMentions) {
    this();
    annotation = anno;
    this.predictedMentions = predictedMentions;
    this.goldMentions = goldMentions;
  }

  public Document(InputDoc input, List<List<Mention>> mentions) {
    this();
    this.annotation = input.annotation;
    this.predictedMentions = mentions;
    this.goldMentions = input.goldMentions;
    this.docInfo = input.docInfo;
    this.numSentences = input.annotation.get(SentencesAnnotation.class).size();
    this.conllDoc = input.conllDoc;   // null if it's not conll input
  }

  public boolean isIncompatible(CorefCluster c1, CorefCluster c2) {
    // Was any of the pairs of mentions marked as incompatible
    int cid1 = Math.min(c1.clusterID, c2.clusterID);
    int cid2 = Math.max(c1.clusterID, c2.clusterID);
    return incompatibleClusters.contains(Pair.makePair(cid1,cid2));
  }

  // Update incompatibles for two clusters that are about to be merged
  public void mergeIncompatibles(CorefCluster to, CorefCluster from) {
    List<Pair<Pair<Integer,Integer>, Pair<Integer,Integer>>> replacements =
            new ArrayList<>();
    for (Pair<Integer, Integer> p:incompatibleClusters) {
      Integer other = null;
      if (p.first == from.clusterID) {
        other = p.second;
      } else if (p.second == from.clusterID) {
        other = p.first;
      }
      if (other != null && other != to.clusterID) {
        int cid1 = Math.min(other, to.clusterID);
        int cid2 = Math.max(other, to.clusterID);
        replacements.add(Pair.makePair(p, Pair.makePair(cid1, cid2)));
      }
    }
    for (Pair<Pair<Integer,Integer>, Pair<Integer,Integer>> r:replacements)  {
      incompatibleClusters.remove(r.first);
      incompatibleClusters.add(r.second);
    }
  }
  public void mergeAcronymCache(CorefCluster to, CorefCluster from) {
    Map<Pair<Integer, Integer>, Boolean> replacements = Generics.newHashMap();
    for(Pair<Integer, Integer> p : acronymCache.keySet()) {
      if(acronymCache.get(p)) {
        Integer other = null;
        if(p.first==from.clusterID){
          other = p.second;
        } else if(p.second==from.clusterID) {
          other = p.first;
        }
        if(other != null && other != to.clusterID) {
          int cid1 = Math.min(other, to.clusterID);
          int cid2 = Math.max(other, to.clusterID);
          replacements.put(Pair.makePair(cid1, cid2), true);
        }
      }
    }
    for(Pair<Integer, Integer> p : replacements.keySet()) {
      acronymCache.put(p, replacements.get(p));
    }
  }

  public boolean isIncompatible(Mention m1, Mention m2) {
    int mid1 = Math.min(m1.mentionID, m2.mentionID);
    int mid2 = Math.max(m1.mentionID, m2.mentionID);
    return incompatibles.contains(Pair.makePair(mid1,mid2));
  }

  public void addIncompatible(Mention m1, Mention m2) {
    int mid1 = Math.min(m1.mentionID, m2.mentionID);
    int mid2 = Math.max(m1.mentionID, m2.mentionID);
    incompatibles.add(Pair.makePair(mid1,mid2));
    int cid1 = Math.min(m1.corefClusterID, m2.corefClusterID);
    int cid2 = Math.max(m1.corefClusterID, m2.corefClusterID);
    incompatibleClusters.add(Pair.makePair(cid1,cid2));
  }

  public List<Pair<IntTuple, IntTuple>> getGoldLinks() {
    if(goldLinks==null) this.extractGoldLinks();
    return goldLinks;
  }

  /** Extract gold coref link information */
  protected void extractGoldLinks() {
    //    List<List<Mention>> orderedMentionsBySentence = this.getOrderedMentions();
    List<Pair<IntTuple, IntTuple>> links = new ArrayList<>();

    // position of each mention in the input matrix, by id
    Map<Integer, IntTuple> positions = Generics.newHashMap();
    // positions of antecedents
    Map<Integer, List<IntTuple>> antecedents = Generics.newHashMap();
    for(int i = 0; i < goldMentions.size(); i ++){
      for(int j = 0; j < goldMentions.get(i).size(); j ++){
        Mention m = goldMentions.get(i).get(j);
        int id = m.mentionID;
        IntTuple pos = new IntTuple(2);
        pos.set(0, i);
        pos.set(1, j);
        positions.put(id, pos);
        antecedents.put(id, new ArrayList<>());
      }
    }

//    SieveCoreferenceSystem.debugPrintMentions(System.err, "", goldOrderedMentionsBySentence);
    for (List<Mention> mentions : goldMentions) {
      for (Mention m : mentions) {
        int id = m.mentionID;
        IntTuple src = positions.get(id);

        assert (src != null);
        if (m.originalRef >= 0) {
          IntTuple dst = positions.get(m.originalRef);
          if (dst == null) {
            throw new RuntimeException("Cannot find gold mention with ID=" + m.originalRef);
          }

          // to deal with cataphoric annotation
          while (dst.get(0) > src.get(0) || (dst.get(0) == src.get(0) && dst.get(1) > src.get(1))) {
            Mention dstMention = goldMentions.get(dst.get(0)).get(dst.get(1));
            m.originalRef = dstMention.originalRef;
            dstMention.originalRef = id;

            if (m.originalRef < 0) break;
            dst = positions.get(m.originalRef);
          }
          if (m.originalRef < 0) continue;

          // A B C: if A<-B, A<-C => make a link B<-C
          for (int k = dst.get(0); k <= src.get(0); k++) {
            for (int l = 0; l < goldMentions.get(k).size(); l++) {
              if (k == dst.get(0) && l < dst.get(1)) continue;
              if (k == src.get(0) && l > src.get(1)) break;
              IntTuple missed = new IntTuple(2);
              missed.set(0, k);
              missed.set(1, l);
              if (links.contains(new Pair<>(missed, dst))) {
                antecedents.get(id).add(missed);
                links.add(new Pair<>(src, missed));
              }
            }
          }

          links.add(new Pair<>(src, dst));

          assert (antecedents.get(id) != null);
          antecedents.get(id).add(dst);

          List<IntTuple> ants = antecedents.get(m.originalRef);
          assert (ants != null);
          for (IntTuple ant : ants) {
            antecedents.get(id).add(ant);
            links.add(new Pair<>(src, ant));
          }
        }
      }
    }
    goldLinks = links;
  }

  public SpeakerInfo getSpeakerInfo(String speaker) {
    return speakerInfoMap.get(speaker);
  }

  public int numberOfSpeakers() {
    return speakerInfoMap.size();
  }

  public boolean isCoref(Mention m1, Mention m2) {
    return this.goldMentionsByID.containsKey(m1.mentionID)
        && this.goldMentionsByID.containsKey(m2.mentionID)
        && this.goldMentionsByID.get(m1.mentionID).goldCorefClusterID == this.goldMentionsByID.get(m2.mentionID).goldCorefClusterID;
  }

}