SemanticGraphUtils.java example

Explorer
CoreNLP-master
package edu.stanford.nlp.semgraph;
import edu.stanford.nlp.util.logging.Redwood;

import edu.stanford.nlp.ling.AnnotationLookup;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.ling.LabeledWord;
import edu.stanford.nlp.process.Morphology;
import edu.stanford.nlp.trees.EnglishGrammaticalRelations;
import edu.stanford.nlp.trees.GrammaticalRelation;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.CollectionUtils;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.MapList;
import edu.stanford.nlp.util.Pair;

import java.io.StringWriter;
import java.util.*;
import java.util.function.Function;
import java.util.regex.Pattern;


/**
 * Generic utilities for dealing with Dependency graphs and other structures, useful for
 * text simplification and rewriting.
 *
 * TODO: Migrate some of the functions (that make sense) into SemanticGraph proper.
 * BUT BEWARE: This class has methods that use jgraph (as opposed to jgrapht).
 * We don't want our core code to become dependent on jgraph, so methods in
 * SemanticGraph shouldn't call methods in this class, and methods that use
 * jgraph shouldn't be moved into SemanticGraph.
 *
 * @author Eric Yeh
 *
 */
public class SemanticGraphUtils  {

  /** Logging channel shared by every static helper in this class; final so it cannot be reassigned. */
  private static final Redwood.RedwoodChannels log = Redwood.channels(SemanticGraphUtils.class);

  /** Private constructor: the helpers visible in this class are static, so no instance is needed. */
  private SemanticGraphUtils() {}

  /**
   * Builds a new SemanticGraph restricted to the given nodes of {@code srcGraph}.
   * The vertex objects are shared with the source graph rather than copied, which
   * allows equality checks and comparisons between the two graphs.
   *
   * @param nodes nodes taken from srcGraph
   * @param srcGraph graph in which the connecting edges are looked up
   * @return {@code null} if nodes is empty; a graph holding only that vertex if there
   *         is exactly one node; otherwise a graph assembled from every srcGraph edge
   *         that runs from one of the given nodes to another (or to itself)
   */
  public static SemanticGraph makeGraphFromNodes(Collection<IndexedWord> nodes, SemanticGraph srcGraph) {
    if (nodes.isEmpty()) {
      return null;
    }

    if (nodes.size() == 1) {
      // A lone node has no edges to collect, so add the vertex directly.
      SemanticGraph singleton = new SemanticGraph();
      singleton.addVertex(nodes.iterator().next());
      return singleton;
    }

    // TODO: if any nodes are not connected to edges in the original
    // graph, this will leave them out
    List<SemanticGraphEdge> collected = new ArrayList<>();
    for (IndexedWord gov : nodes) {
      for (IndexedWord dep : nodes) {
        Collection<SemanticGraphEdge> between = srcGraph.getAllEdges(gov, dep);
        if (between == null) {
          continue;
        }
        collected.addAll(between);
      }
    }
    return SemanticGraphFactory.makeFromEdges(collected);
  }

  //----------------------------------------------------------------------------------------
  //Query routines (obtaining sets of edges/vertices over predicates, etc)
  //----------------------------------------------------------------------------------------

  /**
   * Finds the vertex in the given SemanticGraph that corresponds to the given node.
   * The first vertex whose index, sentIndex and word all agree with {@code node} wins.
   * The word comparison is null-safe: two vertices that both lack a word are treated
   * as agreeing on it, instead of triggering a NullPointerException.
   *
   * @param node node to look for
   * @param sg graph whose vertex set is searched
   * @return the matching vertex of sg, or {@code null} if none matches
   */
  public static IndexedWord findMatchingNode(IndexedWord node,
                                             SemanticGraph sg) {
    for (IndexedWord tgt : sg.vertexSet()) {
      if (tgt.index() == node.index()
          && tgt.sentIndex() == node.sentIndex()
          && Objects.equals(tgt.word(), node.word())) {
        return tgt;
      }
    }
    return null;
  }


  /**
   * Collects the edges of the subtree hanging below the given vertex, never crossing
   * (and never reporting) {@code excludedEdge}.  Already-seen edges are remembered so
   * that cyclical relations (such as between a rcmod (relative clause) and its nsubj)
   * do not cause endless recursion.
   *
   * @param vertice vertex at which the walk starts
   * @param sg graph being walked
   * @param excludedEdge edge that must be neither followed nor returned
   * @return the edges reached from vertice via outgoing edges, without excludedEdge
   */
  public static Set<SemanticGraphEdge> getSubTreeEdges(IndexedWord vertice, SemanticGraph sg, SemanticGraphEdge excludedEdge) {
    Set<SemanticGraphEdge> visited = Generics.newHashSet();
    // Seed the visited set with the excluded edge so the helper refuses to follow it ...
    visited.add(excludedEdge);
    getSubTreeEdgesHelper(vertice, sg, visited);
    // ... then take it back out, since it is not part of the result.
    visited.remove(excludedEdge);
    return visited;
  }

  /**
   * Recursive worker for {@code getSubTreeEdges}: follows every outgoing edge of
   * {@code vertice} that is not yet in {@code tabuEdges}, records it there, and
   * continues from the edge's dependent.
   *
   * @param vertice vertex whose outgoing edges are examined
   * @param sg graph being walked
   * @param tabuEdges edges already seen; newly followed edges are added to this set
   */
  public static void getSubTreeEdgesHelper(IndexedWord vertice, SemanticGraph sg, Set<SemanticGraphEdge> tabuEdges) {
    for (SemanticGraphEdge edge : sg.outgoingEdgeIterable(vertice)) {
      // Set.add reports false when the edge was already present, i.e. already handled.
      if (!tabuEdges.add(edge)) {
        continue;
      }
      getSubTreeEdgesHelper(edge.getDependent(), sg, tabuEdges);
    }
  }


  /**
   * Returns the edges of {@code sg} that connect two different nodes of the given
   * collection.  Pairs are skipped only when both sides are the very same object
   * (reference comparison).
   *
   * @param nodes nodes from sg
   * @param sg graph the edges are looked up in
   * @return set of edges running between distinct members of nodes
   */
  public static Collection<SemanticGraphEdge> getEdgesSpannedByVertices(Collection<IndexedWord> nodes, SemanticGraph sg) {
    Collection<SemanticGraphEdge> spanned = Generics.newHashSet();
    for (IndexedWord from : nodes) {
      for (IndexedWord to : nodes) {
        if (from == to) {
          continue;
        }
        Collection<SemanticGraphEdge> between = sg.getAllEdges(from, to);
        if (between != null) {
          spanned.addAll(between);
        }
      }
    }
    return spanned;
  }

  /**
   * Lists the children of {@code vertex} that are attached by a relation whose string
   * form starts with {@code relnPrefix}.
   *
   * @param graph graph containing the vertex
   * @param vertex governor whose outgoing edges are inspected
   * @param relnPrefix prefix the relation name has to start with
   * @return matching children (one entry per matching edge); empty if vertex equals
   *         {@code IndexedWord.NO_WORD}
   * @throws IllegalArgumentException if vertex is not part of graph
   */
  public static List<IndexedWord> getChildrenWithRelnPrefix(SemanticGraph graph, IndexedWord vertex, String relnPrefix) {
    List<IndexedWord> matches = new ArrayList<>();
    if (vertex.equals(IndexedWord.NO_WORD)) {
      return matches;
    }
    if (!graph.containsVertex(vertex)) {
      throw new IllegalArgumentException();
    }
    for (SemanticGraphEdge edge : graph.outgoingEdgeIterable(vertex)) {
      String relnName = edge.getRelation().toString();
      if (relnName.startsWith(relnPrefix)) {
        matches.add(edge.getTarget());
      }
    }
    return matches;
  }

  /**
   * Lists the children of {@code vertex} that are attached by a relation whose string
   * form starts with at least one of the given prefixes.  Each matching edge
   * contributes its target once, no matter how many prefixes it matches.
   *
   * @param graph graph containing the vertex
   * @param vertex governor whose outgoing edges are inspected
   * @param relnPrefixes candidate prefixes for the relation name
   * @return matching children; empty if vertex equals {@code IndexedWord.NO_WORD}
   * @throws IllegalArgumentException if vertex is not part of graph
   */
  public static List<IndexedWord> getChildrenWithRelnPrefix(SemanticGraph graph, IndexedWord vertex, Collection<String> relnPrefixes) {
    List<IndexedWord> matches = new ArrayList<>();
    if (vertex.equals(IndexedWord.NO_WORD)) {
      return matches;
    }
    if (!graph.containsVertex(vertex)) {
      throw new IllegalArgumentException();
    }
    for (SemanticGraphEdge edge : graph.outgoingEdgeIterable(vertex)) {
      String relnName = edge.getRelation().toString();
      // anyMatch stops at the first prefix that fits, like the explicit break it replaces.
      if (relnPrefixes.stream().anyMatch(relnName::startsWith)) {
        matches.add(edge.getTarget());
      }
    }
    return matches;
  }

  /**
   * Since graphs can have their prepositions collapsed, finds the immediate children of
   * this node that are linked by such an edge.  An edge counts when the string form of
   * its relation starts with {@code "prep"}; relations are compared by name rather than
   * against the relation objects in EnglishGrammaticalRelations.
   *
   * @param sg graph containing the vertex
   * @param vertex governor whose outgoing edges are inspected
   * @return dependents of the matching edges, one entry per edge
   */
  public static List<IndexedWord> getChildrenWithPrepC(SemanticGraph sg, IndexedWord vertex) {
    List<IndexedWord> prepChildren = new ArrayList<>();
    for (SemanticGraphEdge edge : sg.outgoingEdgeIterable(vertex)) {
      String relnName = edge.getRelation().toString();
      if (!relnName.startsWith("prep")) {
        continue;
      }
      prepChildren.add(edge.getDependent());
    }
    return prepChildren;
  }

  /**
   * Returns the incoming edges of the given node that carry the given relation.
   * The comparison itself is delegated to {@code edgesWithReln}.
   *
   * @param node node whose incoming edges are examined
   * @param sg graph containing the node
   * @param reln relation to look for
   * @return incoming edges of node whose relation equals reln
   */
  public static List<SemanticGraphEdge> incomingEdgesWithReln(IndexedWord node, SemanticGraph sg, GrammaticalRelation reln) {
    Iterable<SemanticGraphEdge> incoming = sg.incomingEdgeIterable(node);
    return edgesWithReln(incoming, reln);
  }

  /**
   * Returns the outgoing edges of the given node that carry the given relation.
   * The comparison itself is delegated to {@code edgesWithReln}.
   *
   * @param node node whose outgoing edges are examined
   * @param sg graph containing the node
   * @param reln relation to look for
   * @return outgoing edges of node whose relation equals reln
   */
  public static List<SemanticGraphEdge> outgoingEdgesWithReln(IndexedWord node, SemanticGraph sg, GrammaticalRelation reln) {
    Iterable<SemanticGraphEdge> outgoing = sg.outgoingEdgeIterable(node);
    return edgesWithReln(outgoing, reln);
  }

  /**
   * Filters the given edges down to those whose relation equals {@code reln}, as
   * decided by the edge relation's {@code equals} method.
   *
   * @param edges edges to filter
   * @param reln relation to keep
   * @return matching edges, in iteration order
   */
  public static List<SemanticGraphEdge> edgesWithReln(Iterable<SemanticGraphEdge> edges,
                                                      GrammaticalRelation reln) {
    List<SemanticGraphEdge> matches = Generics.newArrayList();
    for (SemanticGraphEdge candidate : edges) {
      if (!candidate.getRelation().equals(reln)) {
        continue;
      }
      matches.add(candidate);
    }
    return matches;
  }

  /**
   * Given a semantic graph and a relation prefix, returns every edge of the graph whose
   * relation name starts with that prefix (e.g., prefix "prep" gives you all the prep
   * relations: prep_by, prep_in, etc.)
   *
   * @param sg graph whose edges are scanned
   * @param prefix prefix the relation's string form has to start with
   * @return matching edges, in the order the graph iterates them
   */
  public static List<SemanticGraphEdge> findAllRelnsWithPrefix(SemanticGraph sg, String prefix) {
    List<SemanticGraphEdge> matches = new ArrayList<>();
    for (SemanticGraphEdge edge : sg.edgeIterable()) {
      String relnName = edge.getRelation().toString();
      if (!relnName.startsWith(prefix)) {
        continue;
      }
      matches.add(edge);
    }
    return matches;
  }

  /**
   * Finds the descendants of the given node in the graph, never entering any of the
   * tabu nodes.  The start vertex itself is part of the result unless it is tabu.
   *
   * @param sg graph to search
   * @param vertex node at which the depth-first search starts
   * @param tabu nodes that must not be visited
   * @return the nodes reached
   * @throws IllegalArgumentException if vertex is not part of sg
   */
  public static Set<IndexedWord> tabuDescendants(SemanticGraph sg, IndexedWord vertex, Collection<IndexedWord> tabu) {
    if (!sg.containsVertex(vertex)) {
      throw new IllegalArgumentException();
    }
    // No relation filter and no predicate filter for this variant.
    final Collection<GrammaticalRelation> noTabuRelns = null;
    final IndexedWordUnaryPred noTabuTest = null;
    Set<IndexedWord> reached = Generics.newHashSet();
    tabuDescendantsHelper(sg, vertex, reached, tabu, noTabuRelns, noTabuTest);
    return reached;
  }

  /**
   * Finds the set of descendants for a node in the graph, avoiding the given nodes and
   * refusing to cross edges bearing one of the given relations.  NOTE: these edges are
   * encountered on the way down, from governor to dependent.  The start vertex itself
   * is part of the result unless it is tabu.
   *
   * @param sg graph to search
   * @param vertex node at which the depth-first search starts
   * @param tabu nodes that must not be visited
   * @param tabuRelns relations that must not be crossed
   * @return the nodes reached
   * @throws IllegalArgumentException if vertex is not part of sg
   */
  public static Set<IndexedWord> tabuDescendants(SemanticGraph sg, IndexedWord vertex, Collection<IndexedWord> tabu,
                                                 Collection<GrammaticalRelation> tabuRelns) {
    if (!sg.containsVertex(vertex)) {
      throw new IllegalArgumentException();
    }
    final IndexedWordUnaryPred noTabuTest = null;
    Set<IndexedWord> reached = Generics.newHashSet();
    tabuDescendantsHelper(sg, vertex, reached, tabu, tabuRelns, noTabuTest);
    return reached;
  }

  /**
   * Finds the descendants of the given node without crossing edges that bear one of the
   * given relations.  No node is tabu, so the start vertex is always part of the result.
   *
   * @param sg graph to search
   * @param vertex node at which the depth-first search starts
   * @param tabuRelns relations that must not be crossed
   * @return the nodes reached
   * @throws IllegalArgumentException if vertex is not part of sg
   */
  public static Set<IndexedWord> descendantsTabuRelns(SemanticGraph sg, IndexedWord vertex,
                                                      Collection<GrammaticalRelation> tabuRelns) {
    if (!sg.containsVertex(vertex)) {
      throw new IllegalArgumentException();
    }
    Set<IndexedWord> noTabuNodes = Generics.<IndexedWord>newHashSet();
    final IndexedWordUnaryPred noTabuTest = null;
    Set<IndexedWord> reached = Generics.newHashSet();
    tabuDescendantsHelper(sg, vertex, reached, noTabuNodes, tabuRelns, noTabuTest);
    return reached;
  }

  /**
   * Finds the descendants of the given node without crossing edges that bear one of the
   * given relations and without descending into dependents accepted by {@code tabuTest}.
   * No node is tabu up front, so the start vertex is always part of the result.
   *
   * @param sg graph to search
   * @param vertex node at which the depth-first search starts
   * @param tabuRelns relations that must not be crossed
   * @param tabuTest predicate marking dependents that must not be descended into
   * @return the nodes reached
   * @throws IllegalArgumentException if vertex is not part of sg
   */
  public static Set<IndexedWord> descendantsTabuTestAndRelns(SemanticGraph sg, IndexedWord vertex,
      Collection<GrammaticalRelation> tabuRelns, IndexedWordUnaryPred tabuTest) {
    if (!sg.containsVertex(vertex)) {
      throw new IllegalArgumentException();
    }
    Set<IndexedWord> noTabuNodes = Generics.<IndexedWord>newHashSet();
    Set<IndexedWord> reached = Generics.newHashSet();
    tabuDescendantsHelper(sg, vertex, reached, noTabuNodes, tabuRelns, tabuTest);
    return reached;
  }

  /**
   * Finds the descendants of the given node while avoiding the given nodes, refusing to
   * cross edges that bear one of the given relations, and not descending into dependents
   * accepted by {@code tabuTest}.  The start vertex is part of the result unless it is
   * one of the tabu nodes.
   *
   * @param sg graph to search
   * @param vertex node at which the depth-first search starts
   * @param tabuNodes nodes that must not be visited
   * @param tabuRelns relations that must not be crossed
   * @param tabuTest predicate marking dependents that must not be descended into
   * @return the nodes reached
   * @throws IllegalArgumentException if vertex is not part of sg
   */
  public static Set<IndexedWord> descendantsTabuTestAndRelns(SemanticGraph sg, IndexedWord vertex,
      Collection<IndexedWord> tabuNodes, Collection<GrammaticalRelation> tabuRelns, IndexedWordUnaryPred tabuTest) {
    boolean known = sg.containsVertex(vertex);
    if (!known) {
      throw new IllegalArgumentException();
    }
    Set<IndexedWord> reached = Generics.newHashSet();
    tabuDescendantsHelper(sg, vertex, reached, tabuNodes, tabuRelns, tabuTest);
    return reached;
  }



  /**
   * Depth-first worker behind the tabu-descendant searches.  Adds {@code curr} to
   * {@code descendantSet} and recurses into its children, subject to three filters:
   * nodes in {@code tabu} are never entered, edges whose relation is in
   * {@code relnsToAvoid} are never crossed, and dependents accepted by
   * {@code tabuTest} are never descended into.
   *
   * @param sg graph being searched
   * @param curr node currently being visited
   * @param descendantSet accumulator of visited nodes; also stops revisits on cycles
   * @param tabu nodes to stay out of
   * @param relnsToAvoid relations not to cross, or {@code null} for no relation filter
   * @param tabuTest predicate on the dependent, or {@code null} for no predicate filter
   */
  private static void tabuDescendantsHelper(SemanticGraph sg, IndexedWord curr, Set<IndexedWord> descendantSet, Collection<IndexedWord> tabu,
      Collection<GrammaticalRelation> relnsToAvoid, IndexedWordUnaryPred tabuTest) {
    // Stop on tabu nodes, and on nodes already recorded (add returns false for those).
    if (tabu.contains(curr) || !descendantSet.add(curr)) {
      return;
    }

    for (IndexedWord child : sg.getChildren(curr)) {
      for (SemanticGraphEdge link : sg.getAllEdges(curr, child)) {
        boolean blocked =
            (relnsToAvoid != null && relnsToAvoid.contains(link.getRelation()))
            || (tabuTest != null && tabuTest.test(link.getDependent(), sg));
        if (!blocked) {
          tabuDescendantsHelper(sg, child, descendantSet, tabu, relnsToAvoid, tabuTest);
        }
      }
    }
  }


  //------------------------------------------------------------------------------------
  //"Constituent" extraction and manipulation
  //------------------------------------------------------------------------------------


  /**
   * Returns the vertice that is "leftmost" among {@code sg.descendants(startNode)},
   * i.e. the smallest one under IndexedWord's natural ordering.  Note this requires
   * that the vertices actually carry ordering information.
   *
   * A single linear scan is used instead of sorting all descendants into a TreeSet.
   *
   * @param startNode node whose descendants are considered
   * @param sg graph containing startNode
   * @return the smallest descendant
   * @throws NoSuchElementException if the descendant set is empty
   */
  public static IndexedWord leftMostChildVertice(IndexedWord startNode, SemanticGraph sg) {
    return Collections.min(sg.descendants(startNode));
  }

  /**
   * Returns the vertices that are "leftmost" and "rightmost" among
   * {@code sg.descendants(startNode)}, i.e. the smallest and the largest one under
   * IndexedWord's natural ordering.  Note this requires that the vertices actually
   * carry ordering information.
   *
   * Two linear scans are used instead of sorting all descendants into a TreeSet.
   *
   * @param startNode node whose descendants are considered
   * @param sg graph containing startNode
   * @return pair of (smallest descendant, largest descendant)
   * @throws NoSuchElementException if the descendant set is empty
   */
  public static Pair<IndexedWord, IndexedWord> leftRightMostChildVertices(IndexedWord startNode, SemanticGraph sg) {
    Collection<IndexedWord> descendants = sg.descendants(startNode);
    IndexedWord leftMost = Collections.min(descendants);
    IndexedWord rightMost = Collections.max(descendants);
    return Pair.makePair(leftMost, rightMost);
  }

  /**
   * Given a SemanticGraph, and a set of nodes, finds the "blanket" of nodes that are one
   * edge away from the set of nodes passed in.  This is similar to the idea of a Markov
   * Blanket, except in the context of a SemanticGraph.
   *
   * @param sg graph to inspect
   * @param assertedNodes nodes whose neighbourhood is wanted
   * @return vertices of sg that are not in assertedNodes but share an edge (in either
   *         direction) with at least one asserted node
   */
  public static Collection<IndexedWord> getDependencyBlanket(SemanticGraph sg, Collection<IndexedWord> assertedNodes) {
    Set<IndexedWord> retSet = Generics.newHashSet();
    for (IndexedWord curr : sg.vertexSet()) {
      if (assertedNodes.contains(curr)) {
        continue;
      }
      for (IndexedWord assertedNode : assertedNodes) {
        if (sg.containsEdge(assertedNode, curr) || sg.containsEdge(curr, assertedNode)) {
          retSet.add(curr);
          // One connecting edge is enough; no need to test the remaining asserted nodes.
          break;
        }
      }
    }
    return retSet;
  }

  /**
   * Builds a copy of the graph whose vertices are renumbered 1, 2, 3, ... following the
   * order returned by {@code vertexListSorted()}.  This is to ensure accesses to the
   * InfoFile word table do not fall off after a SemanticGraph has been edited.
   * <br>
   * NOTE: the vertices are replaced by copies instead of being modified in place; this
   * dates back to JGraphT, which did not permit in-place modification of the nodes.
   * (TODO: we no longer use JGraphT, so this should be fixed)
   *
   * @param sg graph to renumber; it is read but not modified here
   * @return a new graph with renumbered vertex copies, the corresponding roots, and one
   *         edge per original edge (same relation, weight and extra flag)
   */
  public static SemanticGraph resetVerticeOrdering(SemanticGraph sg) {
    SemanticGraph renumbered = new SemanticGraph();
    Map<IndexedWord, IndexedWord> oldToNew = Generics.newHashMap();

    // Copy every vertex, handing out consecutive indices starting at 1.
    int nextIndex = 1;
    for (IndexedWord original : sg.vertexListSorted()) {
      IndexedWord copy = new IndexedWord(original);
      copy.setIndex(nextIndex);
      nextIndex++;
      oldToNew.put(original, copy);
      ///sg.removeVertex(vertex);
      renumbered.addVertex(copy);
    }

    // Carry the roots over to their renumbered counterparts.
    List<IndexedWord> mappedRoots = new ArrayList<>();
    for (IndexedWord oldRoot : sg.getRoots()) {
      mappedRoots.add(oldToNew.get(oldRoot));
    }
    renumbered.setRoots(mappedRoots);

    // Recreate each edge between the renumbered endpoints.
    for (SemanticGraphEdge edge : sg.edgeIterable()) {
      renumbered.addEdge(oldToNew.get(edge.getGovernor()),
                         oldToNew.get(edge.getDependent()),
                         edge.getRelation(), edge.getWeight(), edge.isExtra());
    }
    return renumbered;
  }


  /**
   * Given a graph, ensures all edges are EnglishGrammaticalRelations: every edge whose
   * relation was created from a string is replaced by an edge carrying the relation
   * object that {@code EnglishGrammaticalRelations.valueOf} returns for that string.
   * NOTE: this is English specific
   * NOTE: currently EnglishGrammaticalRelations does not link collapsed prep string forms
   * back to their object forms, for its valueOf relation.  This may need to be repaired if
   * generated edges indeed do have collapsed preps as strings.
   *
   * @param sg graph to repair in place
   * @param verbose if true, logs a warning for each relation string that has no
   *                matching GrammaticalRelation (such edges are left untouched)
   */
  public static void enRepairEdges(SemanticGraph sg, boolean verbose) {
    // Work on a snapshot: edges are removed and re-added below, and mutating the graph
    // while walking its live edge view is not safe in general.
    List<SemanticGraphEdge> snapshot = new ArrayList<>();
    for (SemanticGraphEdge edge : sg.edgeIterable()) {
      snapshot.add(edge);
    }

    for (SemanticGraphEdge edge : snapshot) {
      if (!edge.getRelation().isFromString()) {
        continue;
      }
      GrammaticalRelation newReln =
        EnglishGrammaticalRelations.valueOf(edge.getRelation().toString());
      if (newReln == null) {
        if (verbose) {
          log.info("Warning, could not find matching GrammaticalRelation for reln="+edge.getRelation());
        }
        continue;
      }
      // Read everything needed from the old edge before it is removed.
      IndexedWord gov = edge.getGovernor();
      IndexedWord dep = edge.getDependent();
      double weight = edge.getWeight();
      boolean isExtra = edge.isExtra();
      sg.removeEdge(edge);
      sg.addEdge(gov, dep, newReln, weight, isExtra);
    }
  }

  /**
   * Non-verbose variant of {@code enRepairEdges(SemanticGraph, boolean)}: repairs the
   * edges of the graph without logging relations that could not be converted.
   *
   * @param sg graph to repair in place
   */
  public static void enRepairEdges(SemanticGraph sg) {
    final boolean verbose = false;
    enRepairEdges(sg, verbose);
  }

  /**
   * Deletes all nodes that are not rooted (such as dangling vertices after a series of
   * edges have been chopped).  A node is kept when it is one of the graph's roots or is
   * among {@code sg.descendants(root)} for some root.
   *
   * @param sg graph to prune in place
   */
  public static void killNonRooted(SemanticGraph sg) {
    // Everything that is a root or hangs below one is safe.
    Set<IndexedWord> rooted = Generics.newHashSet();
    for (IndexedWord root : sg.getRoots()) {
      rooted.add(root);
      rooted.addAll(sg.descendants(root));
    }

    // Iterate over a copy of the vertex set because vertices are removed on the way.
    for (IndexedWord candidate : new ArrayList<>(sg.vertexSet())) {
      if (rooted.contains(candidate)) {
        continue;
      }
      sg.removeVertex(candidate);
    }
  }

  /**
   * Replaces a node in the given SemanticGraph with the new node, which takes over the
   * old node's position in all of its edges.  If the old node cannot be removed from
   * the graph, a message is logged and the graph is left as it is.
   *
   * @param newNode node to put in place of oldNode (added to the graph if absent)
   * @param oldNode node to be replaced
   * @param sg graph to modify in place
   */
  public static void replaceNode(IndexedWord newNode, IndexedWord oldNode, SemanticGraph sg) {
    // Capture the old node's edges in lists before touching the graph, so that the
    // removals below affect neither an iterator nor our ability to find the edges.
    List<SemanticGraphEdge> outgoing = sg.outgoingEdgeList(oldNode);
    List<SemanticGraphEdge> incoming = sg.incomingEdgeList(oldNode);

    if (!sg.removeVertex(oldNode)) {
      log.info("SemanticGraphUtils.replaceNode: previous node does not exist");
      return;
    }

    // If the new node is not present, be sure to add it in.
    if (!sg.containsVertex(newNode)) {
      sg.addVertex(newNode);
    }
    // Re-insert the edges in which the old node was the governor ...
    for (SemanticGraphEdge out : outgoing) {
      sg.removeEdge(out);
      sg.addEdge(newNode, out.getDependent(), out.getRelation(), out.getWeight(), out.isExtra());
    }
    // ... and those in which it was the dependent.
    for (SemanticGraphEdge in : incoming) {
      sg.removeEdge(in);
      sg.addEdge(in.getGovernor(), newNode, in.getRelation(), in.getWeight(), in.isExtra());
    }
  }

  /**
   * Label that {@code makeReplacedEdges} writes into the text, original-text and lemma
   * annotations of vertex copies that have no replacement, when generic replacement is on.
   */
  public static final String WILDCARD_VERTICE_TOKEN = "WILDCARD";
  /**
   * Shared placeholder vertex whose word, value and original text are all {@code "*"}
   * (set in the static initializer below).
   */
  public static final IndexedWord WILDCARD_VERTICE = new IndexedWord();
  static {
    // Give the placeholder its "*" label.
    WILDCARD_VERTICE.setWord("*");
    WILDCARD_VERTICE.setValue("*");
    WILDCARD_VERTICE.setOriginalText("*");
  }

  /**
   * Given an iterable set of distinct vertices, creates a new mapping that maps the
   * original vertices to "generic" versions.  Used for generalizing tokens in
   * discovered rules.  Each generic vertex is a copy of its original with an empty
   * lemma and with value, word and original text set to {@code prefix} followed by a
   * running number starting at 1.
   *
   * @param verts Vertices to anonymize
   * @param prefix Prefix to assign to this anonymization
   * @return map from each original vertex to its generic copy
   */
  public static Map<IndexedWord, IndexedWord> anonymyizeNodes(Iterable<IndexedWord> verts, String prefix) {
    Map<IndexedWord, IndexedWord> anonymized = Generics.newHashMap();
    int counter = 0;
    for (IndexedWord original : verts) {
      counter++;
      String label = prefix + counter;
      IndexedWord generic = new IndexedWord(original);
      generic.set(CoreAnnotations.LemmaAnnotation.class, "");
      generic.setValue(label);
      generic.setWord(label);
      generic.setOriginalText(label);
      anonymized.put(original, generic);
    }
    return anonymized;
  }


  /** Label prefix that {@code makeGenericVertices} passes to {@code anonymyizeNodes}. */
  public static final String SHARED_NODE_ANON_PREFIX ="A";
  /** Label prefix that {@code makeBlanketVertices} passes to {@code anonymyizeNodes}. */
  public static final String BLANKET_NODE_ANON_PREFIX ="B";

  /**
   * Used to make a mapping that lets you create "anonymous" versions of shared nodes
   * between two graphs (given in the arg), labelled with the shared-node prefix.
   *
   * @param verts vertices to anonymize
   * @return map from each given vertex to its anonymous copy
   */
  public static Map<IndexedWord, IndexedWord> makeGenericVertices(Iterable<IndexedWord> verts) {
    final String prefix = SHARED_NODE_ANON_PREFIX;
    return anonymyizeNodes(verts, prefix);
  }

  /**
   * Used to assign generic labels to the nodes in the "blanket" for a set of vertices
   * in a graph.  Here, a "blanket" node is similar to nodes in a Markov Blanket, i.e.
   * nodes that are one edge away from a set of asserted vertices in a SemanticGraph.
   * The copies are labelled with the blanket-node prefix.
   *
   * @param verts blanket vertices to anonymize
   * @return map from each given vertex to its anonymous copy
   */
  public static Map<IndexedWord, IndexedWord> makeBlanketVertices(Iterable<IndexedWord> verts) {
    final String prefix = BLANKET_NODE_ANON_PREFIX;
    return anonymyizeNodes(verts, prefix);
  }


  /**
   * Given a set of edges, and a mapping from the edges' vertices to their replacements,
   * returns a new list of edges built on the replacement vertices.  What happens to a
   * vertex without a replacement depends on {@code useGenericReplacement}: if true, a
   * copy of the vertex is used whose text, original text and lemma are set to
   * {@code WILDCARD_VERTICE_TOKEN} (i.e. can be anything); if false, the original
   * vertex is kept.
   *
   * Currently used to generate "generic" versions of Semantic Graphs, when given a list
   * of generic vertices to replace with, but can conceivably be used for other purposes
   * where vertices must be replaced.
   *
   * @param edges edges to rebuild
   * @param vertReplacementMap original vertex to replacement vertex
   * @param useGenericReplacement whether unmapped vertices become wildcard copies
   * @return one new edge per input edge, with the same relation, weight and extra flag
   */
  public static List<SemanticGraphEdge> makeReplacedEdges(Iterable<SemanticGraphEdge> edges, Map<IndexedWord, IndexedWord> vertReplacementMap,
      boolean useGenericReplacement) {
    // Produces the wildcard-labelled copy used for vertices that have no replacement.
    Function<IndexedWord, IndexedWord> wildcardCopy = original -> {
      IndexedWord copy = new IndexedWord(original);
      copy.set(CoreAnnotations.TextAnnotation.class, WILDCARD_VERTICE_TOKEN);
      copy.set(CoreAnnotations.OriginalTextAnnotation.class, WILDCARD_VERTICE_TOKEN);
      copy.set(CoreAnnotations.LemmaAnnotation.class, WILDCARD_VERTICE_TOKEN);
      return copy;
    };

    List<SemanticGraphEdge> replaced = new ArrayList<>();
    for (SemanticGraphEdge edge : edges) {
      IndexedWord gov = edge.getGovernor();
      IndexedWord dep = edge.getDependent();
      IndexedWord newGov = vertReplacementMap.get(gov);
      IndexedWord newDep = vertReplacementMap.get(dep);
      if (newGov == null) {
        newGov = useGenericReplacement ? wildcardCopy.apply(gov) : gov;
      }
      if (newDep == null) {
        newDep = useGenericReplacement ? wildcardCopy.apply(dep) : dep;
      }
      replaced.add(new SemanticGraphEdge(newGov, newDep, edge.getRelation(), edge.getWeight(), edge.isExtra()));
    }
    return replaced;
  }

  /**
   * Given a set of vertices from the same graph, returns every edge that leaves or
   * enters one of them.  The opposite endpoint of an edge is not checked, so edges
   * leading to vertices outside the given set are included as well.
   *
   * @param vertices vertices of sg
   * @param sg graph the edges are taken from
   * @return set of all edges incident to at least one of the vertices
   */
  public static Set<SemanticGraphEdge> allEdgesInSet(Iterable<IndexedWord> vertices, SemanticGraph sg) {
    Set<SemanticGraphEdge> incident = Generics.newHashSet();
    for (IndexedWord vertex : vertices) {
      for (SemanticGraphEdge out : sg.outgoingEdgeIterable(vertex)) {
        incident.add(out);
      }
      for (SemanticGraphEdge in : sg.incomingEdgeIterable(vertex)) {
        incident.add(in);
      }
    }
    return incident;
  }

  /**
   * Given two collections of edges, computes the edges of the first that have a
   * counterpart in the second, the edges of the first that have none, and the edges of
   * the second that have none.  Edge equality is determined using an object that
   * implements ISemanticGraphEdgeEql.
   *
   * Matching is one-to-one in both directions: once an edge has served as the
   * counterpart of another, it cannot be matched again.  Both passes work on private
   * copies, so neither input collection is modified.
   *
   * @param edges1 edges of the first graph
   * @param edges2 edges of the second graph
   * @param sg1 graph the first edges belong to (handed to compareObj)
   * @param sg2 graph the second edges belong to (handed to compareObj)
   * @param compareObj decides whether an edge of sg1 equals an edge of sg2
   * @return the shared edges (taken from edges1) plus the unmatched edges of each side
   */
  public static EdgeDiffResult diffEdges(Collection<SemanticGraphEdge> edges1, Collection<SemanticGraphEdge> edges2,
      SemanticGraph sg1, SemanticGraph sg2,
      ISemanticGraphEdgeEql compareObj) {
    Set<SemanticGraphEdge> remainingEdges1 = Generics.newHashSet();
    Set<SemanticGraphEdge> remainingEdges2 = Generics.newHashSet();
    Set<SemanticGraphEdge> sameEdges = Generics.newHashSet();

    // Pass 1: pair every edge of edges1 with a not-yet-used edge of edges2.
    List<SemanticGraphEdge> edges2Cache = new ArrayList<>(edges2);
    for (SemanticGraphEdge edge1 : edges1) {
      boolean matched = false;
      for (Iterator<SemanticGraphEdge> it = edges2Cache.iterator(); it.hasNext(); ) {
        if (compareObj.equals(edge1, it.next(), sg1, sg2)) {
          it.remove(); // consume the counterpart
          matched = true;
          break;
        }
      }
      if (matched) {
        sameEdges.add(edge1);
      } else {
        remainingEdges1.add(edge1);
      }
    }

    // Pass 2: the mirror image.  The candidates come from the cache (not from edges1
    // itself), so that a consumed edge of edges1 is really out of the running.
    List<SemanticGraphEdge> edges1Cache = new ArrayList<>(edges1);
    for (SemanticGraphEdge edge2 : edges2) {
      boolean matched = false;
      for (Iterator<SemanticGraphEdge> it = edges1Cache.iterator(); it.hasNext(); ) {
        if (compareObj.equals(it.next(), edge2, sg1, sg2)) {
          it.remove();
          matched = true;
          break;
        }
      }
      if (!matched) {
        remainingEdges2.add(edge2);
      }
    }

    return new EdgeDiffResult(sameEdges, remainingEdges1, remainingEdges2);
  }

  /**
   * Result holder returned by {@code diffEdges}: the edges found on both sides, and
   * the edges left over on each side.
   */
  public static class EdgeDiffResult {
    // Edges of the first collection that had a counterpart in the second.
    Set<SemanticGraphEdge> sameEdges;
    // Edges of the first collection without a counterpart.
    Set<SemanticGraphEdge> remaining1;
    // Edges of the second collection without a counterpart.
    Set<SemanticGraphEdge> remaining2;

    /**
     * Stores the three sets as given; no copies are made.
     */
    public EdgeDiffResult(Set<SemanticGraphEdge> sameEdges,
        Set<SemanticGraphEdge> remaining1,
        Set<SemanticGraphEdge> remaining2) {
      this.sameEdges = sameEdges;
      this.remaining1 = remaining1;
      this.remaining2 = remaining2;
    }

    /** @return the unmatched edges of the first collection */
    public Set<SemanticGraphEdge> getRemaining1() {
      return remaining1;
    }

    /** @return the unmatched edges of the second collection */
    public Set<SemanticGraphEdge> getRemaining2() {
      return remaining2;
    }

    /** @return the edges that were matched between the two collections */
    public Set<SemanticGraphEdge> getSameEdges() {
      return sameEdges;
    }
  }


  /**
   * Pretty printer for edges: renders each edge on its own line as a tab followed by
   * {@code relation(governor, dependent)}.
   *
   * @param edges edges to render
   * @return the rendered lines, each terminated by a newline
   */
  public static String printEdges(Iterable<SemanticGraphEdge> edges) {
    StringBuilder out = new StringBuilder();
    for (SemanticGraphEdge edge : edges) {
      String reln = edge.getRelation().toString();
      String gov = edge.getGovernor().toString();
      String dep = edge.getDependent().toString();
      out.append("\t").append(reln)
         .append("(").append(gov)
         .append(", ").append(dep)
         .append(")\n");
    }
    return out.toString();
  }

  /**
   * Formatting switches read by {@code printVertices(SemanticGraph, PrintVerticeParams)}.
   */
  public static class PrintVerticeParams {
    // Print the vertex's word().
    public boolean showWord = true;
    // Print the vertex's index(), followed by ":".
    public boolean showIndex = true;
    // Print "s", the vertex's sentIndex(), and "/".
    public boolean showSentIndex = false;
    // Print the vertex's tag(), followed by "/".
    public boolean showPOS = false;
    // A "\n\t" is written before every vertex whose 1-based position is a multiple of this.
    public int wrapAt = 8;
  }

  /**
   * Renders the vertices of the graph using a freshly created
   * {@code PrintVerticeParams}, i.e. with its default settings.
   *
   * @param sg graph whose vertices are rendered
   * @return the rendered vertex list
   */
  public static String printVertices(SemanticGraph sg) {
    PrintVerticeParams defaults = new PrintVerticeParams();
    return printVertices(sg, defaults);
  }

  /**
   * Renders the vertices of the graph, in {@code vertexListSorted()} order, as a
   * space-separated list.  Per vertex, and depending on the switches in {@code params},
   * the output consists of {@code index:}, {@code s<sentIndex>/}, {@code tag/} and the
   * word, followed by a single space.
   *
   * A line break plus tab is inserted before every vertex whose 1-based position is a
   * multiple of {@code params.wrapAt}.  A {@code wrapAt} of 0 disables wrapping, where
   * it would otherwise cause a division by zero.
   *
   * @param sg graph whose vertices are rendered
   * @param params formatting switches
   * @return the rendered vertex list
   */
  public static String printVertices(SemanticGraph sg, PrintVerticeParams params) {
    StringBuilder buf = new StringBuilder();
    int count = 0;
    for (IndexedWord word : sg.vertexListSorted()) {
      count++;
      // Guard against wrapAt == 0, for which the modulo would throw ArithmeticException.
      if (params.wrapAt != 0 && count % params.wrapAt == 0) {
        buf.append("\n\t");
      }
      if (params.showIndex) {
        buf.append(word.index());
        buf.append(":");
      }
      if (params.showSentIndex) {
        buf.append("s");
        buf.append(word.sentIndex());
        buf.append("/");
      }
      if (params.showPOS) {
        buf.append(word.tag());
        buf.append("/");
      }

      if (params.showWord) {
        buf.append(word.word());
      }

      buf.append(" ");
    }
    return buf.toString();
  }


  /**
   * Given a SemanticGraph, creates a SemgrexPattern string based off of this graph,
   * without treating any node as a wildcard.
   * NOTE: the word() value of the vertice is the name to reference
   * NOTE: currently presumes there is only one root in this graph.
   * TODO: see if Semgrex can allow multiroot patterns
   *
   * @param sg SemanticGraph to base this pattern on.
   * @param matchTag whether the pattern nodes constrain the tag
   * @param matchWord whether the pattern nodes constrain the word
   * @param nodeNameMap names to give the pattern nodes; passed through unchanged
   * @return the pattern string
   * @throws Exception declared for compatibility with callers; propagated from the
   *         overload this delegates to
   */
  public static String semgrexFromGraph(SemanticGraph sg, boolean matchTag, boolean matchWord,
      Map<IndexedWord, String> nodeNameMap) throws Exception {
    final Collection<IndexedWord> noWildcards = null;
    return semgrexFromGraph(sg, noWildcards, matchTag, matchWord, nodeNameMap);
  }

  /**
   * Creates a Semgrex pattern string for the graph in which every non-wildcard node is
   * rendered from its word and/or tag:
   * <ul>
   *   <li>word and tag: <code>{word: /quoted-word/; tag: TAG}</code></li>
   *   <li>word only: <code>{word: /quoted-word/}</code></li>
   *   <li>tag only: <code>{tag: TAG}</code></li>
   *   <li>neither: the empty string</li>
   * </ul>
   * The word is passed through {@code Pattern.quote} so that it is matched literally.
   *
   * @param sg SemanticGraph to base this pattern on
   * @param wildcardNodes nodes to treat as wildcards; passed through unchanged
   * @param useTag whether the pattern nodes constrain the tag
   * @param useWord whether the pattern nodes constrain the word
   * @param nodeNameMap names to give the pattern nodes; passed through unchanged
   * @return the pattern string
   * @throws Exception declared for compatibility with callers; propagated from the
   *         overload this delegates to
   */
  public static String semgrexFromGraph(SemanticGraph sg, Collection<IndexedWord> wildcardNodes,
                                        boolean useTag, boolean useWord, Map<IndexedWord, String> nodeNameMap) throws Exception {
    Function<IndexedWord, String> transformNode = o -> {
      // Collect the constraints first so the braces and the "; " separator come out
      // right for every combination of the two switches.
      List<String> constraints = new ArrayList<>(2);
      if (useWord) {
        constraints.add("word: /" + Pattern.quote(o.word()) + "/");
      }
      if (useTag) {
        constraints.add("tag: " + o.tag());
      }
      return constraints.isEmpty() ? "" : "{" + String.join("; ", constraints) + "}";
    };

    return semgrexFromGraph(sg, wildcardNodes, nodeNameMap, transformNode);
  }

  /**
   * Creates a Semgrex pattern string for the graph, starting at its first root.
   * {@code wordTransformation} is a function that converts a vertex (IndexedWord) to
   * the text of its pattern node.  For an example, see the boolean-flag
   * {@code semgrexFromGraph} overload (if useWord and useTag are both true, the intended
   * value is "{word: vertex.word; tag: vertex.tag}").
   *
   * @param sg SemanticGraph to base this pattern on
   * @param wildcardNodes nodes to treat as wildcards; passed through to the helper
   * @param nodeNameMap names to give the pattern nodes; passed through to the helper
   * @param wordTransformation renders a vertex as the text of its pattern node
   * @return the pattern string
   * @throws Exception declared for compatibility with callers
   */
  public static String semgrexFromGraph(SemanticGraph sg, Collection<IndexedWord> wildcardNodes,
     Map<IndexedWord, String> nodeNameMap, Function<IndexedWord, String> wordTransformation) throws Exception {
    IndexedWord root = sg.getFirstRoot();
    Set<IndexedWord> visitedNodes = Generics.newHashSet();
    Set<SemanticGraphEdge> visitedEdges = Generics.newHashSet();

    String rendered = semgrexFromGraphHelper(root, sg, visitedNodes, visitedEdges, true, true,
      wildcardNodes, nodeNameMap, false, wordTransformation);

    StringBuilder pattern = new StringBuilder();
    pattern.append(rendered);
    return pattern.toString();
  }


  /**
   * Given a set of edges that form a rooted and connected graph, returns a Semgrex
   * pattern corresponding to it.  The edges are first turned into a SemanticGraph via
   * {@code SemanticGraphFactory.makeFromEdges}.
   *
   * @param edges edges describing the graph
   * @param matchTag whether the pattern nodes constrain the tag
   * @param matchWord whether the pattern nodes constrain the word
   * @param nodeNameMap names to give the pattern nodes; passed through unchanged
   * @return the pattern string
   * @throws Exception propagated from the graph-based overload
   */
  public static String semgrexFromGraph(Iterable<SemanticGraphEdge> edges, boolean matchTag,
      boolean matchWord, Map<IndexedWord, String> nodeNameMap) throws Exception {
    return semgrexFromGraph(SemanticGraphFactory.makeFromEdges(edges), matchTag, matchWord, nodeNameMap);
  }

  /**
   * Recursive call to generate the Semgrex pattern based off of this SemanticGraph,
   * starting at {@code vertice} and following outgoing edges.
   * <p>
   * Each node is emitted once in full; it is then added to {@code tabu}.  When an edge leads to a
   * node that is already in {@code tabu}, the edge is still emitted, followed by a short
   * {@code {tag:...}} reference to that node, and the traversal does not descend into it again.
   * The reference carries exactly the same name as the node's first emission, so that the two
   * occurrences refer to the same named node.
   *
   * @param vertice node to emit and expand
   * @param sg graph being traversed
   * @param tabu nodes already emitted; updated by this call
   * @param seenEdges edges already emitted; updated by this call, and its size is used to number
   *     edge names ({@code =E1}, {@code =E2}, ...)
   * @param useWordAsLabel if true and {@code nodeNameMap} is null, nodes are named after their
   *     sanitized word
   * @param nameEdges if true, every emitted relation gets an {@code =E<n>} name
   * @param wildcardNodes nodes to emit as {@code {}}; may be null
   * @param nodeNameMap explicit node names; when non-null it takes precedence over
   *     {@code useWordAsLabel}.  May be null.
   * @param orderedNodes if true, outgoing edges are visited sorted by relation string
   * @param nodeValuesTransformation function that converts a vertex (IndexedWord) to its pattern
   *     text.  A null or empty result emits nothing for the node body.  For an example, see the
   *     {@code semgrexFromGraph} function implementations.
   * @return the pattern fragment for {@code vertice} and everything newly reached from it
   */
  protected static String semgrexFromGraphHelper(IndexedWord vertice, SemanticGraph sg,
      Set<IndexedWord> tabu, Set<SemanticGraphEdge> seenEdges, boolean useWordAsLabel, boolean nameEdges, Collection<IndexedWord> wildcardNodes,
      Map<IndexedWord, String> nodeNameMap, boolean orderedNodes, Function<IndexedWord, String> nodeValuesTransformation) {
    StringBuilder buf = new StringBuilder();

    // If the node is a wildcarded one, treat it as a {}, meaning any match.
    if (wildcardNodes != null && wildcardNodes.contains(vertice)) {
      buf.append("{}");
    } else {
      String vertexStr = nodeValuesTransformation.apply(vertice);
      if (vertexStr != null && !vertexStr.isEmpty()) {
        buf.append(vertexStr);
      }
    }
    appendSemgrexNodeName(buf, vertice, useWordAsLabel, nodeNameMap);

    tabu.add(vertice);

    Iterable<SemanticGraphEdge> edgeIter;
    if (!orderedNodes) {
      edgeIter = sg.outgoingEdgeIterable(vertice);
    } else {
      edgeIter = CollectionUtils.sorted(sg.outgoingEdgeList(vertice), (arg0, arg1) ->
        (arg0.getRelation().toString().compareTo(arg1.getRelation().toString())));
    }

    // For each edge, record the edge, but do not traverse to the vertice if it is already in the
    // tabu list.  If it already is, we emit the edge and the target vertice, as
    // we will not be continuing in that vertex, but we wish to record the relation.
    // If we will proceed down that node, add parens if it will continue recursing down.
    for (SemanticGraphEdge edge : edgeIter) {
      seenEdges.add(edge);
      IndexedWord tgtVert = edge.getDependent();
      boolean alreadyEmitted = tabu.contains(tgtVert);
      boolean applyParens = sg.outDegree(tgtVert) > 0 && !alreadyEmitted;
      buf.append(" >");
      buf.append(edge.getRelation().toString());
      if (nameEdges) {
        buf.append("=E");
        buf.append(seenEdges.size());
      }
      buf.append(" ");
      if (applyParens) {
        buf.append("(");
      }
      if (alreadyEmitted) {
        buf.append("{tag:").append(tgtVert.tag()).append("}");
        // Use the same name as the node's first emission (previously the raw, unsanitized word
        // was used here and nodeNameMap was ignored, so the names could disagree).
        appendSemgrexNodeName(buf, tgtVert, useWordAsLabel, nodeNameMap);
      } else {
        buf.append(semgrexFromGraphHelper(tgtVert, sg, tabu, seenEdges, useWordAsLabel, nameEdges,
            wildcardNodes, nodeNameMap, orderedNodes, nodeValuesTransformation));
        if (applyParens) {
          buf.append(")");
        }
      }
    }
    return buf.toString();
  }

  /**
   * Appends the {@code =name } suffix for a node: the {@code nodeNameMap} entry when a map is
   * given, otherwise the sanitized word when {@code useWordAsLabel} is set, otherwise nothing.
   */
  private static void appendSemgrexNodeName(StringBuilder buf, IndexedWord node,
      boolean useWordAsLabel, Map<IndexedWord, String> nodeNameMap) {
    if (nodeNameMap != null) {
      buf.append("=");
      buf.append(nodeNameMap.get(node));
      buf.append(" ");
    } else if (useWordAsLabel) {
      buf.append("=");
      buf.append(sanitizeForSemgrexName(node.word()));
      buf.append(" ");
    }
  }

  /**
   * Same as semgrexFromGraph, except that each node's outgoing edges are visited in sorted
   * order (the helper is invoked with {@code orderedNodes} set to true).  Nodes are labeled
   * and edges are named, and traversal starts from the first root of {@code sg}.
   *
   * @param sg graph to convert
   * @param wildcardNodes nodes to emit as {@code {}}; may be null
   * @param nodeNameMap explicit node names; may be null
   * @param wordTransformation converts a node to its pattern text
   * @return the generated Semgrex pattern string
   * @throws Exception declared for signature compatibility with the other semgrexFromGraph variants
   */
  public static String semgrexFromGraphOrderedNodes(SemanticGraph sg, Collection<IndexedWord> wildcardNodes,
      Map<IndexedWord, String> nodeNameMap, Function<IndexedWord, String> wordTransformation) throws Exception {
    IndexedWord startNode = sg.getFirstRoot();
    Set<IndexedWord> visitedNodes = Generics.newHashSet();
    Set<SemanticGraphEdge> emittedEdges = Generics.newHashSet();

    return semgrexFromGraphHelper(startNode, sg, visitedNodes, emittedEdges, true, true,
        wildcardNodes, nodeNameMap, true, wordTransformation);
  }



  /**
   * Sanitizes the given string into a Semgrex friendly name by replacing each
   * punctuation character below with an underscore-delimited token.  All other
   * characters are copied through unchanged.
   * <p>
   * Note that both {@code \} and {@code /} map to {@code _BSLASH_}, and {@code @}
   * maps to {@code _AND_}; these spellings are kept as they are so that generated
   * names stay stable.
   *
   * @param text the raw text; must not be null
   * @return the text with the listed characters replaced
   */
  public static String sanitizeForSemgrexName(String text) {
    StringBuilder safe = new StringBuilder(text.length());
    for (int i = 0; i < text.length(); i++) {
      char c = text.charAt(i);
      switch (c) {
        case '.':  safe.append("_DOT_");    break;
        case ',':  safe.append("_COMMA_");  break;
        case '\\': safe.append("_BSLASH_"); break;
        case '/':  safe.append("_BSLASH_"); break;
        case '?':  safe.append("_QUES_");   break;
        case '!':  safe.append("_BANG_");   break;
        case '$':  safe.append("_DOL_");    break;
        case '&':  safe.append("_AMP_");    break;
        case ':':  safe.append("_COL_");    break;
        case ';':  safe.append("_SCOL_");   break;
        case '#':  safe.append("_PND_");    break;
        case '@':  safe.append("_AND_");    break;
        case '%':  safe.append("_PER_");    break;
        case '(':  safe.append("_LRB_");    break;
        case ')':  safe.append("_RRB_");    break;
        default:   safe.append(c);
      }
    }
    return safe.toString();
  }


  /**
   * Sets the lemma of every vertex in the given {@code SemanticGraph} in place,
   * computing it from the vertex's word and tag with {@code Morphology.lemmaStatic}.
   *
   * @param sg graph whose vertices are updated
   */
  public static void lemmatize(SemanticGraph sg) {
    for (IndexedWord vertex : sg.vertexSet()) {
      String word = vertex.word();
      String tag = vertex.tag();
      vertex.setLemma(Morphology.lemmaStatic(word, tag));
    }
  }

  /**
   * Given a graph, returns a new graph with the new sentence index enforced.
   * The input graph is copied, and every vertex of the copy is replaced by a fresh
   * {@code IndexedWord} carrying {@code newSentIndex}; the roots of the copy are
   * re-pointed at the replacement vertices.
   * NOTE: new vertices are inserted.
   * TODO: is this ok?  rewrite this?
   *
   * @param sg the graph to copy
   * @param newSentIndex sentence index to set on every vertex of the copy
   * @return the re-indexed copy
   */
  public static SemanticGraph setSentIndex(SemanticGraph sg, int newSentIndex) {
    SemanticGraph reindexed = new SemanticGraph(sg);
    List<IndexedWord> originalRoots = new ArrayList<>(reindexed.getRoots());
    List<IndexedWord> replacementRoots = new ArrayList<>();
    // TODO: we are using vertexListSorted here because we're changing
    // vertices while iterating.  Perhaps there is a better way to do it.
    for (IndexedWord oldVertex : reindexed.vertexListSorted()) {
      IndexedWord newVertex = new IndexedWord(oldVertex);
      newVertex.setSentIndex(newSentIndex);
      SemanticGraphUtils.replaceNode(newVertex, oldVertex, reindexed);
      if (originalRoots.contains(oldVertex)) {
        replacementRoots.add(newVertex);
      }
    }
    reindexed.setRoots(replacementRoots);
    return reindexed;
  }

  //-----------------------------------------------------------------------------------------------
  //   Graph redundancy checks
  //-----------------------------------------------------------------------------------------------


  /**
   * Removes duplicate graphs from the set, using the string form of the graph
   * as the key (obviating issues with object equality).  When several graphs share
   * the same string form, the one encountered last is kept.
   *
   * @param graphs graphs to de-duplicate
   * @return the values view of the internal key-to-graph map, one graph per distinct string form
   */
  public static Collection<SemanticGraph> removeDuplicates(Collection<SemanticGraph> graphs) {
    Map<String, SemanticGraph> map = Generics.newHashMap();
    for (SemanticGraph sg : graphs) {
      // No intern(): the map compares keys with equals(), and interning would only
      // pin each graph's full text in the JVM string pool.
      map.put(sg.toString(), sg);
    }
    return map.values();
  }

  /**
   * Given the set of graphs to remove duplicates from, also removes those on the tabu graphs
   * (and does not include them in the return set).  Graphs are compared by their string form;
   * when several non-tabu graphs share the same string form, the one encountered last is kept.
   *
   * @param graphs graphs to de-duplicate
   * @param tabuGraphs graphs whose string forms must not appear in the result
   * @return the values view of the internal key-to-graph map, one graph per distinct,
   *     non-tabu string form
   */
  public static Collection<SemanticGraph> removeDuplicates(Collection<SemanticGraph> graphs,
      Collection<SemanticGraph> tabuGraphs) {
    // Only the string forms of the tabu graphs are ever consulted, so a set suffices.
    Set<String> tabuKeys = Generics.newHashSet();
    for (SemanticGraph tabuSg : tabuGraphs) {
      tabuKeys.add(tabuSg.toString());
    }
    Map<String, SemanticGraph> map = Generics.newHashMap();
    for (SemanticGraph sg : graphs) {
      // No intern(): keys are compared with equals(), and interning would only
      // pin each graph's full text in the JVM string pool.
      String keyVal = sg.toString();
      if (!tabuKeys.contains(keyVal)) {
        map.put(keyVal, sg);
      }
    }
    return map.values();
  }

  /**
   * Convenience overload of {@code removeDuplicates} for a single tabu graph: wraps
   * {@code tabuGraph} in a one-element collection and delegates to the collection-based overload.
   *
   * @param graphs graphs to de-duplicate
   * @param tabuGraph graph whose string form must not appear in the result
   * @return the result of the collection-based overload
   */
  public static Collection<SemanticGraph> removeDuplicates(Collection<SemanticGraph> graphs,
      SemanticGraph tabuGraph) {
    Collection<SemanticGraph> tabuOnly = Collections.singleton(tabuGraph);
    return removeDuplicates(graphs, tabuOnly);
  }

  // -----------------------------------------------------------------------------------------------
  // Tree matching code
  // -----------------------------------------------------------------------------------------------

  /**
   * Given a CFG Tree parse, and the equivalent SemanticGraph derived from that Tree, generates a mapping
   * from tree nodes to the best-guess SemanticGraph node.
   * Leaves are matched lexically: the nth tree leaf with a given lexical value is paired with the
   * nth graph proxy having the same value.  Each non-leaf tree node is then mapped to the
   * shallowest graph node (shortest path to root) among its already-mapped subtrees.
   * NOTE: not all tree nodes may match a Semgraph node, esp. for tokens removed in a collapsed Semgraph,
   * such as prepositions.
   *
   * @param tree the constituency parse
   * @param sg the dependency graph for the same sentence
   * @return map from positioned tree nodes to graph nodes; unmatched tree nodes are absent
   */
  public static Map<PositionedTree, IndexedWord> mapTreeToSg(Tree tree, SemanticGraph sg) {
    // Depth recorded for nodes with no path to root; it also serves as the initial "best" score
    // below, so such nodes are never selected for a non-terminal.
    final int unreachableDepth = 99999;

    // In order to keep track of positions, we store lists, in order encountered, of lex terms.
    // e.g. treeProxiesByLex.get("the", 2) should point to the same word as semProxiesByLex.get("the", 2).
    // An IndexedWord may contribute several proxies, one per space-separated token of its
    // original text (see IndexedWordProxy.create).
    MapList<String, TreeNodeProxy> treeProxiesByLex = new MapList<>();
    MapList<String, IndexedWordProxy> semProxiesByLex = new MapList<>();

    for (Tree leaf : tree.getLeaves()) {
      for (TreeNodeProxy leafProxy : TreeNodeProxy.create(leaf, tree)) {
        treeProxiesByLex.add(leafProxy.lex, leafProxy);
      }
    }

    Map<IndexedWord, Integer> depthMap = Generics.newHashMap();
    for (IndexedWord node : sg.vertexSet()) {
      List<IndexedWord> pathToRoot = sg.getPathToRoot(node);
      depthMap.put(node, pathToRoot == null ? unreachableDepth : pathToRoot.size());
      for (IndexedWordProxy nodeProxy : IndexedWordProxy.create(node)) {
        semProxiesByLex.add(nodeProxy.lex, nodeProxy);
      }
    }

    // Now the map-lists (string->position encountered indices) are populated,
    // simply go through, finding matches.
    // NOTE: we use PositionedTree instead of keying off of Tree, as
    // hash codes for Tree nodes do not consider position of the tree
    // within a tree: two subtrees with the same layout and child
    // labels will be equal.
    Map<PositionedTree, IndexedWord> treeToNode = Generics.newHashMap();
    for (String lex : treeProxiesByLex.keySet()) {
      int position = 0;
      while (position < treeProxiesByLex.size(lex) && position < semProxiesByLex.size(lex)) {
        Tree matchedLeaf = treeProxiesByLex.get(lex, position).treeNode;
        IndexedWord matchedNode = semProxiesByLex.get(lex, position).node;
        treeToNode.put(new PositionedTree(matchedLeaf, tree), matchedNode);
        position++;
      }
    }

    // Now that a terminals to terminals map has been generated, account for the
    // tree non-terminals.
    for (Tree subtree : tree) {
      if (subtree.isLeaf()) {
        continue;
      }
      IndexedWord shallowestNode = null;
      int shallowestDepth = unreachableDepth;
      for (Tree descendant : subtree) {
        IndexedWord candidate = treeToNode.get(new PositionedTree(descendant, tree));
        if (candidate != null && depthMap.containsKey(candidate)) {
          int candidateDepth = depthMap.get(candidate);
          if (candidateDepth < shallowestDepth) {
            shallowestDepth = candidateDepth;
            shallowestNode = candidate;
          }
        }
      }
      if (shallowestNode != null) {
        treeToNode.put(new PositionedTree(subtree, tree), shallowestNode);
      }
    }

    return treeToNode;
  }

  /**
   * Private helper class for {@code mapTreeToSg}.  Pairs a Tree node with one
   * lexical value, and remembers the root the node is positioned under.
   * @author Eric Yeh
   *
   */
  private static class TreeNodeProxy {
    Tree treeNode;
    String lex;
    Tree root;

    private TreeNodeProxy(Tree intree, String lex, Tree root) {
      this.treeNode = intree;
      this.lex = lex;
      this.root = root;
    }

    /**
     * Creates the proxies for {@code intree}: a single proxy using the label value when it is
     * a leaf, otherwise one proxy per word of its labeled yield.  Every proxy references
     * {@code intree} itself.
     */
    public static List<TreeNodeProxy> create(Tree intree, Tree root) {
      List<TreeNodeProxy> proxies = new ArrayList<>();
      if (intree.isLeaf()) {
        proxies.add(new TreeNodeProxy(intree, intree.label().value(), root));
        return proxies;
      }
      for (LabeledWord yieldWord : intree.labeledYield()) {
        proxies.add(new TreeNodeProxy(intree, yieldWord.word(), root));
      }
      return proxies;
    }

    /** Renders as {@code lex -> tree, #=nodeNumber}, with the node number taken relative to root. */
    @Override
    public String toString() {
      StringBuilder text = new StringBuilder();
      text.append(lex).append(" -> ").append(treeNode.toString());
      text.append(", #=").append(treeNode.nodeNumber(root));
      return text.toString();
    }
  }

  /**
   * This is used to uniquely index trees within a
   * Tree, maintaining the position of this subtree
   * within the context of the root.  The position is the
   * value of {@code tree.nodeNumber(root)}, computed once at construction.
   * @author Eric Yeh
   *
   */
  public static class PositionedTree {
    Tree tree;
    Tree root;
    int nodeNumber;

    public PositionedTree(Tree tree, Tree root) {
      this.tree = tree;
      this.root = root;
      this.nodeNumber = tree.nodeNumber(root);
    }

    /** Renders as the subtree followed by {@code .} and its node number. */
    @Override
    public String toString() {
      return tree+"."+nodeNumber;
    }

    /**
     * Two positioned trees are equal when their subtrees are equal, their roots are equal,
     * and they have the same node number.
     */
    @Override
    public boolean equals(Object obj) {
      if (!(obj instanceof PositionedTree)) {
        return false;
      }
      PositionedTree other = (PositionedTree) obj;
      if (!tree.equals(other.tree)) {
        return false;
      }
      if (!root.equals(other.root)) {
        return false;
      }
      return other.nodeNumber == nodeNumber;
    }

    /**
     * Combines the subtree hash, the root hash shifted left by 8 bits, and the node number.
     * Note that every {@code ^} here is bitwise XOR (not exponentiation).
     * TODO: verify this is correct
     */
    @Override
    public int hashCode() {
      int treePart = tree.hashCode();
      int rootPart = root.hashCode() << 8;
      int positionPart = 2 ^ nodeNumber;
      return treePart ^ rootPart ^ positionPart;
    }
  }

  /**
   * Private helper class for {@code mapTreeToSg}.  Pairs an IndexedWord
   * (in a SemanticGraph) with one lexical value.
   * @author lumberjack
   *
   */
  private static final class IndexedWordProxy {
    IndexedWord node;
    String lex;

    private IndexedWordProxy(IndexedWord node, String lex) {
      this.node = node;
      this.lex = lex;
    }

    /**
     * Generates the IndexedWordProxy objects for a node.  If the node's originalText() is
     * non-empty, it is split on single spaces and one proxy is created per token, in order
     * encountered, each referencing this node but using the token as its lexical value.
     * Otherwise a single proxy using the value of word() is created.
     * This is used to retain attribution to the originating node.
     */
    public static List<IndexedWordProxy> create(IndexedWord node) {
      List<IndexedWordProxy> proxies = new ArrayList<>();
      String original = node.originalText();
      if (original.isEmpty()) {
        proxies.add(new IndexedWordProxy(node, node.word()));
        return proxies;
      }
      for (String token : original.split(" ")) {
        proxies.add(new IndexedWordProxy(node, token));
      }
      return proxies;
    }

    /** Renders as {@code lex -> word:sentIndex.index}. */
    @Override
    public String toString() {
      return lex+" -> "+node.word()+":"+node.sentIndex()+"."+node.index();
    }
  }
}