package edu.uncc.cs.watsonsim.nlp;

import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.trees.GrammaticalRelation;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
import edu.stanford.nlp.util.CoreMap;
//import edu.stanford.nlp.util.*;

/**
 * Namespace for the preconfigured NLP pipeline and a few helpers for
 * working with the parse trees and dependency graphs it produces.
 *
 * @author Wlodek
 * @author Sean Gallagher
 */
public class Trees {
	/** Shared pipeline, built once when this class is loaded. */
	static final StanfordCoreNLP pipeline;
	static {
		// Creates a StanfordCoreNLP object that tokenizes, splits sentences,
		// POS-tags and parses. No lemma, NER or coreference annotators are
		// configured here.
		Properties props = new Properties();
		props.put("annotators", "tokenize, ssplit, pos, parse");
		props.put("parse.model", "edu/stanford/nlp/models/srparser/englishSR.ser.gz");
		pipeline = new StanfordCoreNLP(props);
	}

	/**
	 * Run the preconfigured pipeline over some text and return its sentences.
	 *
	 * Callers read per-sentence results out of each CoreMap using annotation
	 * class keys such as {@link TreeAnnotation} or
	 * {@link CollapsedCCProcessedDependenciesAnnotation}.
	 *
	 * @param text The raw text to annotate
	 * @return The annotated sentences; an empty list if the pipeline did not
	 *         produce a sentence annotation
	 */
	public static List<CoreMap> parse(String text) {
		// create an empty Annotation just with the given text
		Annotation document = new Annotation(text);

		// run all configured Annotators on this text
		pipeline.annotate(document);

		// a CoreMap is essentially a Map that uses class objects as keys
		// and has values with custom types
		List<CoreMap> sentences = document.get(SentencesAnnotation.class);
		if (sentences == null) {
			// Defensive: hand back an empty list rather than null so that
			// callers can iterate over the result unconditionally.
			return new ArrayList<>();
		}
		return sentences;
	}

	/**
	 * Concatenate the leaves of a parse tree, interspersed with spaces
	 * @param t The input tree
	 * @return The resulting space-delimited string (empty if there are no
	 *         leaves)
	 */
	public static String concat(Tree t) {
		StringBuilder b = new StringBuilder();
		boolean first = true;
		for (Tree l : t.getLeaves()) {
			// Put the separator between leaves instead of trimming a trailing
			// one afterwards, which would fail on an empty builder.
			if (!first) {
				b.append(' ');
			}
			b.append(l.value());
			first = false;
		}
		return b.toString();
	}

	/**
	 * Most of the time it's more helpful to know the more specific grammatical
	 * relations like "prep_of" than it is to bridge two words across an "of".
	 * @param rel The grammatical relation to name
	 * @return The short name joined to the specific part with an underscore
	 *         if a specific part is known. Otherwise it's just the short name.
	 */
	public static String getSpecificPreps(GrammaticalRelation rel) {
		String rel_name = rel.getShortName();
		if (rel.getSpecific() != null) {
			rel_name += "_" + rel.getSpecific();
		}
		return rel_name;
	}

	/**
	 * The bare one-word nouns are usually not very good.
	 * "group", "set" and such are especially bad as they are basically just
	 * container types: "group of diseases", "set of rules"
	 * So we concat a few kinds of links to nouns.
	 *
	 * @param graph The dependency graph that contains the word
	 * @param rightmost The word to expand; it is placed last in the result
	 * @return The "nn", "cd" and "prep_of" dependents of the word, each
	 *         followed by a space, then the word's original text. If the word
	 *         is not tagged as a noun (tag starting with "NN") or has no tag,
	 *         only its original text is returned.
	 */
	public static String concatNoun(SemanticGraph graph, IndexedWord rightmost) {
		String tag = rightmost.tag();
		// Only actually build on nouns. An untagged word is treated like a
		// non-noun instead of causing a NullPointerException.
		if (tag == null || !tag.startsWith("NN")) {
			return rightmost.originalText();
		}

		StringBuilder phrase = new StringBuilder();
		for (SemanticGraphEdge edge : graph.outgoingEdgeIterable(rightmost)) {
			switch (edge.getRelation().getShortName()) {
			case "nn":
			case "cd":
			//case "amod":
				phrase.append(lemmaOrWord(edge.getDependent()));
				phrase.append(' ');
				break;
			case "prep":
				if (getSpecificPreps(edge.getRelation())
						.equals("prep_of")) {
					phrase.append(lemmaOrWord(edge.getDependent()));
					phrase.append(' ');
				}
				break;
			}
		}
		return phrase.append(rightmost.originalText()).toString();
	}

	/**
	 * Pick the lemma of a word, falling back to its surface form.
	 *
	 * The pipeline configured in this class has no lemma annotator, so
	 * lemma() may be null for words from its graphs; appending that null
	 * would put the literal text "null" into the phrase.
	 */
	private static String lemmaOrWord(IndexedWord word) {
		String lemma = word.lemma();
		return lemma != null ? lemma : word.word();
	}
}