/*******************************************************************************
* Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
package tml.utils;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Hashtable;
import java.util.List;
import org.apache.log4j.Logger;
import tml.annotators.PennTreeAnnotator;
import edu.stanford.nlp.trees.GrammaticalStructure;
import edu.stanford.nlp.trees.GrammaticalStructureFactory;
import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.PennTreeReader;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreePrint;
import edu.stanford.nlp.trees.TypedDependency;
import edu.stanford.nlp.trees.tregex.ParseException;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexPattern;
/**
* Class to consolidate the calls to the Stanford parser
* @author Jorge Villalon
*
*/
public class StanfordUtils {
private static Logger logger = Logger.getLogger(StanfordUtils.class);
private static GrammaticalStructureFactory factory = null;
private static GrammaticalStructureFactory getGrammaticalStructureFactory() throws IOException {
if(factory == null) {
factory = PennTreeAnnotator.getGrammaticalStructureFactory();
}
return factory;
}
private static Hashtable<String, Tree> pennTreeCache = new Hashtable<String, Tree>();
/**
* @param t a grammar tree to extract the verbs
* @return a list of verbs in the tree, an empty list if nothing is found.
*/
public static List<String> extractVerbs(Tree t) {
List<String> verbs = new ArrayList<String>();
if(t == null)
return verbs;
TregexPattern pattern = null;
try {
pattern = TregexPattern
.compile("/VB.?/");
TregexMatcher matcher = pattern.matcher(t);
while (matcher.findNextMatchingNode()) {
String content = cleanNodeContent(nodeContent(matcher.getMatch()));
if(content.trim().length()>0)
verbs.add(content);
}
} catch (ParseException e) {
logger.error(e);
}
return verbs;
}
/**
* @param t the tree to which extract the content
* @return the string with the content of the tree
*/
public static String nodeContent(Tree t, Tree pv) {
if(t.isLeaf())
return t.value();
StringBuffer buff = new StringBuffer();
for(Tree tt : t.children()) {
if(!t.value().equals("DT") &&
!t.value().equals("SYM") &&
!t.value().startsWith("PRP")) {
buff.append(nodeContent(tt, t));
buff.append(" ");
}
}
String clean = buff.toString().replace("\\s+", " ").trim();
return clean;
}
/**
* @param t the grammar tree
* @return a list with all concepts identified in a tree
*/
public static List<String> extractNouns(Tree t) {
List<String> concepts = new ArrayList<String>();
if(t == null)
return concepts;
TregexPattern pattern;
try {
// This pattern means a noun phrase that is not dominating another
// noun phrase, and is also not dominating a verbal phrase
pattern = TregexPattern.compile("@NP !<< NP & !<<@VP");
TregexMatcher matcher = pattern.matcher(t);
while (matcher.findNextMatchingNode()) {
String content = cleanNodeContent(nodeContent(matcher.getMatch(), null));
if(content.trim().length() > 0)
concepts.add(content);
}
} catch (ParseException e) {
logger.error(e);
}
return concepts;
}
/**
* Added to remove punctuation from the strings extracted from the tree
* @param content any string containing punctuation at beginning or end
* @return the string without trailing or tailing punctuation
*/
public static String cleanNodeContent(String content) {
String cleanContent = content.trim();
cleanContent = cleanContent.replaceFirst("^\\W+", "");
cleanContent = cleanContent.replaceFirst("\\W+$", "");
cleanContent = cleanContent.replaceAll("\\s+", " ");
return cleanContent.trim();
}
/**
* @param t the tree to which extract the content
* @return the string with the content of the tree
*/
public static String nodeContent(Tree t) {
if(t.isLeaf())
return t.value();
StringBuffer buff = new StringBuffer();
for(Tree tt : t.children()) {
buff.append(nodeContent(tt));
buff.append(" ");
}
String clean = buff.toString().replace("\\s+", " ").trim();
return clean;
}
/**
* Calculates a Penn grammatical tree from its string representation
* @param pennTreeString the string
* @return the grammar tree
* @throws Exception
*/
public static Tree getTreeFromString(String passageId, String pennTreeString) {
double time = System.nanoTime();
time = System.nanoTime() - time;
Tree t = null;
if(pennTreeCache.containsKey(passageId)) {
t = pennTreeCache.get(passageId);
} else {
LabeledScoredTreeFactory tf = new LabeledScoredTreeFactory();
PennTreeReader reader = new PennTreeReader(new StringReader(pennTreeString), tf);
try {
t = reader.readTree();
pennTreeCache.put(passageId, t);
} catch (IOException e) {
logger.error("Error parsing penntree string length " + pennTreeString.length());
e.printStackTrace();
return null;
}
}
logger.debug("PennTree calculated in " + time * 10E-6 + " milliseconds.");
return t;
}
/**
* Calculates the typed dependencies from a grammatical tree
* @param tree the grammatical tree
*/
public static List<String> calculateTypedDependencies(Tree tree) {
double time = System.nanoTime();
List<String> output = new ArrayList<String>();
GrammaticalStructure gs = null;
try {
gs = getGrammaticalStructureFactory().newGrammaticalStructure(tree);
} catch (Exception e) {
logger.error(e);
return null;
}
Collection<TypedDependency> tdl = gs.typedDependenciesCollapsed();
// Get the POS tag from each word
Hashtable<String, String> posInfo = new Hashtable<String, String>();
for(Tree t: tree.getLeaves()) {
Tree pt = null;
for(Tree tt : tree.dominationPath(t)) {
if(tt.isLeaf()) {
posInfo.put(tt.nodeString(), pt.nodeString());
}
pt = tt;
}
}
for (Object obj : tdl.toArray()) {
TypedDependency dep = (TypedDependency) obj;
String wordGov = dep.gov().nodeString().split("-")[0];
String wordDep = dep.dep().nodeString().split("-")[0];
String posGov = posInfo.get(wordGov);
String posDep = posInfo.get(wordDep);
String dependencyString = dep.reln().toString() + "(" + dep.gov().pennString().trim() + "-" + posGov + ", " + dep.dep().pennString().trim() + "-" + posDep + ")";
output.add(dependencyString);
}
time = System.nanoTime() - time;
logger.debug("Typed dependencies obtained in " + time * 10E-6 + " milliseconds");
return output;
}
public static String removeDeterminersFromNounPhrase(String phrase) throws IOException {
Tree tree = getPennTree(phrase);
return nodeContent(tree,null);
}
public static String getPennString(Tree tree) {
String pennTreeString = "";
double time = System.nanoTime();
TreePrint print = new TreePrint("penn");
StringWriter stw = new StringWriter();
print.printTree(tree, new PrintWriter(stw));
pennTreeString = stw.toString();
time = (System.nanoTime() - time) * 10E-9;
logger.debug("Sentence parsed in " + time + " seconds");
return pennTreeString;
}
public static Tree getPennTree(String text) throws IOException {
// text = text.trim();
// text = text.replaceAll("\"", "");
// if(text.endsWith("."))
// text = text.substring(0, text.length()-1);
// String[] sentenceWords = text.split("\\s+");
// Tree tree = PennTreeAnnotator.getParser().apply(Arrays
// .asList(sentenceWords));
Tree tree = PennTreeAnnotator.getParser().apply(text);
return tree;
}
public static String getPennTagMinimalPhrase(Tree t) {
if(t.isLeaf())
return "LEAF";
if(t.isPrePreTerminal())
return t.value();
return getPennTagMinimalPhrase(t.children()[0]);
}
public static String getPennTagFirstBranch(Tree orig, Tree t, Tree pt) {
if(t.isLeaf())
return "NOBRANCH";
List<Tree> trees = t.siblings(orig);
if(trees != null && trees.size() > 0 && pt != null)
return pt.value();
return getPennTagFirstBranch(orig, t.getChild(0), t);
}
}