package edu.stanford.nlp.trees.ud;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.process.Morphology;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.trees.EnglishPatterns;
import edu.stanford.nlp.trees.GrammaticalRelation;
import edu.stanford.nlp.trees.MemoryTreebank;
import edu.stanford.nlp.trees.NPTmpRetainingTreeNormalizer;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.Treebank;
import edu.stanford.nlp.trees.UniversalEnglishGrammaticalRelations;
import edu.stanford.nlp.trees.UniversalPOSMapper;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.util.logging.Redwood;
/**
*
* Adds lemmata and features to an English CoNLL-U dependencies
* treebank.
*
* @author Sebastian Schuster
*
*/
public class UniversalDependenciesFeatureAnnotator {
/** A logger for this class */
private static final Redwood.RedwoodChannels log = Redwood.channels(UniversalDependenciesFeatureAnnotator.class);
private static final String FEATURE_MAP_FILE = "edu/stanford/nlp/models/ud/feature_map.txt";
private HashMap<String,HashMap<String,String>> posFeatureMap;
private HashMap<String,HashMap<String,String>> wordPosFeatureMap;
private final Morphology morphology = new Morphology();
public UniversalDependenciesFeatureAnnotator() throws IOException {
loadFeatureMap();
}
private void loadFeatureMap() throws IOException {
Reader r = IOUtils.readerFromString(FEATURE_MAP_FILE);
BufferedReader br = new BufferedReader(r);
posFeatureMap = new HashMap<>();
wordPosFeatureMap = new HashMap<>();
String line;
while ((line = br.readLine()) != null) {
String[] parts = line.split("\\s+");
if (parts.length < 3) continue;
if (parts[0].equals("*")) {
posFeatureMap.put(parts[1], CoNLLUUtils.parseFeatures(parts[2]));
} else {
wordPosFeatureMap.put(parts[0] + '_' + parts[1], CoNLLUUtils.parseFeatures(parts[2]));
}
}
}
private HashMap<String,String> getPOSFeatures(String word, String pos) {
HashMap<String, String> features = new HashMap<>();
String wordPos = word.toLowerCase() + '_' + pos;
if (wordPosFeatureMap.containsKey(wordPos)) {
features.putAll(wordPosFeatureMap.get(wordPos));
} else if (posFeatureMap.containsKey(pos)) {
features.putAll(posFeatureMap.get(pos));
}
if (isOrdinal(word, pos)) {
features.put("NumType", "Ord");
}
if (isMultiplicative(word, pos)) {
features.put("NumType", "Mult");
}
return features;
}
private static final String ORDINAL_EXPRESSION = "^(first|second|third|fourth|fifth|sixth|seventh|eigth|ninth|tenth|([0-9,.]+(th|st|nd|rd)))$";
private static final String MULTIPLICATIVE_EXPRESSION = "^(once|twice)$";
private static boolean isOrdinal(String word, String pos) {
if ( ! pos.equals("JJ"))
return false;
return word.toLowerCase().matches(ORDINAL_EXPRESSION);
}
private static boolean isMultiplicative(String word, String pos) {
if ( ! pos.equals("RB"))
return false;
return word.toLowerCase().matches(MULTIPLICATIVE_EXPRESSION);
}
private static String SELF_REGEX = EnglishPatterns.selfRegex.replace("/", "");
private static HashMap<String, String> getGraphFeatures(SemanticGraph sg, IndexedWord word) {
HashMap<String, String> features = new HashMap<>();
/* Determine the case of "you". */
if (word.tag().equals("PRP") &&
(word.value().equalsIgnoreCase("you") ||
word.value().equalsIgnoreCase("it"))) {
features.put("Case", pronounCase(sg, word));
}
/* Determine the person of "was". */
if (word.tag().equals("VBD") &&
word.value().equalsIgnoreCase("was")) {
String person = wasPerson(sg, word);
if (person != null) {
features.put("Person", person);
}
}
/* Determine features of relative and interrogative pronouns. */
features.putAll(getRelAndIntPronFeatures(sg, word));
/* Determine features of gerunds and present participles. */
if (word.tag().equals("VBG")) {
if (hasBeAux(sg, word)) {
features.put("VerbForm", "Part");
features.put("Tense", "Pres");
} else {
features.put("VerbForm", "Ger");
}
}
/* Determine whether reflexive pronoun is reflexive or intensive. */
if (word.value().matches(SELF_REGEX) && word.tag().equals("PRP")) {
IndexedWord parent = sg.getParent(word);
if (parent != null) {
SemanticGraphEdge edge = sg.getEdge(parent, word);
if (edge.getRelation() != UniversalEnglishGrammaticalRelations.NP_ADVERBIAL_MODIFIER) {
features.put("Case", "Acc");
features.put("Reflex", "Yes");
}
}
}
/* Voice feature. */
if (word.tag().equals("VBN")) {
if (sg.hasChildWithReln(word, UniversalEnglishGrammaticalRelations.AUX_PASSIVE_MODIFIER)) {
features.put("Voice", "Pass");
}
}
return features;
}
/**
* Determine the case of the pronoun "you" or "it".
*/
private static String pronounCase(SemanticGraph sg, IndexedWord word) {
word = sg.getNodeByIndex(word.index());
IndexedWord parent = sg.getParent(word);
if (parent != null) {
SemanticGraphEdge edge = sg.getEdge(parent, word);
if (edge != null) {
if (UniversalEnglishGrammaticalRelations.OBJECT.isAncestor(edge.getRelation())) {
/* "you" is an object. */
return "Acc";
} else if (UniversalEnglishGrammaticalRelations.NOMINAL_MODIFIER.isAncestor(edge.getRelation())
|| edge.getRelation() == GrammaticalRelation.ROOT) {
if (sg.hasChildWithReln(word, UniversalEnglishGrammaticalRelations.CASE_MARKER)) {
/* "you" is the head of a prepositional phrase. */
return "Acc";
}
}
}
}
return "Nom";
}
/**
* Determine the person of "was".
*/
private static String wasPerson(SemanticGraph sg, IndexedWord word) {
IndexedWord subj = sg.getChildWithReln(word, UniversalEnglishGrammaticalRelations.NOMINAL_SUBJECT);
if (subj == null) {
subj = sg.getChildWithReln(word, UniversalEnglishGrammaticalRelations.NOMINAL_PASSIVE_SUBJECT);
}
if (subj != null) {
if (subj.word().equalsIgnoreCase("i")) {
/* "I" is the subject of "was". */
return "1";
}
}
IndexedWord parent = sg.getParent(word);
if (parent == null) {
return subj != null ? "3" : null;
}
SemanticGraphEdge edge = sg.getEdge(parent, word);
if (edge == null) {
return subj != null ? "3" : null;
}
if (UniversalEnglishGrammaticalRelations.AUX_MODIFIER.equals(edge.getRelation()) ||
UniversalEnglishGrammaticalRelations.AUX_PASSIVE_MODIFIER.equals(edge.getRelation())) {
return wasPerson(sg, parent);
}
if (UniversalEnglishGrammaticalRelations.CONJUNCT.isAncestor(edge.getRelation())) {
/* Check if the subject of the head of a conjunction is "I". */
return wasPerson(sg, parent);
}
return "3";
}
/**
* Extracts features from relative and interrogative pronouns.
*/
private static HashMap<String, String> getRelAndIntPronFeatures(SemanticGraph sg, IndexedWord word) {
HashMap<String, String> features = new HashMap<>();
if (word.tag().startsWith("W")) {
boolean isRel = false;
IndexedWord parent = sg.getParent(word);
if (parent != null) {
IndexedWord parentParent = sg.getParent(parent);
if (parentParent != null) {
SemanticGraphEdge edge = sg.getEdge(parentParent, parent);
isRel = edge.getRelation().equals(UniversalEnglishGrammaticalRelations.RELATIVE_CLAUSE_MODIFIER);
}
}
if (isRel) {
features.put("PronType", "Rel");
} else {
if (word.value().equalsIgnoreCase("that")) {
features.put("PronType", "Dem");
} else {
features.put("PronType", "Int");
}
}
}
return features;
}
private static Iterator<Tree> treebankIterator(String path) {
/* Remove empty nodes and strip indices from internal nodes but keep
functional tags. */
Treebank tb = new MemoryTreebank(new NPTmpRetainingTreeNormalizer(0, false, 1, false));
tb.loadPath(path);
return tb.iterator();
}
private static TregexPattern IMPERATIVE_PATTERN = TregexPattern.compile("__ > VB >+(/^[^S]/) S-IMP");
/**
* Returns the indices of all imperative verbs in the
* tree t.
*
*/
private static Set<Integer> getImperatives(Tree t) {
Set<Integer> imps = new HashSet<>();
TregexMatcher matcher = IMPERATIVE_PATTERN.matcher(t);
while (matcher.find()) {
List<Label> verbs = matcher.getMatch().yield();
CoreLabel cl = (CoreLabel) verbs.get(0);
imps.add(cl.index());
}
return imps;
}
/**
* Returns true if {@code word} has an auxiliary verb attached to it.
*
*/
@SuppressWarnings("unused")
private static boolean hasAux(SemanticGraph sg, IndexedWord word) {
if (sg.hasChildWithReln(word, UniversalEnglishGrammaticalRelations.AUX_MODIFIER)) {
return true;
}
IndexedWord gov = sg.getParent(word);
if (gov != null) {
SemanticGraphEdge edge = sg.getEdge(gov, word);
if (UniversalEnglishGrammaticalRelations.CONJUNCT.isAncestor(edge.getRelation()) ||
UniversalEnglishGrammaticalRelations.COPULA.equals(edge.getRelation())) {
return hasAux(sg, gov);
}
}
return false;
}
/**
* Returns true if {@code word} has an infinitival "to" attached to it.
*/
@SuppressWarnings("unused")
private static boolean hasTo(SemanticGraph sg, IndexedWord word) {
/* Check for infinitival to. */
if (sg.hasChildWithReln(word, UniversalEnglishGrammaticalRelations.MARKER)) {
for (IndexedWord marker : sg.getChildrenWithReln(word, UniversalEnglishGrammaticalRelations.MARKER)) {
if (marker.value().equalsIgnoreCase("to")) {
return true;
}
}
}
return false;
}
private static String BE_REGEX = EnglishPatterns.beAuxiliaryRegex.replace("/", "");
/**
* Returns true if {@code word} has an inflection of "be" as an auxiliary.
*/
private static boolean hasBeAux(SemanticGraph sg, IndexedWord word) {
for (IndexedWord aux : sg.getChildrenWithReln(word, UniversalEnglishGrammaticalRelations.AUX_MODIFIER)) {
if (aux.value().matches(BE_REGEX)) {
return true;
}
}
/* Check if head of conjunction has an auxiliary in case the word is part of a conjunction */
IndexedWord gov = sg.getParent(word);
if (gov != null) {
SemanticGraphEdge edge = sg.getEdge(gov, word);
if (UniversalEnglishGrammaticalRelations.CONJUNCT.isAncestor(edge.getRelation())) {
return hasBeAux(sg, gov);
}
}
return false;
}
public void addFeatures(SemanticGraph sg, Tree t, boolean addLemma, boolean addUPOS) {
Set<Integer> imperatives = t != null ? getImperatives(t) : new HashSet<>();
for (IndexedWord word : sg.vertexListSorted()) {
String posTag = word.get(CoreAnnotations.PartOfSpeechAnnotation.class);
String token = word.get(CoreAnnotations.TextAnnotation.class);
Integer index = word.get(CoreAnnotations.IndexAnnotation.class);
HashMap<String, String> wordFeatures = word.get(CoreAnnotations.CoNLLUFeats.class);
if (wordFeatures == null) {
wordFeatures = new HashMap<>();
word.set(CoreAnnotations.CoNLLUFeats.class, wordFeatures);
}
/* Features that only depend on the word and the PTB POS tag. */
wordFeatures.putAll(getPOSFeatures(token, posTag));
/* Semantic graph features. */
wordFeatures.putAll(getGraphFeatures(sg, word));
/* Handle VBs. */
if (imperatives.contains(index)) {
/* Imperative */
wordFeatures.put("VerbForm", "Fin");
wordFeatures.put("Mood", "Imp");
} else if (posTag.equals("VB")) {
/* Infinitive */
wordFeatures.put("VerbForm", "Inf");
/* Subjunctive detection too unreliable. */
//} else {
// /* Present subjunctive */
// wordFeatures.put("VerbForm", "Fin");
// wordFeatures.put("Tense", "Pres");
// wordFeatures.put("Mood", "Subj");
//}
}
String lemma = word.get(CoreAnnotations.LemmaAnnotation.class);
if (addLemma && (lemma == null || lemma.equals("_"))) {
word.set(CoreAnnotations.LemmaAnnotation.class, morphology.lemma(token, posTag));
}
}
if (addUPOS && t != null) {
t = UniversalPOSMapper.mapTree(t);
List<Label> uPOSTags = t.preTerminalYield();
List<IndexedWord> yield = sg.vertexListSorted();
int len = yield.size();
for (IndexedWord word : yield) {
Label uPOSTag = uPOSTags.get(word.index() - 1);
word.set(CoreAnnotations.CoarseTagAnnotation.class, uPOSTag.value());
}
}
}
public static void main(String[] args) throws IOException {
if (args.length < 2) {
log.info("Usage: ");
log.info("java ");
log.info(UniversalDependenciesFeatureAnnotator.class.getCanonicalName());
log.info(" CoNLL-U_file tree_file [-addUPOS -escapeParenthesis]");
return;
}
String coNLLUFile = args[0];
String treeFile = args[1];
boolean addUPOS = false;
boolean escapeParens = false;
for (int i = 2; i < args.length; i++) {
if (args[i].equals("-addUPOS")) {
addUPOS = true;
} else if (args[i].equals("-escapeParenthesis")) {
escapeParens = true;
}
}
UniversalDependenciesFeatureAnnotator featureAnnotator = new UniversalDependenciesFeatureAnnotator();
Reader r = IOUtils.readerFromString(coNLLUFile);
CoNLLUDocumentReader depReader = new CoNLLUDocumentReader();
CoNLLUDocumentWriter depWriter = new CoNLLUDocumentWriter();
Iterator<SemanticGraph> it = depReader.getIterator(r);
Iterator<Tree> treeIt = treebankIterator(treeFile);
while (it.hasNext()) {
SemanticGraph sg = it.next();
Tree t = treeIt.next();
if (t == null || t.yield().size() != sg.size()) {
StringBuilder sentenceSb = new StringBuilder();
for (IndexedWord word : sg.vertexListSorted()) {
sentenceSb.append(word.get(CoreAnnotations.TextAnnotation.class));
sentenceSb.append(' ');
}
throw new RuntimeException("CoNLL-U file and tree file are not aligned. \n"
+ "Sentence: " + sentenceSb + '\n'
+ "Tree: " + t.pennString());
}
featureAnnotator.addFeatures(sg, t, true, addUPOS);
System.out.print(depWriter.printSemanticGraph(sg, !escapeParens));
}
}
}