package edu.uncc.cs.watsonsim;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Properties;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;
import java.util.stream.Stream;
import static java.util.stream.Collectors.toList;
import java.util.Arrays;
import org.apache.commons.lang3.StringEscapeUtils;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import edu.stanford.nlp.dcoref.CorefChain.CorefMention;
import edu.stanford.nlp.dcoref.CorefCoreAnnotations.CorefChainAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Pair;
import static edu.stanford.nlp.util.Pair.makePair;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
/**
 * A String plus lazily computed NLP annotations: tokens, constituency
 * Trees, semantic graphs, lemmas, and coreference mentions.
 */
public class Phrase {
public final String text;
private static final Cache<String, Phrase> recent;
// Cached annotations, keyed by the annotator function that produced them
private transient ConcurrentHashMap<Function<? extends Phrase, ?>, Object> memos;
public transient Log log = Log.NIL;
// Create a pipeline
private static final StanfordCoreNLP pipeline;
private static final StanfordCoreNLP constituency_parse_pipeline;
static {
pipeline = makeCoreNLPPipeline("tokenize, cleanxml, ssplit, pos, lemma, parse");
constituency_parse_pipeline = makeCoreNLPPipeline("tokenize, cleanxml, ssplit, pos, lemma, parse");
// Save time by caching some, but not too many, recent parses.
recent = CacheBuilder.newBuilder()
.concurrencyLevel(50)
.maximumSize(10000)
.weakValues()
.build();
}
/** These pipelines are also used by other parts of the system, so we
 * build them statically here and share them.
 * @param annotators comma-separated list of CoreNLP annotator names
 * @return a configured StanfordCoreNLP pipeline
 */
private static StanfordCoreNLP makeCoreNLPPipeline(String annotators) {
// Creates an NLP pipeline without ner or dcoref
Properties props = new Properties();
props.put("annotators", annotators);
// Use the faster parsing but slower loading shift-reduce models
props.put("parse.model", "edu/stanford/nlp/models/srparser/englishSR.ser.gz");
// When you find something untokenizable, delete it and don't whine
props.put("tokenize.options", "untokenizable=noneDelete");
return new StanfordCoreNLP(props);
}
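/*
 * For example (a sketch only; no such pipeline is used in this class),
 * a lightweight tokenizer-only pipeline could be built the same way:
 *
 *   StanfordCoreNLP tokenizer = makeCoreNLPPipeline("tokenize, ssplit");
 */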
/**
* This no-args constructor exists solely for deserialization
*/
private Phrase() {
text = "";
memos = new ConcurrentHashMap<>();
log = Log.NIL;
}
/**
* Create a new NLP parsed phrase.
* This will throw an NPE rather than take null text.
* The memo system is lazy and phrases are cached so this is quite cheap.
*/
public Phrase(String text) {
if (text == null)
throw new NullPointerException("Text cannot be null.");
Phrase cache_entry = recent.getIfPresent(text);
if (cache_entry != null) {
this.text = cache_entry.text;
// Memos are mutable but private and thread-safe.
this.memos = cache_entry.memos;
} else {
this.memos = new ConcurrentHashMap<>();
this.text = StringEscapeUtils.unescapeXml(text);
recent.put(text, this);
}
}
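/*
 * A minimal usage sketch: constructing the same text twice shares one
 * memo table, so expensive annotations are computed at most once.
 *
 *   Phrase a = new Phrase("Who wrote Hamlet?");
 *   Phrase b = new Phrase("Who wrote Hamlet?");
 *   // a.equals(b), and b.getTrees() reuses any parse computed via a
 */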
/**
* Lightweight functional annotations. Either apply the function and get
* the result, or if it has been done, return the existing value.
*
* Here's the cute part: an annotator can call memo() recursively, which
* lets you compose annotators into pipelines.
*
* There are caveats: you need to be sure your function's input type
* matches the type you are annotating, or you will get runtime errors.
* The output types, however, are checked at compile time. This is
* fixable, but it makes the API uglier, so we don't enforce it.
* Also, if your annotator returns null, the result will not be cached.
* So if your annotator is expensive, return some singleton instead.
*/
@SuppressWarnings("unchecked")
public <X, T extends Phrase> X memo(Function<T, X> app) {
/*
* Atomicity is not necessary here because the functions are
* idempotent. Enforcing atomicity can cause a deadlock, because
* memo() needs to be reentrant. Instead, just allow duplicate put()'s
*/
X output = (X) memos.get(app);
if (output == null)
output = app.apply((T) this);
if (output != null)
memos.put(app, output);
return output;
}
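/*
 * A sketch of a custom annotator (hypothetical; not part of this class).
 * Note that the cache is keyed by function identity, since lambdas and
 * method references don't override equals(): hold your annotator in a
 * static final field, or a fresh lambda on each call will never hit.
 *
 *   static final Function<Phrase, Integer> treeCount =
 *       p -> p.memo(Phrase.trees).size();
 *   ...
 *   int n = new Phrase("It rained.").memo(treeCount);
 */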
/*
* Convenience functions for common annotations
*/
private static final Function<Phrase, Annotation> coreNLP = Phrase::_coreNLP;
private static Annotation _coreNLP(Phrase p) {
// create an empty Annotation just with the given text
Annotation document = new Annotation(p.text);
try {
// run all Annotators on this text
pipeline.annotate(document);
} catch (IllegalArgumentException | NullPointerException ex) {
/*
* On extremely rare occasions (< 0.00000593% of passages)
* it will throw an error like the following:
*
* Exception in thread "main" java.lang.IllegalArgumentException:
* No head rule defined for SYM using class edu.stanford.nlp.trees.SemanticHeadFinder in SYM-10
*
* On more frequent occasions, you get the following:
* Exception in thread "main" java.lang.NullPointerException
* at edu.stanford.nlp.dcoref.RuleBasedCorefMentionFinder.findHead(RuleBasedCorefMentionFinder.java:276)
*
* Both of these are fatal for the passage.
* Neither are a big deal for the index. Forget them.
*/
}
return document;
}
/**
* Return CoreNLP sentences.
* Never returns null, only empty collections.
*/
private static final Function<Phrase, List<CoreMap>> sentences = Phrase::_sentences;
private static List<CoreMap> _sentences(Phrase p) {
return Optional.ofNullable(
p.memo(Phrase.coreNLP)
.get(SentencesAnnotation.class))
.orElse(Collections.emptyList());
}
/**
* Return CoreNLP constituency trees
*/
public static final Function<Phrase, List<Tree>> trees = Phrase::_trees;
private static List<Tree> _trees(Phrase p) {
// Fetch the memoized base annotation rather than creating a new one
Annotation document = p.memo(Phrase.coreNLP);
try {
// Run the constituency parser over the same annotation
constituency_parse_pipeline.annotate(document);
} catch (IllegalArgumentException | NullPointerException ex) {
/*
 * The same rare, passage-fatal parser failures documented in
 * _coreNLP() above can occur here. They don't matter for the
 * index, so we ignore them here as well.
 */
}
return p.memo(Phrase.sentences)
.stream()
.map(s -> s.get(TreeAnnotation.class))
.filter(Objects::nonNull)
.collect(toList());
}
/**
* Return Lucene tokens
*/
public static final Function<Phrase, List<String>> tokens = Phrase::_tokens;
private static List<String> _tokens(Phrase p) {
return StringUtils.tokenize(p.text);
}
/**
* Return very lightly processed tokens.
* TODO: Imitate the token processing in GloVe
*/
public static final Function<Phrase, List<String>> simpleTokens = Phrase::_simpleTokens;
private static List<String> _simpleTokens(Phrase p) {
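// Note: String.split("\\W+") yields a leading empty token when the text
// starts with a non-word character: "(foo) bar".split("\\W+") gives
// ["", "foo", "bar"]. Callers should be prepared for that.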
return Arrays.asList(p.text.split("\\W+"));
}
/**
* Return CoreNLP dependency trees
*/
public static final Function<Phrase, List<SemanticGraph>> graphs = Phrase::_graphs;
private static List<SemanticGraph> _graphs(Phrase p) {
return p.memo(Phrase.sentences)
.stream()
.map(s -> s.get(CollapsedCCProcessedDependenciesAnnotation.class))
.filter(Objects::nonNull)
.collect(toList());
}
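/*
 * Illustration (a sketch; exact output depends on the CoreNLP version
 * and models in use):
 *
 *   for (SemanticGraph g : new Phrase("Dogs chase cats.").getGraphs())
 *       System.out.println(g);  // prints the collapsed typed dependencies
 */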
/**
* Annotation for lemmatized tokens
*/
public static final Function<Phrase, List<String>> lemmas = Phrase::_lemmas;
private static List<String> _lemmas(Phrase p) {
return p.memo(Phrase.sentences)
.stream()
.flatMap(s -> s.get(TokensAnnotation.class).stream())
.map( t -> t.get(LemmaAnnotation.class))
.collect(toList());
}
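/*
 * Illustration (hypothetical output; lemmas depend on the CoreNLP
 * models in use):
 *
 *   new Phrase("The cats were running").memo(Phrase.lemmas)
 *   // -> roughly ["the", "cat", "be", "run"]
 */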
/**
* Get a map for finding the main mention of any Coref
*/
public static final Function<Phrase, Map<Integer, Pair<CorefMention, CorefMention>>> unpronoun = Phrase::_unpronoun;
private static Map<Integer, Pair<CorefMention, CorefMention>> _unpronoun(Phrase p) {
Stream<Pair<CorefMention, CorefMention>> s =
Stream.of(p.memo(Phrase.coreNLP).get(CorefChainAnnotation.class))
.filter(Objects::nonNull) // Do nothing with an empty map
.flatMap(chains -> chains.entrySet().stream()) // Disassemble the map
.flatMap(entry -> {
// Link each entry to its main mention
CorefMention main = entry.getValue().getRepresentativeMention();
return entry.getValue().getMentionsInTextualOrder().stream()
.filter(mention -> mention != main)
.map(mention -> makePair(mention, main));
});
// Type inference chokes here, so assign to a local variable, then return.
return s.collect(HashMap::new,
(m, pair) -> m.put(pair.first.headIndex, pair),
(l, r) -> {});
}
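/*
 * Illustration (hypothetical output; this needs a pipeline that runs
 * the dcoref annotator, which the pipelines above do not, so as
 * configured the CorefChainAnnotation is null and the map is empty):
 *
 *   new Phrase("Alice went home because she was tired.").getUnpronoun()
 *   // would map the head index of "she" to the pair ("she", "Alice")
 */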
/**
* Transitional shortcut for memo(Phrase.tokens)
* @deprecated
*/
public List<String> getTokens() {
return memo(Phrase.tokens);
}
/**
* Transitional shortcut for memo(Phrase.trees)
* @deprecated
*/
public List<Tree> getTrees() {
return memo(Phrase.trees);
}
/**
* Transitional shortcut for memo(Phrase.graphs)
* @deprecated
*/
public List<SemanticGraph> getGraphs() {
return memo(Phrase.graphs);
}
/**
* Transitional shortcut for memo(Phrase.unpronoun)
* @deprecated
*/
public Map<Integer, Pair<CorefMention, CorefMention>> getUnpronoun() {
return memo(Phrase.unpronoun);
}
@Override
public int hashCode() {
// text is never null: the public constructor rejects null and the
// no-args constructor uses ""
return text.hashCode();
}
@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (obj == null || getClass() != obj.getClass())
return false;
return text.equals(((Phrase) obj).text);
}
/* Deserialize JSON into a Phrase.
 * SemanticGraph, Tree and friends have cycles, and we can regenerate them
 * anyway, so we mark them transient and reparse the Phrase later.
 * Kept commented out for reference; it needs Gson's JsonDeserializer
 * imports to compile.
public static class Deserializer implements JsonDeserializer<Phrase> {
@Override
public Phrase deserialize(JsonElement json, Type typeOfT, JsonDeserializationContext context)
throws JsonParseException {
return new Phrase(json.getAsJsonObject().get("text").getAsString());
}
}*/
}