package edu.uncc.cs.watsonsim;

import static edu.stanford.nlp.util.Pair.makePair;
import static java.util.stream.Collectors.toList;

import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Properties;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;
import java.util.stream.Stream;

import org.apache.commons.lang3.StringEscapeUtils;

import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;

import edu.stanford.nlp.dcoref.CorefChain.CorefMention;
import edu.stanford.nlp.dcoref.CorefCoreAnnotations.CorefChainAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Pair;

/**
 * A String, tokenized, parsed into constituency Trees, and into semantic
 * (dependency) graphs.
 */
public class Phrase {
    public final String text;
    private static final Cache<String, Phrase> recent;
    // Cached fields, keyed by the annotator function
    private transient ConcurrentHashMap<Function<? extends Phrase, ?>, Object> memos;
    public transient Log log = Log.NIL;

    // Create the pipelines
    private static final StanfordCoreNLP pipeline;
    private static final StanfordCoreNLP constituency_parse_pipeline;
    static {
        pipeline = makeCoreNLPPipeline("tokenize, cleanxml, ssplit, pos, lemma, parse");
        constituency_parse_pipeline = makeCoreNLPPipeline("tokenize, cleanxml, ssplit, pos, lemma, parse");
        // Save time by caching some, but not too many, recent parses.
        recent = CacheBuilder.newBuilder()
                .concurrencyLevel(50)
                .maximumSize(10000)
                .weakValues()
                .build();
    }

    /**
     * We still need to use pipelines from other systems, so we make them
     * statically and use them elsewhere.
     * @param annotators
     * @return
     */
    private static StanfordCoreNLP makeCoreNLPPipeline(String annotators) {
        // Creates an NLP pipeline without ner and dcoref
        Properties props = new Properties();
        props.put("annotators", annotators);
        // Use the faster-parsing but slower-loading shift-reduce models
        props.put("parse.model", "edu/stanford/nlp/models/srparser/englishSR.ser.gz");
        // When you find something untokenizable, delete it and don't whine
        props.put("tokenize.options", "untokenizable=noneDelete");
        return new StanfordCoreNLP(props);
    }

    /**
     * This no-args constructor exists solely for deserialization.
     */
    private Phrase() {
        text = "";
        memos = new ConcurrentHashMap<>();
        log = Log.NIL;
    }

    /**
     * Create a new NLP-parsed phrase.
     * This will throw an NPE rather than take null text.
     * The memo system is lazy and phrases are cached, so this is quite cheap.
     */
    public Phrase(String text) {
        if (text == null)
            throw new NullPointerException("Text cannot be null.");
        Phrase cache_entry = recent.getIfPresent(text);
        if (cache_entry != null) {
            this.text = cache_entry.text;
            // Memos are mutable but private and thread-safe.
            this.memos = cache_entry.memos;
        } else {
            this.memos = new ConcurrentHashMap<>();
            this.text = StringEscapeUtils.unescapeXml(text);
            recent.put(text, this);
        }
    }
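    /*
     * A minimal usage sketch (illustrative only, not part of the original
     * API; the method name and example text are made up): because `recent`
     * is keyed by the raw text, constructing the same Phrase twice shares
     * one memo table, so annotations are computed at most once per distinct
     * string.
     */
    @SuppressWarnings("unused")
    private static void cacheUsageSketch() {
        Phrase a = new Phrase("Where is Charlotte?"); // parsed lazily, cached
        Phrase b = new Phrase("Where is Charlotte?"); // shares a's memo table
        assert a.equals(b);                   // equality and hashCode use only text
        List<String> first = a.memo(Phrase.tokens);  // computes and memoizes tokens
        List<String> again = b.memo(Phrase.tokens);  // answered from the shared memos
    }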
    /**
     * Lightweight functional annotations.
     * Either apply the function and return the result, or, if it has already
     * been applied, return the existing value.
     *
     * Here's the cute part: you can annotate recursively to make pipelines.
     *
     * There are caveats: You need to be sure your function's input type
     * matches the type you are annotating, or you will get runtime errors.
     * The output types, however, are compile-time type checked. This is
     * fixable, but it makes the API uglier, so we don't enforce it.
     * Also, if your annotator returns null, the result will not be cached.
     * So if your annotator is expensive, return some singleton instead.
     */
    @SuppressWarnings("unchecked")
    public <X, T extends Phrase> X memo(Function<T, X> app) {
        /*
         * Atomicity is not necessary here because the functions are
         * idempotent. Enforcing atomicity can cause a deadlock, because
         * memo() needs to be reentrant. Instead, just allow duplicate put()'s.
         */
        X output = (X) memos.get(app);
        if (output == null)
            output = app.apply((T) this);
        if (output != null)
            memos.put(app, output);
        return output;
    }

    /*
     * Convenience functions for common annotations
     */
    private static final Function<Phrase, Annotation> coreNLP = Phrase::_coreNLP;
    private static Annotation _coreNLP(Phrase p) {
        // Create an empty Annotation with just the given text
        Annotation document = new Annotation(p.text);
        try {
            // Run all Annotators on this text
            pipeline.annotate(document);
        } catch (IllegalArgumentException | NullPointerException ex) {
            /*
             * On extremely rare occasions (< 0.00000593% of passages)
             * it will throw an error like the following:
             *
             * Exception in thread "main" java.lang.IllegalArgumentException:
             * No head rule defined for SYM using class edu.stanford.nlp.trees.SemanticHeadFinder in SYM-10
             *
             * On more frequent occasions, you get the following:
             * Exception in thread "main" java.lang.NullPointerException
             * at edu.stanford.nlp.dcoref.RuleBasedCorefMentionFinder.findHead(RuleBasedCorefMentionFinder.java:276)
             *
             * Both of these are fatal for the passage.
             * Neither is a big deal for the index. Forget them.
             */
        }
        return document;
    }

    /**
     * Return CoreNLP sentences.
     * Never returns null, only empty collections.
     */
    private static final Function<Phrase, List<CoreMap>> sentences = Phrase::_sentences;
    private static List<CoreMap> _sentences(Phrase p) {
        return Optional.ofNullable(
                p.memo(Phrase.coreNLP)
                 .get(SentencesAnnotation.class))
            .orElse(Collections.emptyList());
    }
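    /*
     * A sketch of how a caller would extend the annotation pipeline (the
     * `sentenceCount` annotator below is hypothetical, not part of the
     * original class). Annotators must be held in static fields because
     * memo() keys its cache on the Function's identity: a fresh lambda on
     * every call would never hit the cache. Annotators may call memo()
     * recursively, which is what makes them compose into pipelines.
     */
    public static final Function<Phrase, Integer> sentenceCount =
            p -> p.memo(Phrase.sentences).size();
    // Callers would then write: int n = somePhrase.memo(Phrase.sentenceCount);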
    /**
     * Return CoreNLP constituency trees
     */
    public static final Function<Phrase, List<Tree>> trees = Phrase::_trees;
    private static List<Tree> _trees(Phrase p) {
        // Fetch the (possibly cached) base annotation for this text
        Annotation document = p.memo(Phrase.coreNLP);
        try {
            // Run the full constituency parse on this text
            constituency_parse_pipeline.annotate(document);
        } catch (IllegalArgumentException | NullPointerException ex) {
            /*
             * Same rare parser failures as in _coreNLP() above:
             * fatal for the passage, but not a big deal for the index.
             * Forget them.
             */
        }
        return p.memo(Phrase.sentences)
            .stream()
            .map(s -> s.get(TreeAnnotation.class))
            .filter(Objects::nonNull)
            .collect(toList());
    }

    /**
     * Return Lucene tokens
     */
    public static final Function<Phrase, List<String>> tokens = Phrase::_tokens;
    private static List<String> _tokens(Phrase p) {
        return StringUtils.tokenize(p.text);
    }

    /**
     * Return very lightly processed tokens.
     * TODO: Imitate the token processing in Glove
     */
    public static final Function<Phrase, List<String>> simpleTokens = Phrase::_simpleTokens;
    private static List<String> _simpleTokens(Phrase p) {
        return Arrays.asList(p.text.split("\\W+"));
    }

    /**
     * Return CoreNLP dependency trees
     */
    public static final Function<Phrase, List<SemanticGraph>> graphs = Phrase::_graphs;
    private static List<SemanticGraph> _graphs(Phrase p) {
        return p.memo(Phrase.sentences)
            .stream()
            .map(s -> s.get(CollapsedCCProcessedDependenciesAnnotation.class))
            .filter(Objects::nonNull)
            .collect(toList());
    }

    /**
     * Annotation for lemmatized tokens
     */
    public static final Function<Phrase, List<String>> lemmas = Phrase::_lemmas;
    private static List<String> _lemmas(Phrase p) {
        return p.memo(Phrase.sentences)
            .stream()
            .flatMap(s -> s.get(TokensAnnotation.class).stream())
            .map(t -> t.get(LemmaAnnotation.class))
            .collect(toList());
    }

    /**
     * Get a map for finding the main mention of any coref
     */
    public static final Function<Phrase, Map<Integer, Pair<CorefMention, CorefMention>>> unpronoun = Phrase::_unpronoun;
    private static Map<Integer, Pair<CorefMention, CorefMention>> _unpronoun(Phrase p) {
        Stream<Pair<CorefMention, CorefMention>> s =
            Stream.of(p.memo(Phrase.coreNLP).get(CorefChainAnnotation.class))
            .filter(Objects::nonNull) // Do nothing with an empty map
            .flatMap(chains -> chains.entrySet().stream()) // Disassemble the map
            .flatMap(entry -> {
                // Link each entry to its main mention
                CorefMention main = entry.getValue().getRepresentativeMention();
                return entry.getValue().getMentionsInTextualOrder().stream()
                    .filter(mention -> mention != main)
                    .map(mention -> makePair(mention, main));
            });
        // Type inference chokes here, so write it down, then return.
        return s.collect(HashMap::new,
                (m, pair) -> m.put(pair.first.headIndex, pair),
                HashMap::putAll); // merge partial maps if the stream is ever parallel
    }

    /**
     * Transitional shortcut for memo(Phrase.tokens)
     * @deprecated
     */
    public List<String> getTokens() {
        return memo(Phrase.tokens);
    }

    /**
     * Transitional shortcut for memo(Phrase.trees)
     * @deprecated
     */
    public List<Tree> getTrees() {
        return memo(Phrase.trees);
    }

    /**
     * Transitional shortcut for memo(Phrase.graphs)
     * @deprecated
     */
    public List<SemanticGraph> getGraphs() {
        return memo(Phrase.graphs);
    }

    /**
     * Transitional shortcut for memo(Phrase.unpronoun)
     * @deprecated
     */
    public Map<Integer, Pair<CorefMention, CorefMention>> getUnpronoun() {
        return memo(Phrase.unpronoun);
    }

    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + ((text == null) ? 0 : text.hashCode());
        return result;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj)
            return true;
        if (obj == null)
            return false;
        if (getClass() != obj.getClass())
            return false;
        Phrase other = (Phrase) obj;
        if (text == null) {
            if (other.text != null)
                return false;
        } else if (!text.equals(other.text))
            return false;
        return true;
    }
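    /*
     * A hedged sketch of consuming unpronoun (the method below is
     * illustrative, not part of the original class). The map is keyed by
     * each mention's head token index, and each value pairs that mention
     * with its chain's representative mention, so a caller can swap a
     * pronoun for the span it stands for. Note that the pipelines above do
     * not run dcoref, so with this configuration the map is simply empty.
     */
    @SuppressWarnings("unused")
    private static String representativeSpanSketch(Phrase p, int headIndex) {
        Pair<CorefMention, CorefMention> link = p.getUnpronoun().get(headIndex);
        // link.first is the pronoun-like mention; link.second is the main one
        return (link == null) ? null : link.second.mentionSpan;
    }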
    /*
     * Deserialize JSON into a Phrase.
     * SemanticGraph, Tree and friends have cycles, and we can regenerate them
     * anyway, so just mark them transient and reparse the Phrase later.
     *
    public static class Deserializer implements JsonDeserializer<Phrase> {
        @Override
        public Phrase deserialize(JsonElement json, Type typeOfT,
                JsonDeserializationContext context) throws JsonParseException {
            return new Phrase(json.getAsJsonObject().get("text").getAsString());
        }
    }*/
}
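/*
 * Usage note (a sketch, under the assumption that the Gson-based Deserializer
 * above is re-enabled): it would need the Gson imports (com.google.gson.*)
 * plus java.lang.reflect.Type, and would be registered roughly like this:
 *
 *   Gson gson = new GsonBuilder()
 *       .registerTypeAdapter(Phrase.class, new Phrase.Deserializer())
 *       .create();
 *   Phrase p = gson.fromJson("{\"text\": \"What is a phrase?\"}", Phrase.class);
 */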