package edu.uncc.cs.watsonsim.index;
import java.io.IOException;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentSkipListMap;
import org.apache.log4j.Logger;
import org.iq80.leveldb.*;
import org.junit.Test;
import static org.fusesource.leveldbjni.JniDBFactory.*;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;
import java.io.*;
import edu.stanford.nlp.dcoref.CorefChain.CorefMention;
import edu.stanford.nlp.dcoref.Dictionaries;
import edu.stanford.nlp.dcoref.Dictionaries.Animacy;
import edu.stanford.nlp.dcoref.Dictionaries.Gender;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.Triple;
import edu.uncc.cs.watsonsim.Database;
import edu.uncc.cs.watsonsim.Passage;
import edu.uncc.cs.watsonsim.Phrase;
import edu.uncc.cs.watsonsim.nlp.Trees;
public class Edges implements Segment {
    // In-memory histogram of edges seen since the last flush().
    // Key format is "source\ttag\tdest" (tabs, because concatNoun() uses spaces).
    private ConcurrentSkipListMap<String, Integer> all_edges = new ConcurrentSkipListMap<>();
    private final Logger log = Logger.getLogger(getClass());
    private final DB ldb;       // LevelDB store used as a fast write-side buffer
    private final Database sqldb; // final, queryable destination (see close())

    /** A (source, relation-tag, target) dependency edge. */
    public static final class Edge extends Triple<String, String, String> {
        public Edge(String a, String b, String c) {
            super(a, b, c);
        }
    }

    /**
     * Open the LevelDB edge buffer and remember the relational database
     * that close() will eventually populate.
     *
     * @param sqldb destination database for the final edge histogram
     * @throws RuntimeException if the LevelDB store cannot be opened
     *         (we cannot do anything useful without it)
     */
    public Edges(Database sqldb) {
        // Setup LevelDB
        Options options = new Options();
        options.createIfMissing(true);
        try {
            ldb = factory.open(new File("data/edges-leveldb-depparse-lemma0"), options);
        } catch (IOException e) {
            // If we can't open the database we're toast.
            log.fatal("Failed to open LevelDB edge store", e);
            throw new RuntimeException(e);
        }
        // Setup the SQL db
        this.sqldb = sqldb;
    }

    /**
     * Write the in-memory edge counts to LevelDB, merging with any counts
     * already on disk, and reset the in-memory map.
     *
     * @throws IOException if the LevelDB batch cannot be written
     */
    public synchronized void flush() throws IOException {
        // Swap in a fresh map first so concurrent accept() calls keep working
        // while we write out the snapshot.
        ConcurrentSkipListMap<String, Integer> rels = all_edges;
        all_edges = new ConcurrentSkipListMap<>();
        try (WriteBatch batch = ldb.createWriteBatch()) {
            log.info("Writing out " + rels.size() + " edges.");
            rels.forEach((key, count) -> {
                byte[] bkey = bytes(key);
                // Counts are stored as decimal strings; add any existing total.
                byte[] existing = ldb.get(bkey);
                int total = count
                        + (existing == null ? 0 : Integer.parseInt(asString(existing)));
                batch.put(bkey, bytes(Integer.toString(total)));
            });
            ldb.write(batch);
        }
    }

    /**
     * Flush remaining edges, then copy the complete LevelDB histogram into
     * the relational {@code semantic_graph} table (replacing its contents).
     *
     * @throws IOException on LevelDB failure, or wrapping any SQLException
     */
    @Override
    public synchronized void close() throws IOException {
        flush();
        /* Now populate the relational database using the leveldb
         * This strange two-step process comes because:
         * 1) Leveldb is about 10x faster for batched writes
         * 2) Sqlite & Postgresql support concurrent readers
         * Otherwise, I would be thrilled to use either all the way.
         */
        log.info("Pushing histograms into the main database.");
        try {
            sqldb.prep("DELETE FROM semantic_graph;").execute();
            sqldb.prep("PRAGMA synchronous=OFF;").execute();
            // Columns: source, tag, dest, count
            PreparedStatement graph = sqldb.prep("INSERT INTO semantic_graph VALUES (?, ?, ?, ?);");
            int queued = 0;
            // try-with-resources: the iterator must be closed even if the
            // export fails partway through.
            try (DBIterator it = ldb.iterator()) {
                it.seekToFirst();
                // Per the Iterator contract, next() throws NoSuchElementException
                // when exhausted rather than returning null, so drive the loop
                // with hasNext() (the old `(entry = i.next()) != null` test
                // could never terminate normally).
                while (it.hasNext()) {
                    Map.Entry<byte[], byte[]> entry = it.next();
                    String[] words = asString(entry.getKey()).split("\t");
                    try {
                        graph.setString(1, words[0]);
                        graph.setString(2, words[1]);
                        graph.setString(3, words[2]);
                        graph.setInt(4, Integer.parseInt(asString(entry.getValue())));
                        graph.addBatch();
                        if (++queued % 1_000_000 == 0) {
                            log.info("Enqueued " + queued + " rows");
                            graph.executeBatch();
                            //sqldb.commit();
                        }
                    } catch (Exception e) {
                        // One malformed key should not abort the whole export.
                        log.warn("Skipping malformed edge row: " + asString(entry.getKey()), e);
                    }
                }
            }
            log.info("SQL batch " + graph.executeBatch().length);
        } catch (SQLException e) {
            // Call it an IO exception to satisfy Closeable's contract
            throw new IOException(e);
        }
    }

    /**
     * Returns some new rules learned about a pronoun given its match
     * context from anaphora resolution.
     *
     * Specifically, we fill in the tags
     *
     * _animate(main mention, ___).
     * _gender(main mention, ___).
     * _number(main mention, ___).
     *
     * Basically, we can tell if it is animate, its gender and its count.
     *
     * @param g the dependency graph containing the mention head
     * @param w the pronoun's word in {@code g}
     * @param t the phrase whose coref chains resolve the pronoun
     * @return a list of semantic notes (empty if the pronoun is unresolved
     *         or every attribute is UNKNOWN)
     */
    public static List<Edge> generatePronounEdges(
            SemanticGraph g, IndexedWord w, Phrase t) {
        List<Edge> edges = new ArrayList<>();
        if (t.getUnpronoun().containsKey(w.index())) {
            // Use what we know about the pronoun
            Pair<CorefMention, CorefMention> mention_edge = t.getUnpronoun().get(w.index());
            String main_noun = Trees.concatNoun(g, g.getNodeByIndex(mention_edge.second.headIndex));
            // Only record attributes dcoref actually determined.
            Animacy is_animate = mention_edge.first.animacy;
            if (is_animate != Animacy.UNKNOWN) {
                edges.add(new Edge(
                        main_noun, "_animate", is_animate.toString()));
            }
            Gender gender = mention_edge.first.gender;
            if (gender != Gender.UNKNOWN) {
                edges.add(new Edge(
                        main_noun, "_gender", gender.toString()));
            }
            Dictionaries.Number number = mention_edge.first.number;
            if (number != Dictionaries.Number.UNKNOWN) {
                edges.add(new Edge(
                        main_noun, "_number", number.toString()));
            }
        }
        return edges;
    }

    /**
     * Get the full text of the main mention of a particular word, if it has a
     * better mention. Otherwise just get its segment of the tree using
     * concatNoun().
     *
     * @param phrase the phrase whose coref chains may resolve the word
     * @param graph  the dependency graph containing the word
     * @param word   the word to resolve
     * @return the representative mention span, or the concatenated noun
     */
    public static String getMainMention(
            Phrase phrase, SemanticGraph graph, IndexedWord word) {
        Pair<CorefMention, CorefMention> linked_refs =
                phrase.getUnpronoun().get(word.index());
        if (linked_refs == null) {
            return Trees.concatNoun(graph, word);
        } else {
            return linked_refs.second.mentionSpan;
        }
    }

    /**
     * Take a passage and find relevant semantic edges in it.
     *
     * 1) We know that handling these words one at a time yields very
     * boring results. We can connect "Donald" with "Duck," which is neat, but
     * we can't tell anything Donald does that other ducks do not because any
     * verb will be attached to "duck" but know nothing about Donald.
     *
     * So to solve this, we connect all the [nn, cd] links, and invert
     * "prep_of" links, prepending them. (This is the Trees.concatNoun method).
     * How many of these are worth joining is up for debate. But it has to be
     * consistent for indexing and later querying. Suppose we found "Donald
     * duck is a cool cartoon character." We'll get a higher level relation
     * like nsubj("Donald duck", "cartoon character") -> 1
     *
     * 2) We find that a lot of the links are to pronouns. So in the links,
     * we replace the pronouns with their "representative mentions", using
     * CoreNLP's dcoref.
     *
     * 3) For good measure, we include a few fake tags to indicate other
     * tidbits we learned about relations. For example, we know gender
     * and animation based on the pronouns used with something.
     *
     * tag      | meaning
     * -------------------
     * _gender  | he / she, if available
     * _animate | (he/she), it
     * _number  | how many there are
     * _isa     | lexical type
     *
     *
     * ## Possible investigation for later
     * We can also join relations. Where we have relations that look like this:
     *
     * tagname(words, words)
     *
     * We may want something that looks more like this:
     *
     * tagname [word tagname]* (words, words)
     *
     * That way we can bridge across common concepts and get to the more
     * interesting links they bridge. It sounds logical to me but I can't come
     * up with any convincing examples where it would actually be useful to
     * know, and I seem to find many transitive connections that are
     * irrelevant.
     *
     * @param phrase the parsed phrase to extract edges from
     * @return the extracted edges (possibly empty, never null)
     */
    public static List<Edge> generateEdges(Phrase phrase) {
        List<Edge> edges = new ArrayList<>();
        phrase.getGraphs().forEach(g -> {
            g.edgeIterable().forEach(e -> {
                // BUGFIX: compare Strings with equals(), not reference
                // identity — the old `!= "nn"` only worked if the short
                // name happened to be interned.
                if (!"nn".equals(e.getRelation().getShortName())) {
                    // "nn" is garbled by the concatNoun() anyway
                    // Dcoref on the source and target
                    //edges.addAll(generatePronounEdges(g, e.getSource(), phrase));
                    // Find the main mention and optionally concat it
                    String source = getMainMention(phrase, g, e.getSource());
                    String target = getMainMention(phrase, g, e.getTarget());
                    edges.add(new Edge(
                            source,
                            Trees.getSpecificPreps(e.getRelation()),
                            target));
                }
            });
        });
        // Also extract the types while we are at it.
        /*SupportCandidateType.extract(phrase).forEach(nt -> {
            edges.add(new Edge(nt.first, "_isa", nt.second));
        });*/
        return edges;
    }

    /**
     * Stores the edges resulting from generateEdges into a database,
     * delimiting the keys by tabs, since spaces are taken by concatNoun().
     *
     * @param t the passage to index
     */
    @Override
    public void accept(Passage t) {
        generateEdges(t).forEach(edge ->
                all_edges.merge(
                        edge.first + "\t"
                                + edge.second + "\t"
                                + edge.third,
                        1,
                        Integer::sum));
        // Try to keep it from absorbing all available memory
        if (all_edges.size() > 1_000_000) {
            try {
                flush();
            } catch (IOException e) {
                // Best-effort early flush; close() will retry and the counts
                // are still in memory, so log rather than propagate.
                log.error("Early flush of edge histogram failed", e);
            }
        }
    }

    // NOTE(review): a JUnit test embedded in a production class is unusual;
    // it belongs in a separate test source tree. Left in place (still a
    // deliberately failing placeholder) to avoid changing the public surface.
    @Test
    public void testGenerateEdges() {
        Phrase p = new Phrase("This is an example.");
        assertEquals(null, Edges.generateEdges(p));
        fail("Not yet implemented");
    }
}