// ParseTree.java example
//
// Explorer
// baleen-master
//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.uima.grammar;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.function.Consumer;
import java.util.function.Predicate;
import java.util.stream.Stream;

import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import uk.gov.dstl.baleen.types.language.PhraseChunk;
import uk.gov.dstl.baleen.types.language.WordToken;

/**
 * A tree formed of a hierarchy of {@link PhraseChunk}s.
 * <p>
 * Each phrase chunk becomes a {@link ParseTreeNode} whose parent is the smallest phrase chunk
 * covering it. Each {@link WordToken} is attached to a node whose chunk covers the word while none
 * of that node's child chunks do.
 *
 * @baleen.javadoc
 */
public class ParseTree {

	private static final Logger LOGGER = LoggerFactory.getLogger(ParseTree.class);

	/** Orders nodes by the begin offset of their chunk, i.e. by position in the text. */
	private static final Comparator<ParseTreeNode> SENTENCE_ORDER = Comparator
			.comparingInt((ParseTreeNode n) -> n.getChunk().getBegin());

	/** Orders annotations by the length of their span (end - begin), shortest first. */
	private static final Comparator<AnnotationFS> SHORTEST_FIRST = Comparator
			.comparingInt((AnnotationFS a) -> a.getEnd() - a.getBegin());

	private final ParseTreeNode root;
	private final Map<PhraseChunk, ParseTreeNode> chunkToNode;
	private final Map<WordToken, ParseTreeNode> wordToNode;

	/**
	 * Instantiates a new parse tree. Use {@link #build(JCas)} to create instances.
	 *
	 * @param roots
	 *            the top level nodes; the tree's root is a new node created from this list
	 * @param chunkToNode
	 *            lookup from phrase chunk to the node representing it
	 * @param wordToNode
	 *            lookup from word token to the node the word is attached to
	 */
	private ParseTree(List<ParseTreeNode> roots, Map<PhraseChunk, ParseTreeNode> chunkToNode,
			Map<WordToken, ParseTreeNode> wordToNode) {
		this.root = new ParseTreeNode(roots);
		this.chunkToNode = chunkToNode;
		this.wordToNode = wordToNode;
	}

	/**
	 * Gets the words below a chunk.
	 * <p>
	 * If the chunk's node has children, the result is the words of those children whose chunk type
	 * is accepted by the filter; the words attached directly to the chunk's own node are not
	 * included in that case. If the node has no children, the node's own words are returned and the
	 * filter is not consulted.
	 *
	 * @param chunk
	 *            the chunk to look up
	 * @param chunkFilter
	 *            predicate applied to the chunk type of each child
	 * @return the child words, or an empty stream if the chunk is not part of this tree
	 */
	public Stream<WordToken> getChildWords(PhraseChunk chunk, Predicate<String> chunkFilter) {
		final ParseTreeNode node = chunkToNode.get(chunk);
		if (node == null) {
			// Unknown (or null) chunk: nothing to report rather than a NullPointerException
			return Stream.empty();
		}

		if (!node.hasChildren()) {
			return node.getWords().stream();
		}

		return node.getChildren().stream()
				.filter(c -> chunkFilter.test(c.getChunk().getChunkType()))
				.flatMap(c -> c.getWords().stream());
	}

	/**
	 * Traverse the tree. The consumer is first given a singleton list holding the root node, and is
	 * then passed on to the root's own traversal.
	 *
	 * @param consumer
	 *            the consumer
	 */
	public void traverseChildren(Consumer<List<ParseTreeNode>> consumer) {
		consumer.accept(Collections.singletonList(root));
		root.traverseChildren(consumer);
	}

	/**
	 * Builds the tree from the {@link PhraseChunk} and {@link WordToken} annotations in the JCas.
	 *
	 * @param jCas
	 *            the JCas to read annotations from
	 * @return the parse tree
	 */
	public static ParseTree build(JCas jCas) {
		final Map<PhraseChunk, ParseTreeNode> chunkToNode = new HashMap<>();

		final List<ParseTreeNode> roots = linkChunks(jCas, chunkToNode);
		final Map<WordToken, ParseTreeNode> wordToNode = attachWords(jCas, chunkToNode);

		roots.sort(SENTENCE_ORDER);

		return new ParseTree(roots, chunkToNode, wordToNode);
	}

	/**
	 * Creates a node for every phrase chunk and links each one to the node of its smallest covering
	 * chunk.
	 *
	 * @param jCas
	 *            the JCas to read phrase chunks from
	 * @param chunkToNode
	 *            map which is populated with the node created for each chunk
	 * @return the nodes whose chunk is not covered by any other chunk, in the order encountered
	 */
	private static List<ParseTreeNode> linkChunks(JCas jCas, Map<PhraseChunk, ParseTreeNode> chunkToNode) {
		final Map<PhraseChunk, Collection<PhraseChunk>> coveringIndex = JCasUtil.indexCovering(jCas,
				PhraseChunk.class, PhraseChunk.class);

		final List<ParseTreeNode> roots = new LinkedList<>();

		for (final PhraseChunk chunk : JCasUtil.select(jCas, PhraseChunk.class)) {
			final ParseTreeNode treeNode = nodeFor(chunkToNode, chunk);

			final Collection<PhraseChunk> covering = coveringIndex.get(chunk);
			if (covering == null || covering.isEmpty()) {
				// Nothing covers this chunk, so it is a root
				roots.add(treeNode);
			} else {
				// NOTE(review): two chunks with identical spans presumably cover each other in
				// the index, which would make them each other's parent - confirm against uimaFIT.
				final ParseTreeNode parentNode = nodeFor(chunkToNode, findSmallest(covering));
				treeNode.setParent(parentNode);
				parentNode.addChild(treeNode);
			}
		}

		return roots;
	}

	/**
	 * Sorts the children of every node into sentence order and attaches to each node the words
	 * covered by its chunk but not by any of its children's chunks.
	 *
	 * @param jCas
	 *            the JCas to read word tokens from
	 * @param chunkToNode
	 *            the nodes to process
	 * @return lookup from each attached word to the node it was attached to
	 */
	private static Map<WordToken, ParseTreeNode> attachWords(JCas jCas,
			Map<PhraseChunk, ParseTreeNode> chunkToNode) {
		final Map<PhraseChunk, Collection<WordToken>> wordIndex = JCasUtil.indexCovered(jCas, PhraseChunk.class,
				WordToken.class);

		final Map<WordToken, ParseTreeNode> wordToNode = new HashMap<>();

		for (final ParseTreeNode node : chunkToNode.values()) {
			node.getChildren().sort(SENTENCE_ORDER);

			final Collection<WordToken> covered = wordIndex.get(node.getChunk());
			if (covered == null) {
				continue;
			}

			// Start from every word within the chunk, then drop those belonging to children
			final List<WordToken> ownWords = new ArrayList<>(covered);
			if (node.hasChildren()) {
				node.getChildren().stream()
						.map(child -> wordIndex.get(child.getChunk()))
						.filter(Objects::nonNull)
						.forEach(ownWords::removeAll);
			}

			node.addWords(ownWords);
			ownWords.forEach(w -> wordToNode.put(w, node));
		}

		return wordToNode;
	}

	/**
	 * Gets the node for a chunk, creating and registering it if it does not exist yet.
	 *
	 * @param chunkToNode
	 *            the registry of nodes created so far
	 * @param chunk
	 *            the chunk
	 * @return the node representing the chunk
	 */
	private static ParseTreeNode nodeFor(Map<PhraseChunk, ParseTreeNode> chunkToNode, PhraseChunk chunk) {
		return chunkToNode.computeIfAbsent(chunk, c -> new ParseTreeNode(c));
	}

	/**
	 * Find smallest (by span length) covering chunk. If several chunks share the smallest length,
	 * the first one encountered is returned.
	 *
	 * @param covering
	 *            the covering chunks, must not be empty
	 * @return the phrase chunk
	 * @throws IllegalArgumentException
	 *             if covering is empty
	 */
	private static PhraseChunk findSmallest(Collection<PhraseChunk> covering) {
		return covering.stream()
				.min(SHORTEST_FIRST)
				.orElseThrow(() -> new IllegalArgumentException("covering must not be empty"));
	}

	/**
	 * Gets the node to which the token was attached when the tree was built.
	 *
	 * @param token
	 *            the token
	 * @return the parent node, or null if the token is not attached to any node
	 */
	public ParseTreeNode getParent(WordToken token) {
		return wordToNode.get(token);
	}

	/**
	 * Output a basic representation of the tree, followed by one line per word showing the node it
	 * is attached to.
	 */
	public void log() {
		root.log("");

		wordToNode.forEach((w, n) -> LOGGER.info("{} : {} ", w.getCoveredText(), n));
	}

}