//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.uima.grammar; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Objects; import java.util.function.Consumer; import java.util.function.Predicate; import java.util.stream.Stream; import org.apache.uima.cas.text.AnnotationFS; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import uk.gov.dstl.baleen.types.language.PhraseChunk; import uk.gov.dstl.baleen.types.language.WordToken; /** * A tree formed of a hierarchy of ParseChunks. * * @baleen.javadoc */ public class ParseTree { private static final Logger LOGGER = LoggerFactory.getLogger(ParseTree.class); private static final Comparator<? super ParseTreeNode> SENTENCE_ORDER = (a, b) -> Integer .compare(a.getChunk().getBegin(), b.getChunk().getBegin()); private static final Comparator<? super AnnotationFS> SHORTEST_FIRST = (a, b) -> Integer .compare(a.getEnd() - a.getBegin(), b.getEnd() - b.getBegin()); private final ParseTreeNode root; private final Map<PhraseChunk, ParseTreeNode> chunkToNode; private final Map<WordToken, ParseTreeNode> wordToNode; /** * Instantiates a new parses the tree. * * @param roots * the roots * @param chunkToNode * the chunk to node * @param wordToNode * the word to node */ private ParseTree(List<ParseTreeNode> roots, Map<PhraseChunk, ParseTreeNode> chunkToNode, Map<WordToken, ParseTreeNode> wordToNode) { this.root = new ParseTreeNode(roots); this.chunkToNode = chunkToNode; this.wordToNode = wordToNode; } /** * Gets the child words. * * @param chunk * the chunk * @param chunkFilter * the chunk filter * @return the child words */ public Stream<WordToken> getChildWords(PhraseChunk chunk, Predicate<String> chunkFilter) { final ParseTreeNode node = chunkToNode.get(chunk); if (node.hasChildren()) { return node.getChildren().stream().filter(c -> chunkFilter.test(c.getChunk().getChunkType())) .flatMap(c -> c.getWords().stream()); } else { return node.getWords().stream(); } } /** * Traverse children. * * @param consumer * the consumer */ public void traverseChildren(Consumer<List<ParseTreeNode>> consumer) { consumer.accept(Collections.singletonList(root)); root.traverseChildren(consumer); } /** * Builds the tree. * * @param jCas * the j cas * @return the parses the tree */ public static ParseTree build(JCas jCas) { // Build a tree phrase to phrase final Map<PhraseChunk, Collection<PhraseChunk>> index = JCasUtil.indexCovering(jCas, PhraseChunk.class, PhraseChunk.class); final Collection<PhraseChunk> phrases = JCasUtil.select(jCas, PhraseChunk.class); final List<ParseTreeNode> roots = new LinkedList<>(); final Map<PhraseChunk, ParseTreeNode> chunkToNode = new HashMap<>(); for (final PhraseChunk chunk : phrases) { ParseTreeNode treeNode = chunkToNode.get(chunk); if (treeNode == null) { treeNode = new ParseTreeNode(chunk); chunkToNode.put(chunk, treeNode); } final Collection<PhraseChunk> covering = index.get(chunk); if (covering == null || covering.isEmpty()) { // Nothing is covering this Jcas, so its a root roots.add(treeNode); } else { // This is covered, so we add the smallest one as out parent final PhraseChunk parent = findSmallest(covering); ParseTreeNode parentNode = chunkToNode.get(parent); if (parentNode == null) { parentNode = new ParseTreeNode(parent); chunkToNode.put(parent, parentNode); } treeNode.setParent(parentNode); parentNode.addChild(treeNode); } } // Add words to the tree final Map<PhraseChunk, Collection<WordToken>> wordIndex = JCasUtil.indexCovered(jCas, PhraseChunk.class, WordToken.class); final Map<WordToken, ParseTreeNode> wordToNode = new HashMap<>(); chunkToNode.values().forEach(n -> { // Sort all tree nodes by sentence order n.getChildren().sort(SENTENCE_ORDER); // Get all the words which are within this chunk, and then remove those which are in children final Collection<WordToken> allWords = wordIndex.get(n.getChunk()); if (allWords != null) { final List<WordToken> words = new ArrayList<>(allWords); // Remove the words which are covered by our children, leaving just our words if (n.hasChildren()) n.getChildren().stream() .map(t -> wordIndex.get(t.getChunk())) .filter(Objects::nonNull).forEach(words::removeAll); // Add the words into the treenode n.addWords(words); words.stream().forEach(w -> wordToNode.put(w, n)); } }); // Sort roots roots.sort(SENTENCE_ORDER); return new ParseTree(roots, chunkToNode, wordToNode); } /** * Find smallest (covered text length) covering chunk * * @param covering * the covering * @return the phrase chunk */ private static PhraseChunk findSmallest(Collection<PhraseChunk> covering) { return covering.stream() .sorted(SHORTEST_FIRST) .findFirst() .get(); } /** * Gets the parent of the token * * @param token * the token * @return the parent */ public ParseTreeNode getParent(WordToken token) { return wordToNode.get(token); } /** * Output a basic representation of the tree */ public void log() { root.log(""); wordToNode.forEach((w, n) -> LOGGER.info("{} : {} ", w.getCoveredText(), n.toString())); } }