//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.coreference.impl.enhancers;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import uk.gov.dstl.baleen.annotators.coreference.impl.data.Mention;
import uk.gov.dstl.baleen.annotators.coreference.impl.data.MentionType;
import uk.gov.dstl.baleen.types.language.PhraseChunk;
import uk.gov.dstl.baleen.types.language.Sentence;
import uk.gov.dstl.baleen.types.language.WordToken;
import uk.gov.dstl.baleen.types.semantic.Entity;
/**
* Adds the sentence and its index (count from the start) to each mention.
* <p>
* Thanks to UIMA this is a tremendous amount of work for such a simple task.
*/
public class SentenceEnhancer {

  /** Sentence index recorded on a mention for which no sentence (or no index) could be found. */
  private static final int NO_SENTENCE_INDEX = Integer.MIN_VALUE;

  /**
   * Enhance the mentions by adding sentence information.
   * <p>
   * Each mention of type ENTITY, PRONOUN or NP is given the first {@link Sentence} found
   * covering its annotation, together with the zero-based position of that sentence in the list
   * of sentences selected from the JCas. Mentions of any other type, and mentions for which no
   * covering sentence is found, get a {@code null} sentence and an index of
   * {@link Integer#MIN_VALUE}.
   *
   * @param jCas
   *            the JCas holding the sentences and the mention annotations
   * @param mentions
   *            the mentions to update in place
   */
  public void enhance(JCas jCas, List<Mention> mentions) {
    if (mentions.isEmpty()) {
      // Nothing to update, so do not index the document at all
      return;
    }

    // A covering index is built for every annotation of the requested type, so only build the
    // indexes that at least one mention will actually be looked up in.
    final Set<MentionType> typesPresent = mentions.stream()
        .map(Mention::getType)
        .collect(Collectors.toSet());

    Map<WordToken, Collection<Sentence>> wordToSentence = Collections.emptyMap();
    if (typesPresent.contains(MentionType.PRONOUN)) {
      wordToSentence = JCasUtil.indexCovering(jCas, WordToken.class, Sentence.class);
    }

    Map<Entity, Collection<Sentence>> entityToSentence = Collections.emptyMap();
    if (typesPresent.contains(MentionType.ENTITY)) {
      entityToSentence = JCasUtil.indexCovering(jCas, Entity.class, Sentence.class);
    }

    Map<PhraseChunk, Collection<Sentence>> npToSentence = Collections.emptyMap();
    if (typesPresent.contains(MentionType.NP)) {
      npToSentence = JCasUtil.indexCovering(jCas, PhraseChunk.class, Sentence.class);
    }

    // Position of each sentence within the list of selected sentences
    final List<Sentence> sentences = new ArrayList<>(JCasUtil.select(jCas, Sentence.class));
    final Map<Sentence, Integer> sentenceIndex = IntStream.range(0, sentences.size())
        .boxed()
        .collect(Collectors.toMap(sentences::get, i -> i));

    // Look each mention up directly in the index that matches its type. An annotation that is
    // absent from the index simply yields no sentence (handled in setSentence).
    for (final Mention mention : mentions) {
      final Collection<Sentence> covering;
      switch (mention.getType()) {
      case ENTITY:
        covering = entityToSentence.get(mention.getAnnotation());
        break;
      case PRONOUN:
        covering = wordToSentence.get(mention.getAnnotation());
        break;
      case NP:
        covering = npToSentence.get(mention.getAnnotation());
        break;
      default:
        covering = Collections.emptyList();
        break;
      }
      setSentence(mention, covering, sentenceIndex);
    }
  }

  /**
   * Record on the mention the first sentence of the collection and that sentence's index.
   *
   * @param m
   *            the mention to update
   * @param sentenceCollection
   *            the candidate sentences for the mention; may be {@code null} or empty, in which
   *            case the mention is marked as having no sentence
   * @param sentenceIndex
   *            lookup from sentence to its index; a sentence missing from this map is recorded
   *            with {@link Integer#MIN_VALUE}
   */
  private void setSentence(Mention m, Collection<Sentence> sentenceCollection,
      Map<Sentence, Integer> sentenceIndex) {
    Sentence sentence = null;
    if (sentenceCollection != null && !sentenceCollection.isEmpty()) {
      // Only the first sentence returned by the collection's iterator is used
      sentence = sentenceCollection.iterator().next();
    }

    if (sentence == null) {
      m.setSentence(null);
      m.setSentenceIndex(NO_SENTENCE_INDEX);
    } else {
      m.setSentence(sentence);
      m.setSentenceIndex(sentenceIndex.getOrDefault(sentence, NO_SENTENCE_INDEX));
    }
  }
}