//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.consumers.csv; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.List; import java.util.Map; import org.apache.uima.UimaContext; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.ExternalResource; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import uk.gov.dstl.baleen.resources.SharedStopwordResource; import uk.gov.dstl.baleen.resources.utils.StopwordUtils; import uk.gov.dstl.baleen.types.language.Sentence; import uk.gov.dstl.baleen.types.language.WordToken; import uk.gov.dstl.baleen.types.semantic.Entity; /** * Write coreference information to a CSV. * <p> * The format is as follows: * <ul> * <li>source * <li>id * <li>reference * <li>type * <li>text * <li>value * <li>EntityCount * <li>then EntityCount * Entities (value, type) * <li>nonEntityNonStopWordsCount * <li>nonEntityNonStopWordsCount * nonEntityNonStopWords ( (format word then pos) * <li>NonStopWordsNotCoveredByEntitiesCount * <li>then NonStopWordsNotCoveredByEntitiesCount * NonStopWordsNotCoveredByEntities (format word * then pos) * </ul> * * @baleen.javadoc */ public class Coreference extends AbstractCsvConsumer { /** * The stoplist to use. If the stoplist matches one of the enum's provided in * {@link uk.gov.dstl.baleen.resources.SharedStopwordResource#StopwordList}, then * that list will be loaded. * * Otherwise, the string is taken to be a file path and that file is used. * The format of the file is expected to be one stopword per line. * * @baleen.config DEFAULT */ public static final String PARAM_STOPLIST = "stoplist"; @ConfigurationParameter(name = PARAM_STOPLIST, defaultValue="DEFAULT") protected String stoplist; /** * Connection to Stopwords Resource * * @baleen.resource uk.gov.dstl.baleen.resources.SharedStopwordResource */ public static final String KEY_STOPWORDS = "stopwords"; @ExternalResource(key = KEY_STOPWORDS) protected SharedStopwordResource stopwordResource; protected Collection<String> stopwords; @Override public void doInitialize(UimaContext aContext) throws ResourceInitializationException { super.doInitialize(aContext); try{ stopwords = stopwordResource.getStopwords(SharedStopwordResource.StopwordList.valueOf(stoplist)); }catch(IOException ioe){ getMonitor().error("Unable to load stopwords", ioe); throw new ResourceInitializationException(ioe); } write("source", "id", "reference", "type", "value", "EntityCount then Entities... " + "then nonEntityNonStopWords (format word then pos) " + "then NonStopWordsNotCoveredByEntitiesCount " + "then (format word then pos)..."); } @Override protected void write(JCas jCas) { final String source = getDocumentAnnotation(jCas).getSourceUri(); // For each entity we need to find all the other sentences they are contained in // This should be all entities and sentences final Map<Entity, Collection<Sentence>> coveringSentence = JCasUtil.indexCovering(jCas, Entity.class, Sentence.class); final Map<Sentence, Collection<Entity>> coveredEntities = JCasUtil.indexCovered(jCas, Sentence.class, Entity.class); final Map<Sentence, Collection<WordToken>> coveredTokens = JCasUtil.indexCovered(jCas, Sentence.class, WordToken.class); final Map<WordToken, Collection<Entity>> coveringEntity = JCasUtil.indexCovering(jCas, WordToken.class, Entity.class); JCasUtil.select(jCas, Entity.class).stream() .map(e -> convertEntityToRow(source, coveringSentence, coveredEntities, coveredTokens, coveringEntity, e)) .filter(s -> s.length > 0) .forEach(this::write); } private String[] convertEntityToRow(final String source, final Map<Entity, Collection<Sentence>> coveringSentence, final Map<Sentence, Collection<Entity>> coveredEntities, final Map<Sentence, Collection<WordToken>> coveredTokens, final Map<WordToken, Collection<Entity>> coveringEntity, Entity e) { final List<String> list = new ArrayList<>(); Sentence sentence = null; final Collection<Sentence> sentences = coveringSentence.get(e); if (!sentences.isEmpty()) { sentence = sentences.iterator().next(); } else { getMonitor().error("Entity without sentence {}", e.getCoveredText()); return new String[0]; } list.add(source); list.add(e.getExternalId()); if (e.getReferent() != null) { list.add(Long.toString(e.getReferent().getInternalId())); } else { list.add(""); } list.add(e.getType().getShortName()); list.add(normalize(e.getValue())); final Collection<Entity> entities = coveredEntities.get(sentence); // Entities final int entityCountIndex = list.size(); int entityCount = 0; list.add("0"); for (final Entity x : entities) { if (x.getInternalId() != e.getInternalId()) { list.add(normalize(x.getValue())); list.add(x.getType().getShortName()); entityCount++; } } list.set(entityCountIndex, Integer.toString(entityCount)); // Add (non-stop) words - separate out the entities from the other words final List<WordToken> entityNonStopWords = new ArrayList<>(); final List<WordToken> nonEntityNonStopWords = new ArrayList<>(); for (final WordToken t : coveredTokens.get(sentence)) { // Filter out entities final String word = t.getCoveredText(); if (StopwordUtils.isStopWord(word, stopwords, false)) { final Collection<Entity> collection = coveringEntity.get(t); if (collection == null || collection.isEmpty()) { nonEntityNonStopWords.add(t); } else if (!collection.stream().anyMatch(x -> e.getInternalId() == x.getInternalId())) { // Output any entity other than the one we are processing entityNonStopWords.add(t); } } } // Output list.add(Integer.toString(entityNonStopWords.size())); entityNonStopWords.forEach(t -> { list.add(normalize(t.getCoveredText())); list.add(t.getPartOfSpeech()); }); list.add(Integer.toString(nonEntityNonStopWords.size())); nonEntityNonStopWords.forEach(t -> { list.add(normalize(t.getCoveredText())); list.add(t.getPartOfSpeech()); }); return list.toArray(new String[list.size()]); } }