//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.patterns;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ExternalResource;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.resource.ResourceInitializationException;

import com.google.common.collect.ImmutableSet;

import uk.gov.dstl.baleen.annotators.patterns.data.PatternExtract;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.resources.SharedStopwordResource;
import uk.gov.dstl.baleen.resources.utils.StopwordUtils;
import uk.gov.dstl.baleen.types.language.Pattern;
import uk.gov.dstl.baleen.types.language.Sentence;
import uk.gov.dstl.baleen.types.language.WordToken;
import uk.gov.dstl.baleen.types.semantic.Entity;
import uk.gov.dstl.baleen.uima.BaleenAnnotator;

/**
 * Finds patterns in document text.
 *
 * <p>
 * A pattern is the set of words between two entities. Patterns are typically used to form a
 * training set for relationship extraction.
 *
 * <p>
 * As a result this annotator must be run after Entity and WordToken annotations have been added
 * to the JCas, that is, after part-of-speech tagging (e.g. by OpenNLP) and after entity
 * extraction (and ideally cleanup).
 *
 * <p>
 * The algorithm can be described as follows:
 *
 * <ol>
 * <li>For each sentence we find pairs of entities which are less than "windowSize" away from
 * each other (measured in words). These are our candidate patterns.</li>
 * <li>We filter out any candidate patterns containing negatives (the words no, not, neither or
 * never).</li>
 * <li>We then remove from each pattern any stop words and any words covered by other entities
 * which appear within the pattern text, then discard any patterns that are now empty.</li>
 * <li>We then create Pattern annotations. A Pattern annotation holds the original range of the
 * pattern, plus the list of retained words (in the form of WordTokens).</li>
 * </ol>
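 *
 * <p>
 * For example (an illustrative sketch only, exact keys depend on your Baleen version), this
 * annotator might be declared in a Baleen pipeline YAML configuration as:
 *
 * <pre>
 * annotators:
 *   - class: patterns.PatternExtractor
 *     windowSize: 5
 *     stoplist: DEFAULT
 * </pre>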
 *
 * @baleen.javadoc
 */
public class PatternExtractor extends BaleenAnnotator {

  /**
   * Connection to Stopwords Resource
   *
   * @baleen.resource uk.gov.dstl.baleen.resources.SharedStopwordResource
   */
  public static final String KEY_STOPWORDS = "stopwords";
  @ExternalResource(key = KEY_STOPWORDS)
  protected SharedStopwordResource stopwordResource;

  /**
   * The stoplist to use. This should be the name of one of the lists provided by
   * {@link uk.gov.dstl.baleen.resources.SharedStopwordResource.StopwordList}; the named list is
   * then loaded from the shared resource.
   *
   * @baleen.config DEFAULT
   */
  public static final String PARAM_STOPLIST = "stoplist";
  @ConfigurationParameter(name = PARAM_STOPLIST, defaultValue = "DEFAULT")
  protected String stoplist;

  /**
   * The maximum distance (in words) between two entities in a sentence for the words between
   * them to be considered a candidate pattern.
   *
   * Use a small number to get a minimal set of high quality words.
   *
   * @baleen.config 5
   */
  public static final String PARAM_WINDOW_SIZE = "windowSize";
  @ConfigurationParameter(name = PatternExtractor.PARAM_WINDOW_SIZE, defaultValue = "5")
  private int windowSize;

  protected Collection<String> stopwords;

  // Matches whole-word negations; any candidate pattern containing one of these is discarded
  private final java.util.regex.Pattern negationRegex =
      java.util.regex.Pattern.compile("\\b((no)|(neither)|(not)|(never))\\b");

  @Override
  public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
    super.doInitialize(aContext);
    try {
      stopwords =
          stopwordResource.getStopwords(SharedStopwordResource.StopwordList.valueOf(stoplist));
    } catch (IOException ioe) {
      getMonitor().error("Unable to load stopwords", ioe);
      throw new ResourceInitializationException(ioe);
    }
  }

  @Override
  protected void doProcess(final JCas jCas) throws AnalysisEngineProcessException {
    final Set<WordToken> wordsCoveredByEntities =
        JCasUtil.indexCovered(jCas, Entity.class, WordToken.class).values().stream()
            .flatMap(Collection::stream)
            .collect(Collectors.toSet());

    for (final Sentence sentence : JCasUtil.select(jCas, Sentence.class)) {

      final List<Entity> entities = JCasUtil.selectCovered(jCas, Entity.class, sentence);
      final List<WordToken> words = JCasUtil.selectCovered(jCas, WordToken.class, sentence);

      // We discard any punctuation in our word list since this appears to be unpredictable
      // output from OpenNLP parsing and we just want to count word distance.
      // If we have "hello, world" then we might get the tokens "hello", ",", "world" with
      // varying POS tags. This filter is a little bit of a mess as a result.
      final List<WordToken> wordIndexes = words.stream()
          .filter(w -> Character.isAlphabetic(w.getPartOfSpeech().charAt(0))
              && w.getCoveredText().length() > 1)
          .collect(Collectors.toList());

      // Find entities within windowSize words of one another
      final String text = jCas.getDocumentText();
      final String lowerText = text.toLowerCase();
      final List<PatternExtract> patterns = new ArrayList<>();
      for (int i = 0; i < entities.size(); i++) {
        for (int j = i + 1; j < entities.size(); j++) {
          addPattern(entities.get(i), entities.get(j), patterns);
        }
      }

      // Filter out patterns whose entities are too far apart
      // Filter out patterns which contain no, not, neither or never
      patterns.stream()
          .filter(p -> {
            final int count = countWordsBetween(p, wordIndexes);
            return count >= 0 && count < windowSize;
          })
          .filter(p -> {
            final String covered = p.getCoveredText(lowerText);
            return !negationRegex.matcher(covered).find();
          })
          .forEach(p -> {
            // Remove any other entities from the pattern
            // Remove stop words from the pattern

            // TODO: I question this in the paper. Whilst it is true we don't want stop
            // words I think we want to extract a phrase. Their example is "play a role"
            // which becomes "play,role"

            p.setWordTokens(
                removeAdditionalWords(words, p, wordsCoveredByEntities)
                    .collect(Collectors.toList()));
            if (!p.isEmpty()) {
              outputPattern(jCas, p);
            }
          });
    }
  }
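  /*
   * Illustrative walk-through (an assumed example, not a test case): given the sentence
   * "Alice travelled to Paris last week" with entities "Alice" and "Paris", the candidate
   * pattern covers "travelled to". The entities are 2 words apart (within the default
   * windowSize of 5), no negation is present, and "to" is removed as a stop word, so the
   * emitted Pattern annotation retains the single WordToken "travelled".
   */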
  /**
   * Create and add the pattern to the list, or do nothing if the entities overlap.
   */
  private void addPattern(Entity a, Entity b, List<PatternExtract> patterns) {
    if (a.getEnd() < b.getBegin()) {
      // A is before B
      patterns.add(new PatternExtract(a, b, a.getEnd(), b.getBegin()));
    } else if (a.getBegin() > b.getEnd()) {
      // B is before A
      patterns.add(new PatternExtract(b, a, b.getEnd(), a.getBegin()));
    } else {
      // Overlapping entities... ignore as there are no words between them
    }
  }

  /**
   * Count the words between the two entities of a pattern.
   *
   * @param p
   *          the candidate pattern
   * @param words
   *          the words of the sentence (with punctuation already removed)
   * @return the number of words between the entities, or -1 if it could not be determined
   */
  private int countWordsBetween(PatternExtract p, final List<WordToken> words) {
    final int begin = p.getStart();
    final int end = p.getEnd();

    int startWord = -1;
    int endWord = -1;
    int i = 0;
    for (final WordToken w : words) {
      if (w.getBegin() >= begin && startWord == -1) {
        startWord = i;
      }
      if (w.getBegin() >= end && endWord == -1) {
        endWord = i - 1;
      }
      i++;
    }

    if (startWord == -1 || endWord == -1) {
      return -1;
    }
    return endWord - startWord;
  }

  /**
   * Removes additional words from the pattern.
   *
   * Filters out stop words, single characters, words outside the pattern range and words
   * covered by other entities.
   *
   * @param words
   *          the words of the sentence
   * @param pe
   *          the pattern being extracted
   * @param entityWords
   *          the words covered by entities
   * @return a stream of the retained word tokens
   */
  private Stream<WordToken> removeAdditionalWords(List<WordToken> words,
      final PatternExtract pe, final Set<WordToken> entityWords) {
    return words.stream()
        .filter(t -> t.getBegin() >= pe.getStart() && t.getEnd() <= pe.getEnd())
        .filter(t -> !entityWords.contains(t))
        .filter(t -> {
          final String s = t.getCoveredText();
          return s.length() > 1 && !StopwordUtils.isStopWord(s, stopwords, false);
        });
  }

  /**
   * Output the pattern (save it to the JCas).
   *
   * @param jCas
   *          the JCas to add the Pattern annotation to
   * @param pattern
   *          the extracted pattern
   */
  private void outputPattern(final JCas jCas, final PatternExtract pattern) {
    final Pattern a = new Pattern(jCas);
    a.setBegin(pattern.getStart());
    a.setEnd(pattern.getEnd());
    a.setSource(pattern.getFrom());
    a.setTarget(pattern.getTo());

    final List<WordToken> tokens = pattern.getWordTokens();
    final FSArray array = new FSArray(jCas, tokens.size());
    int i = 0;
    for (final WordToken w : tokens) {
      array.set(i, w);
      i++;
    }
    a.setWords(array);
    addToJCasIndex(a);
  }

  @Override
  public AnalysisEngineAction getAction() {
    return new AnalysisEngineAction(
        ImmutableSet.of(Sentence.class, WordToken.class, Entity.class),
        ImmutableSet.of(Pattern.class));
  }
}
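/*
 * Illustrative sketch (an assumption, not part of Baleen): wiring this annotator directly with
 * uimaFIT 2.x for ad hoc testing. Baleen pipelines would normally construct it from YAML
 * configuration instead, and the factory method names below may differ in other uimaFIT
 * versions.
 *
 * ExternalResourceDescription stopwords =
 *     ExternalResourceFactory.createExternalResourceDescription(SharedStopwordResource.class);
 * AnalysisEngineDescription patternExtractor =
 *     AnalysisEngineFactory.createEngineDescription(
 *         PatternExtractor.class,
 *         PatternExtractor.KEY_STOPWORDS, stopwords,
 *         PatternExtractor.PARAM_WINDOW_SIZE, 3);
 * SimplePipeline.runPipeline(jCas, patternExtractor);
 */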