//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.gazetteer.helpers;

import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.ahocorasick.trie.Emit;
import org.ahocorasick.trie.Trie;
import org.ahocorasick.trie.Trie.TrieBuilder;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.resource.ResourceInitializationException;

import opennlp.tools.stemmer.Stemmer;
import opennlp.tools.stemmer.snowball.SnowballStemmer;
import opennlp.tools.stemmer.snowball.SnowballStemmer.ALGORITHM;
import uk.gov.dstl.baleen.exceptions.BaleenException;
import uk.gov.dstl.baleen.resources.gazetteer.IGazetteer;
import uk.gov.dstl.baleen.types.BaleenAnnotation;
import uk.gov.dstl.baleen.uima.data.TextBlock;

/**
 * Abstract class that acts similarly to its superclass, AbstractAhoCorasickAnnotator, but performs
 * stemming of terms prior to performing matching.
 *
 * <p>This means that gazetteer terms don't necessarily have to be exact to match. For example,
 * plurals and different tenses should stem to the same root, and so would all be matched.
 *
 * <p>Note that if multiple words in the gazetteer stem to the same form, then the coreferencing may
 * give incorrect results: only the last gazetteer value seen for a given stemmed form is remembered
 * as its key.
 *
 * @baleen.javadoc
 */
public abstract class AbstractStemmingAhoCorasickAnnotator extends AbstractAhoCorasickAnnotator {

  /**
   * The stemming algorithm to use, as defined in OpenNLP's SnowballStemmer.ALGORITHM enum
   *
   * @baleen.config ENGLISH
   */
  public static final String PARAM_ALGORITHM = "algorithm";

  @ConfigurationParameter(name = PARAM_ALGORITHM, defaultValue = "ENGLISH")
  protected String algorithm;

  /** Stemmer applied to every word of both the gazetteer terms and the processed text. */
  protected Stemmer stemmer;

  /** A "word" for stemming purposes: a run of lower-case ASCII letters and apostrophes. */
  private static final Pattern WORD_PATTERN = Pattern.compile("[a-z']+");

  /** Maps the stemmed form of each gazetteer value back to the (trimmed) original value. */
  private final Map<String, String> stemmedToKey = new HashMap<>();

  @Override
  public abstract IGazetteer configureGazetteer() throws BaleenException;

  /**
   * Creates the stemmer for the configured algorithm and then delegates to the superclass.
   *
   * <p>The stemmer is created first because it must exist before {@link #buildTrie()} stems the
   * gazetteer values.
   *
   * @param aContext the UIMA context passed through to the superclass
   * @throws ResourceInitializationException if the superclass initialisation fails
   */
  @Override
  public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
    stemmer = new SnowballStemmer(resolveAlgorithm(algorithm));

    super.doInitialize(aContext);
  }

  /**
   * Resolves an algorithm name to the matching enum constant, ignoring case and surrounding
   * whitespace. A missing, blank or unknown name falls back to {@code ENGLISH}.
   *
   * <p>{@code Enum.valueOf} is deliberately avoided: it throws on an unknown name rather than
   * returning {@code null}, so it cannot be used with a null check to implement the fallback.
   */
  private ALGORITHM resolveAlgorithm(String name) {
    if (name == null || name.trim().isEmpty()) {
      return ALGORITHM.ENGLISH;
    }

    String requested = name.trim();
    for (ALGORITHM candidate : ALGORITHM.values()) {
      if (candidate.name().equalsIgnoreCase(requested)) {
        return candidate;
      }
    }

    getMonitor().warn("Unknown stemming algorithm '{}', falling back to ENGLISH", name);
    return ALGORITHM.ENGLISH;
  }

  /**
   * Builds the trie from the stemmed form of every gazetteer value, recording which original value
   * each stemmed keyword came from.
   */
  @Override
  protected void buildTrie() {
    // Drop entries from any previous build so stale keys cannot be returned
    stemmedToKey.clear();

    TrieBuilder builder = Trie.builder().onlyWholeWords();

    if (!caseSensitive) {
      builder = builder.ignoreCase();
    }

    for (String s : gazetteer.getValues()) {
      TransformedString stemmed = stem(s.trim());

      builder = builder.addKeyword(stemmed.getTransformedString());
      stemmedToKey.put(stemmed.getTransformedString(), stemmed.getOriginalString());
    }

    trie = builder.build();
  }

  /**
   * Stems the text of the block, searches the stemmed text with the trie, and creates an entity
   * for every match whose offsets can be mapped back onto the original text.
   *
   * <p>Matches that cannot be mapped back (for example because they end inside a stemmed word) are
   * logged as errors and skipped.
   *
   * @param block the block of text to annotate
   * @throws AnalysisEngineProcessException declared by the overridden method
   */
  @Override
  public void doProcessTextBlock(TextBlock block) throws AnalysisEngineProcessException {
    Map<String, List<BaleenAnnotation>> entities = new HashMap<>();

    TransformedString stemmed = stem(block.getCoveredText());

    Collection<Emit> emits = trie.parseText(stemmed.getTransformedString());

    for (Emit emit : emits) {
      try {
        // Translate offsets in the stemmed text back into offsets in the original text. The end
        // lookup uses getEnd() + 1, i.e. the offset just after the last matched character.
        Integer start = stemmed.getMapping().get(emit.getStart());
        Integer end = stemmed.getMapping().get(emit.getEnd() + 1);

        validateSubstring(start, end, stemmed.getOriginalString());

        String match = stemmed.getOriginalString().substring(start, end);
        String key = stemmedToKey.get(emit.getKeyword());

        createEntityAndAliases(block, start, end, match, key, entities);
      } catch (BaleenException be) {
        getMonitor()
            .error(
                "Unable to create entity of type {} for value '{}'",
                entityType.getName(),
                emit.getKeyword(),
                be);
      }
    }

    createReferenceTargets(block, entities.values());
  }

  /**
   * Convert a word, or words, into their stemmed form and return it along with a mapping between
   * the original and transformed strings.
   *
   * <p>The input is lower-cased, then walked left to right. A run matching {@code [a-z']+} that
   * starts with an alphabetic character is replaced by its stem; every other character (including
   * alphabetic characters outside {@code a-z}) is copied unchanged. The mapping holds, for the
   * start of every copied character or stemmed word and for the end of the text, the offset in the
   * transformed string as key and the corresponding offset in the original string as value.
   *
   * @param words the text to stem
   * @return the original text, the stemmed text and the offset mapping between them
   */
  protected TransformedString stem(String words) {
    String content = toLowerCasePreservingLength(words);
    int length = content.length();

    StringBuilder builder = new StringBuilder(length);
    Map<Integer, Integer> indexMap = new HashMap<>();
    Matcher matcher = WORD_PATTERN.matcher(content);

    int index = 0;
    while (index < length) {
      indexMap.put(builder.length(), index);

      // lookingAt() anchors the match at the current position, so a later word can never be
      // consumed by mistake when the current character is alphabetic but not in [a-z']
      if (Character.isAlphabetic(content.charAt(index))
          && matcher.region(index, length).lookingAt()) {
        String match = matcher.group();
        builder.append(stemmer.stem(match));
        index += match.length();
      } else {
        builder.append(content.charAt(index));
        index++;
      }
    }

    indexMap.put(builder.length(), index);

    return new TransformedString(words, builder.toString(), indexMap);
  }

  /**
   * Lower-cases text in a locale-independent way while guaranteeing that the result has exactly as
   * many characters as the input, so offsets in the result are also offsets in the input.
   *
   * <p>Full lower-casing can lengthen a string for a few characters; in that case a
   * character-by-character conversion, which is always one-to-one, is used instead.
   */
  private static String toLowerCasePreservingLength(String text) {
    String lowered = text.toLowerCase(Locale.ROOT);
    if (lowered.length() == text.length()) {
      return lowered;
    }

    char[] chars = text.toCharArray();
    for (int i = 0; i < chars.length; i++) {
      chars[i] = Character.toLowerCase(chars[i]);
    }
    return new String(chars);
  }

  /**
   * Checks that {@code start} and {@code end} describe a valid substring of {@code string}.
   *
   * @param start inclusive start offset, may be null if no mapping was found
   * @param end exclusive end offset, may be null if no mapping was found
   * @param string the string the offsets refer to
   * @throws BaleenException if either offset is null, or the offsets are out of range or reversed
   */
  private void validateSubstring(Integer start, Integer end, String string)
      throws BaleenException {
    if (start == null) {
      throw new BaleenException("Variable start cannot be null");
    }
    if (end == null) {
      throw new BaleenException("Variable end cannot be null");
    }
    if (start < 0) {
      throw new BaleenException("Variable start cannot be less than 0");
    }
    if (end > string.length()) {
      throw new BaleenException("Variable end cannot be greater than the string length");
    }
    if (start > end) {
      throw new BaleenException("Variable start cannot be greater than variable end");
    }
  }
}