//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.annotators.misc.helpers; import java.io.File; import java.io.IOException; import java.util.Arrays; import java.util.Collection; import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; import org.apache.uima.UimaContext; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.ExternalResource; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import uk.gov.dstl.baleen.exceptions.InvalidParameterException; import uk.gov.dstl.baleen.resources.SharedStopwordResource; import uk.gov.dstl.baleen.types.common.Buzzword; import uk.gov.dstl.baleen.types.metadata.Metadata; import uk.gov.dstl.baleen.uima.BaleenTextAwareAnnotator; import uk.gov.dstl.baleen.uima.data.TextBlock; import uk.gov.dstl.baleen.uima.utils.UimaTypesUtils; /** * Abstract class to provide common functionality for Keyword extraction annotators * * @baleen.javadoc */ public abstract class AbstractKeywordsAnnotator extends BaleenTextAwareAnnotator { /** * Should the extracted keywords be annotated as Buzzwords within the document? * * @baleen.config true */ public static final String PARAM_ADD_BUZZWORDS = "addBuzzwords"; @ConfigurationParameter(name = PARAM_ADD_BUZZWORDS, defaultValue="true") protected Boolean addBuzzwords; /** * The maximum number of keywords to extract. * * The number of keywords may be less than this. * * If there are a number of keywords with the same score that would take the total * number of keywords over the limit, then all are included. * * @baleen.config 5 */ public static final String PARAM_MAX_KEYWORDS = "maxKeywords"; @ConfigurationParameter(name = PARAM_MAX_KEYWORDS, defaultValue="5") protected Integer maxKeywords; /** * The stoplist to use. If the stoplist matches one of the enum's provided in * {@link uk.gov.dstl.baleen.resources.SharedStopwordResource#StopwordList}, then * that list will be loaded. * * Otherwise, the string is taken to be a file path and that file is used. * The format of the file is expected to be one stopword per line. * * @baleen.config DEFAULT */ public static final String PARAM_STOPLIST = "stoplist"; @ConfigurationParameter(name = PARAM_STOPLIST, defaultValue="DEFAULT") protected String stoplist; /** * Connection to Stopwords Resource * * @baleen.resource uk.gov.dstl.baleen.resources.SharedStopwordResource */ public static final String KEY_STOPWORDS = "stopwords"; @ExternalResource(key = KEY_STOPWORDS) protected SharedStopwordResource stopwordResource; protected Collection<String> stopwords; @Override public void doInitialize(UimaContext aContext) throws ResourceInitializationException { try{ stopwords = stopwordResource.getStopwords(SharedStopwordResource.StopwordList.valueOf(stoplist)); }catch(IllegalArgumentException iae){ getMonitor().info("Value of {} does not match pre-defined list, assuming value is a file", PARAM_STOPLIST); getMonitor().debug("Unable to parse value of {} as StopwordList enum", PARAM_STOPLIST, iae); File f = new File(stoplist); try{ stopwords = stopwordResource.getStopwords(f); }catch(IOException ioe){ throw new ResourceInitializationException( new InvalidParameterException("Couldn't load stoplist", ioe) ); } }catch(IOException ioe){ getMonitor().warn("Unable to load Stopword list, resorting to default list", ioe); stopwords = stopwordResource.getStopwords(); } } /** * Add the supplied keywords to the CAS as Metadata and, if configured, Buzzwords */ protected void addKeywordsToJCas(JCas jCas, List<String> keywords){ Metadata md = new Metadata(jCas); md.setKey("keywords"); md.setValue(keywords.stream().collect(Collectors.joining(";"))); addToJCasIndex(md); if(addBuzzwords){ addAllKeywords(jCas, keywords); } } /** * Add the supplied keywords to the CAS as Metadata and, if configured, Buzzwords. * A list of additional buzzwords to be annotated can be provided, for example other variants * of the main list of keywords (e.g. machines as well as machine) */ protected void addKeywordsToJCas(JCas jCas, List<String> keywords, List<String> additionalBuzzwords){ Metadata md = new Metadata(jCas); md.setKey("keywords"); md.setValue(keywords.stream().collect(Collectors.joining(";"))); addToJCasIndex(md); if(addBuzzwords){ Set<String> allKeywords = new HashSet<>(keywords); allKeywords.addAll(additionalBuzzwords); // NOTE: This will add buzzwords outside the Text areas addAllKeywords(jCas, allKeywords); } } private void addAllKeywords(JCas jCas, Collection<String> allKeywords) { List<TextBlock> blocks = getTextBlocks(jCas); for (String keyword : allKeywords) { Pattern pattern = Pattern.compile("\\b" + Pattern.quote(keyword) + "\\b", Pattern.CASE_INSENSITIVE); for (TextBlock block : blocks) { Matcher m = pattern.matcher(block.getCoveredText()); while (m.find()) { Buzzword bw = block.newAnnotation(Buzzword.class, m.start(), m.end()); bw.setTags(UimaTypesUtils.toArray(jCas, Arrays.asList("keyword"))); addToJCasIndex(bw); } } } } }