//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.misc;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import com.google.common.base.Strings;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multiset;
import com.google.common.collect.TreeMultimap;
import opennlp.tools.stemmer.Stemmer;
import opennlp.tools.stemmer.snowball.SnowballStemmer;
import opennlp.tools.stemmer.snowball.SnowballStemmer.ALGORITHM;
import uk.gov.dstl.baleen.annotators.misc.helpers.AbstractKeywordsAnnotator;
import uk.gov.dstl.baleen.annotators.misc.helpers.NoOpStemmer;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.resources.utils.StopwordUtils;
import uk.gov.dstl.baleen.types.common.Buzzword;
import uk.gov.dstl.baleen.types.metadata.Metadata;
/**
* Uses the RAKE (Rapid Automatic Keyword Extraction) algorithm to automatically
* identify keywords in each document.
*
* These keywords will be added as metadata to the document, and optionally can
* also be added as Buzzwords.
*
* Based on the paper 'Automatic keyword extraction from individual documents' by
* Stuart Rose, Dave Engel, Nick Cramer and Wendy Cowley.
*
* Optionally, you can choose to stem words prior to scoring, which will address
* any variability in words caused by plurals, tense, etc.
* This is an extension from the original paper. Essentially, the annotator maintains
* a mapping between a stemmed version and the original version of the key phrase,
* using the stemmed version for scoring and calculations, and then returning the
* original version when required for output.
*
* @baleen.javadoc
*/
public class RakeKeywords extends AbstractKeywordsAnnotator {
    /**
     * The stemming algorithm to use, as defined in OpenNLP's SnowballStemmer.ALGORITHM enum, e.g. ENGLISH.
     * The name is matched case-insensitively and surrounding whitespace is ignored.
     * If not set, or set to an undefined value, then no stemming will be used
     *
     * @baleen.config
     */
    public static final String PARAM_STEMMING = "stemming";
    @ConfigurationParameter(name = PARAM_STEMMING, defaultValue = "")
    protected String stemming;

    /**
     * Characters that end a "sentence" for the purposes of RAKE: common punctuation, new lines, tabs,
     * quotes, brackets, slashes, the right single quotation mark (U+2019) and the en dash (U+2013).
     * Compiled once rather than on every call to {@link #splitSentences(String)}.
     */
    private static final Pattern SENTENCE_DELIMITERS = Pattern.compile("[-.!?,;:\\n\\t\\\"\\'\\(\\)\u2019\u2013\\\\\\/]");

    /** Separator between the individual words of a candidate phrase. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** Pattern matching stop words; candidates are the text found between its matches. */
    private Pattern stopwordPattern;

    /** Stemmer applied to each word of a candidate; a no-op stemmer when stemming is disabled. */
    private Stemmer stemmer;

    /**
     * Initialises the parent annotator, then builds the stemmer and the stop word pattern.
     *
     * @param aContext the UIMA context passed on to the parent initialisation
     * @throws ResourceInitializationException if the parent initialisation fails
     */
    @Override
    public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
        super.doInitialize(aContext);
        stemmer = createStemmer();
        stopwordPattern = StopwordUtils.buildStopwordPattern(stopwords, true);
    }

    /**
     * Creates the stemmer selected by the {@link #PARAM_STEMMING} parameter.
     *
     * @return a Snowball stemmer for the configured algorithm, or a no-op stemmer if the
     *         parameter is empty or does not name a known algorithm (a warning is logged in the latter case)
     */
    private Stemmer createStemmer() {
        if (Strings.isNullOrEmpty(stemming)) {
            return new NoOpStemmer();
        }
        try {
            // Enum constants are upper case; normalise so that e.g. "english" is accepted too.
            ALGORITHM algo = ALGORITHM.valueOf(stemming.trim().toUpperCase(Locale.ROOT));
            return new SnowballStemmer(algo);
        } catch (IllegalArgumentException iae) {
            getMonitor().warn("Value of {} does not match pre-defined list, no stemming will be used.", PARAM_STEMMING, iae);
            return new NoOpStemmer();
        }
    }

    /**
     * Extracts candidate phrases from the document text, scores them and adds the highest
     * scoring ones to the JCas as keywords.
     *
     * At most the top third of the distinct candidates (capped by maxKeywords) is targeted, but
     * all candidates sharing a score are added together, so ties can push the final count above
     * that target.
     *
     * @param jCas the document to process
     * @throws AnalysisEngineProcessException declared by the overridden method
     */
    @Override
    protected void doProcess(JCas jCas) throws AnalysisEngineProcessException {
        List<StemmedString> candidates = new ArrayList<>();
        //The definition of sentence as required by RAKE is different to that used by Baleen,
        //so we can't use the existing Sentence annotation.
        for (String sentence : splitSentences(getTextInTextBlocks(jCas))) {
            candidates.addAll(generateCandidates(sentence));
        }

        Map<StemmedString, Double> scores = calculateScores(candidates);
        Map<StemmedString, Double> keywords = generateKeywordScores(candidates, scores);

        // Index the candidates by score; the TreeMultimap keeps the scores in ascending order.
        Multimap<Double, StemmedString> keywordsByValue = TreeMultimap.create();
        keywords.forEach((k, v) -> keywordsByValue.put(v, k));

        int numKeywords = Integer.min(maxKeywords, keywords.size() / 3);
        List<Double> scoreValues = new ArrayList<>(keywordsByValue.keySet());
        List<StemmedString> finalKeywords = new ArrayList<>();
        // Walk the scores from highest to lowest until enough keywords have been collected.
        for (int index = scoreValues.size() - 1; index >= 0 && finalKeywords.size() < numKeywords; index--) {
            finalKeywords.addAll(keywordsByValue.get(scoreValues.get(index)));
        }

        List<String> keywordsString = finalKeywords.stream()
                .map(StemmedString::getOriginalString)
                .collect(Collectors.toList());
        addKeywordsToJCas(jCas, keywordsString);
    }

    /**
     * Splits a sentence on stop words and returns the non-empty fragments as candidate phrases,
     * each paired with its stemmed form.
     *
     * @param sentence a single sentence as produced by {@link #splitSentences(String)}
     * @return the trimmed, lower-cased candidates found in the sentence
     */
    private List<StemmedString> generateCandidates(String sentence) {
        List<StemmedString> normalizedCandidates = new ArrayList<>();
        for (String fragment : stopwordPattern.split(sentence)) {
            String normalized = fragment.trim().toLowerCase(Locale.ROOT);
            if (!normalized.isEmpty()) {
                normalizedCandidates.add(new StemmedString(normalized, stemPhrase(normalized)));
            }
        }
        return normalizedCandidates;
    }

    /**
     * Stems every word of a phrase individually and joins the stems with single spaces.
     *
     * The stemmer is applied per word because handing it the whole phrase would only
     * affect the end of the phrase, leaving the same word with different stems depending on
     * where it appears.
     *
     * @param phrase a trimmed, non-empty phrase
     * @return the phrase with each word replaced by its stem
     */
    private String stemPhrase(String phrase) {
        StringBuilder stemmed = new StringBuilder(phrase.length());
        for (String word : WHITESPACE.split(phrase)) {
            if (stemmed.length() > 0) {
                stemmed.append(' ');
            }
            CharSequence stem = stemmer.stem(word);
            // Keep the word itself if the stemmer yields nothing, so that the stemmed phrase
            // always has exactly one token per original word.
            stemmed.append(stem == null || stem.length() == 0 ? word : stem);
        }
        return stemmed.toString();
    }

    /**
     * Calculates the score of every word as its degree divided by its frequency.
     *
     * Each occurrence of a word adds the number of words in the containing candidate to the
     * word's degree; the frequency is the total number of occurrences across all candidates.
     *
     * @param candidates all candidate phrases found in the document
     * @return the score of each distinct (stemmed) word
     */
    private Map<StemmedString, Double> calculateScores(List<StemmedString> candidates) {
        Map<StemmedString, Integer> degree = new HashMap<>();
        Multiset<StemmedString> words = HashMultiset.create();
        for (StemmedString candidate : candidates) {
            List<StemmedString> splitWords = splitCandidate(candidate);
            int listDegree = splitWords.size();
            words.addAll(splitWords);
            for (StemmedString word : splitWords) {
                degree.merge(word, listDegree, Integer::sum);
            }
        }

        Map<StemmedString, Double> score = new HashMap<>();
        // Visit each distinct word once rather than once per occurrence.
        for (StemmedString word : words.elementSet()) {
            score.put(word, degree.get(word) / (double) words.count(word));
        }
        return score;
    }

    /**
     * Scores each candidate phrase as the sum of the scores of its words.
     *
     * @param candidates all candidate phrases found in the document
     * @param scores the word scores produced by {@link #calculateScores(List)}
     * @return the score of each distinct (stemmed) candidate phrase
     */
    private Map<StemmedString, Double> generateKeywordScores(List<StemmedString> candidates, Map<StemmedString, Double> scores) {
        Map<StemmedString, Double> keywords = new HashMap<>();
        for (StemmedString candidate : candidates) {
            double candidateScore = 0.0;
            for (StemmedString word : splitCandidate(candidate)) {
                candidateScore += scores.getOrDefault(word, 0.0);
            }
            keywords.put(candidate, candidateScore);
        }
        return keywords;
    }

    /**
     * Splits text into the sentences used by RAKE, breaking on the sentence delimiter characters.
     *
     * @param text the text to split; null is treated as empty
     * @return the trimmed, lower-cased, non-empty sentences in order of appearance
     */
    private List<String> splitSentences(String text) {
        List<String> returnedSentences = new ArrayList<>();
        if (text == null) {
            return returnedSentences;
        }
        for (String sentence : SENTENCE_DELIMITERS.split(text)) {
            String trimmed = sentence.trim();
            if (!trimmed.isEmpty()) {
                returnedSentences.add(trimmed.toLowerCase(Locale.ROOT));
            }
        }
        return returnedSentences;
    }

    /**
     * Splits a candidate phrase into its words, pairing each original word with its stem.
     *
     * @param candidate the candidate phrase to split
     * @return one entry per whitespace separated word of the original phrase
     */
    private List<StemmedString> splitCandidate(StemmedString candidate) {
        String[] origWords = WHITESPACE.split(candidate.getOriginalString());
        String[] stemWords = WHITESPACE.split(candidate.getStemmedString());
        List<StemmedString> split = new ArrayList<>(origWords.length);
        for (int i = 0; i < origWords.length; i++) {
            // Guard against a stemmed phrase with fewer tokens than the original.
            String stem = i < stemWords.length ? stemWords[i] : origWords[i];
            split.add(new StemmedString(origWords[i], stem));
        }
        return split;
    }

    /**
     * Describes the annotation types this annotator produces: Metadata always, and Buzzword
     * when addBuzzwords is enabled. No input types are declared.
     *
     * @return the action describing the inputs and outputs of this annotator
     */
    @Override
    public AnalysisEngineAction getAction() {
        Set<Class<? extends Annotation>> outputs = new HashSet<>();
        outputs.add(Metadata.class);
        if (addBuzzwords) {
            outputs.add(Buzzword.class);
        }
        return new AnalysisEngineAction(Collections.emptySet(), outputs);
    }
}
/**
* A class to hold two versions of a string in parallel - an original version and a stemmed version
*/
class StemmedString implements Comparable<StemmedString>{
private String strOrig;
private String strStemmed;
/**
* Create a StemmedString from two strings
*/
public StemmedString(String orig, String stemmed){
strOrig = orig;
strStemmed = stemmed;
}
/**
* Create a StemmedString from one CharSequence (original) and one String (stemmed)
*/
public StemmedString(CharSequence orig, String stemmed){
strOrig = orig.toString();
strStemmed = stemmed;
}
/**
* Create a StemmedString from one CharSequence (stemmed) and one String (original)
*/
public StemmedString(String orig, CharSequence stemmed){
strOrig = orig;
strStemmed = stemmed.toString();
}
/**
* Create a StemmedString from two CharSequences
*/
public StemmedString(CharSequence orig, CharSequence stemmed){
strOrig = orig.toString();
strStemmed = stemmed.toString();
}
/**
* Get the original string
*/
public String getOriginalString(){
return strOrig;
}
/**
* Get the stemmed string
*/
public String getStemmedString(){
return strStemmed;
}
@Override
public String toString(){
return strStemmed;
}
@Override
public int compareTo(StemmedString s) {
return strStemmed.compareTo(s.strStemmed);
}
@Override
public boolean equals(Object o){
if(o instanceof StemmedString || o instanceof String){
return strStemmed.equals(o.toString());
}
return false;
}
@Override
public int hashCode(){
return strStemmed.hashCode();
}
}