// RakeKeywords.java example
//
// Explorer
// baleen-master
//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.misc;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;

import com.google.common.base.Strings;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multiset;
import com.google.common.collect.TreeMultimap;

import opennlp.tools.stemmer.Stemmer;
import opennlp.tools.stemmer.snowball.SnowballStemmer;
import opennlp.tools.stemmer.snowball.SnowballStemmer.ALGORITHM;
import uk.gov.dstl.baleen.annotators.misc.helpers.AbstractKeywordsAnnotator;
import uk.gov.dstl.baleen.annotators.misc.helpers.NoOpStemmer;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.resources.utils.StopwordUtils;
import uk.gov.dstl.baleen.types.common.Buzzword;
import uk.gov.dstl.baleen.types.metadata.Metadata;

/**
 * Uses the RAKE (Rapid Automatic Keyword Extraction) algorithm to automatically
 * identify keywords in each document.
 * 
 * These keywords will be added as metadata to the document, and optionally can
 * also be added as Buzzwords
 * 
 * Based on the paper 'Automatic keyword extraction from individual documents' by 
 * Stuart Rose, Dave Engel, Nick Cramer and Wendy Cowley.
 * 
 * Optionally, you can choose to stem words prior to scoring, which will address
 * any variability in words caused by plurals, tense, etc.
 * This is an extension from the original paper. Essentially, the annotator maintains
 * a mapping between a stemmed version and the original version of the key phrase,
 * using the stemmed version for scoring and calculations, and then returning the
 * original version when required for output.
 * 
 * @baleen.javadoc
 */
public class RakeKeywords extends AbstractKeywordsAnnotator {

	/**
	 * The stemming algorithm to use, as defined in OpenNLP's SnowballStemmer.ALGORITHM enum, e.g. ENGLISH.
	 * If not set, or set to an undefined value, then no stemming will be used
	 * 
	 * @baleen.config
	 */
	public static final String PARAM_STEMMING = "stemming";
	@ConfigurationParameter(name = PARAM_STEMMING, defaultValue = "")
	protected String stemming;

	/** Characters that separate RAKE "sentences"; compiled once rather than on every document. */
	private static final Pattern SENTENCE_DELIMITERS = Pattern.compile("[-.!?,;:\\n\\t\\\"\\'\\(\\)\u2019\u2013\\\\\\/]");

	/** Separator between the words of a candidate phrase. */
	private static final Pattern WHITESPACE = Pattern.compile("\\s+");

	private Pattern stopwordPattern;
	private Stemmer stemmer;

	/**
	 * Initialises the parent annotator, then selects the stemmer and builds the
	 * stop word pattern used to cut sentences into candidate phrases.
	 *
	 * @param aContext the UIMA context, passed through to the parent
	 * @throws ResourceInitializationException if the parent fails to initialise
	 */
	@Override
	public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
		super.doInitialize(aContext);

		stemmer = createStemmer();
		stopwordPattern = StopwordUtils.buildStopwordPattern(stopwords, true);
	}

	/**
	 * Builds the stemmer named by {@link #PARAM_STEMMING}.
	 *
	 * @return a Snowball stemmer for the configured algorithm, or a no-op stemmer when
	 *         the parameter is empty or is not a value of the ALGORITHM enum
	 */
	private Stemmer createStemmer() {
		if (Strings.isNullOrEmpty(stemming)) {
			return new NoOpStemmer();
		}

		try {
			return new SnowballStemmer(ALGORITHM.valueOf(stemming));
		} catch (IllegalArgumentException iae) {
			getMonitor().warn("Value of {} does not match pre-defined list, no stemming will be used.", PARAM_STEMMING, iae);
			return new NoOpStemmer();
		}
	}

	/**
	 * Extracts candidate phrases from the document text, scores them and hands the
	 * best scoring ones to {@code addKeywordsToJCas}.
	 *
	 * @param jCas the document being processed
	 * @throws AnalysisEngineProcessException declared by the overridden method
	 */
	@Override
	protected void doProcess(JCas jCas) throws AnalysisEngineProcessException {
		List<StemmedString> candidates = new ArrayList<>();

		//The definition of sentence as required by RAKE is different to that used by Baleen,
		//so we can't use the existing Sentence annotation.
		for (String sentence : splitSentences(getTextInTextBlocks(jCas))) {
			candidates.addAll(generateCandidates(sentence));
		}

		Map<StemmedString, Double> wordScores = calculateScores(candidates);
		Map<StemmedString, Double> keywords = generateKeywordScores(candidates, wordScores);

		List<String> keywordsString = selectTopKeywords(keywords).stream()
				.map(StemmedString::getOriginalString)
				.collect(Collectors.toList());

		addKeywordsToJCas(jCas, keywordsString);
	}

	/**
	 * Picks the highest scoring keywords.
	 *
	 * The target count is the smaller of {@code maxKeywords} and one third of the
	 * distinct candidates. Keywords are taken one score group at a time, highest
	 * score first, and a group is always added whole, so the result can be longer
	 * than the target when several keywords share a score.
	 *
	 * @param keywords candidate phrases mapped to their score
	 * @return the selected keywords, highest score first
	 */
	private List<StemmedString> selectTopKeywords(Map<StemmedString, Double> keywords) {
		int numKeywords = Integer.min(maxKeywords, keywords.size() / 3);

		// TreeMultimap keeps the scores in ascending order
		Multimap<Double, StemmedString> keywordsByValue = TreeMultimap.create();
		keywords.forEach((keyword, score) -> keywordsByValue.put(score, keyword));

		List<Double> ascendingScores = new ArrayList<>(keywordsByValue.keySet());
		List<StemmedString> selected = new ArrayList<>();

		// Walk from the end of the list, i.e. from the highest score downwards
		for (int i = ascendingScores.size() - 1; i >= 0 && selected.size() < numKeywords; i--) {
			selected.addAll(keywordsByValue.get(ascendingScores.get(i)));
		}

		return selected;
	}

	/**
	 * Splits a sentence on the stop word pattern and turns each non-blank fragment
	 * into a lower-cased candidate phrase paired with its stemmed form.
	 *
	 * @param sentence a sentence as produced by {@link #splitSentences(String)}
	 * @return the candidate phrases, in order of appearance
	 */
	private List<StemmedString> generateCandidates(String sentence) {
		List<StemmedString> normalizedCandidates = new ArrayList<>();

		for (String fragment : stopwordPattern.split(sentence)) {
			// Locale.ROOT keeps the result independent of the JVM's default locale
			String normalized = fragment.trim().toLowerCase(Locale.ROOT);

			if (!normalized.isEmpty()) {
				normalizedCandidates.add(new StemmedString(normalized, stemPhrase(normalized)));
			}
		}

		return normalizedCandidates;
	}

	/**
	 * Stems a phrase one word at a time and joins the stems with single spaces.
	 *
	 * The stemmer is applied per word so that a word receives the same stem
	 * wherever it sits in a phrase; handing the stemmer the whole phrase would
	 * make the stem depend on the surrounding words.
	 *
	 * @param phrase a trimmed phrase of whitespace separated words
	 * @return the stemmed phrase, with one stem per word of the input
	 */
	private String stemPhrase(String phrase) {
		return WHITESPACE.splitAsStream(phrase)
				.map(this::stemWord)
				.collect(Collectors.joining(" "));
	}

	/**
	 * Stems a single word.
	 *
	 * @param word the word to stem
	 * @return the stem, or the word itself if the stemmer returns nothing, so that
	 *         every word keeps exactly one stemmed counterpart
	 */
	private String stemWord(String word) {
		CharSequence stemmed = stemmer.stem(word);
		return stemmed == null || stemmed.length() == 0 ? word : stemmed.toString();
	}

	/**
	 * Scores every word as degree / frequency, where frequency is the number of
	 * occurrences of the word across all candidates and degree is the summed
	 * length (in words) of the candidates the word occurs in.
	 *
	 * @param candidates all candidate phrases, including repeats
	 * @return the score of each distinct word
	 */
	private Map<StemmedString, Double> calculateScores(List<StemmedString> candidates) {
		Map<StemmedString, Integer> degree = new HashMap<>();
		Multiset<StemmedString> frequency = HashMultiset.create();

		for (StemmedString candidate : candidates) {
			List<StemmedString> words = splitCandidate(candidate);
			int phraseLength = words.size();

			frequency.addAll(words);
			for (StemmedString word : words) {
				degree.merge(word, phraseLength, Integer::sum);
			}
		}

		Map<StemmedString, Double> score = new HashMap<>();
		// elementSet() visits each distinct word once instead of once per occurrence
		for (StemmedString word : frequency.elementSet()) {
			score.put(word, degree.get(word) / (double) frequency.count(word));
		}

		return score;
	}

	/**
	 * Scores each candidate phrase as the sum of the scores of its words.
	 *
	 * @param candidates all candidate phrases, including repeats
	 * @param scores the word scores from {@link #calculateScores(List)}; a word
	 *        missing from the map contributes 0
	 * @return the score of each distinct candidate phrase
	 */
	private Map<StemmedString, Double> generateKeywordScores(List<StemmedString> candidates, Map<StemmedString, Double> scores) {
		Map<StemmedString, Double> keywords = new HashMap<>();

		for (StemmedString candidate : candidates) {
			double candidateScore = 0.0;

			for (StemmedString word : splitCandidate(candidate)) {
				candidateScore += scores.getOrDefault(word, 0.0);
			}

			keywords.put(candidate, candidateScore);
		}

		return keywords;
	}

	/**
	 * Splits text at punctuation, line breaks and tabs, and returns the non-blank
	 * pieces trimmed and lower-cased.
	 *
	 * @param text the text to split
	 * @return the sentences, in order of appearance
	 */
	private List<String> splitSentences(String text) {
		List<String> returnedSentences = new ArrayList<>();

		for (String sentence : SENTENCE_DELIMITERS.split(text)) {
			String trimmed = sentence.trim();
			if (!trimmed.isEmpty()) {
				// Locale.ROOT keeps the result independent of the JVM's default locale
				returnedSentences.add(trimmed.toLowerCase(Locale.ROOT));
			}
		}

		return returnedSentences;
	}

	/**
	 * Splits a candidate phrase into words, pairing each original word with the
	 * stemmed word at the same position.
	 *
	 * @param candidate the phrase to split
	 * @return one entry per word of the original phrase
	 */
	private List<StemmedString> splitCandidate(StemmedString candidate) {
		String[] originalWords = WHITESPACE.split(candidate.getOriginalString());
		String[] stemmedWords = WHITESPACE.split(candidate.getStemmedString());

		List<StemmedString> split = new ArrayList<>(originalWords.length);

		for (int i = 0; i < originalWords.length; i++) {
			// If the stemmed form has fewer words than the original, use the original
			// word as its own stem rather than reading past the end of the array
			String stem = i < stemmedWords.length ? stemmedWords[i] : originalWords[i];
			split.add(new StemmedString(originalWords[i], stem));
		}

		return split;
	}
	
	/**
	 * Declares the annotation types this annotator produces: Metadata always, and
	 * Buzzword as well when {@code addBuzzwords} is set. No inputs are declared.
	 */
	@Override
	public AnalysisEngineAction getAction() {
		Set<Class<? extends Annotation>> outputs = new HashSet<>();
		outputs.add(Metadata.class);
		if (addBuzzwords) {
			outputs.add(Buzzword.class);
		}
		
		return new AnalysisEngineAction(Collections.emptySet(), outputs);
	}
}

/**
 * A class to hold two versions of a string in parallel - an original version and a stemmed version
 */
class StemmedString implements Comparable<StemmedString>{
	private String strOrig;
	private String strStemmed;

	/**
	 * Create a StemmedString from two strings
	 */
	public StemmedString(String orig, String stemmed){
		strOrig = orig;
		strStemmed = stemmed;
	}

	/**
	 * Create a StemmedString from one CharSequence (original) and one String (stemmed)
	 */
	public StemmedString(CharSequence orig, String stemmed){
		strOrig = orig.toString();
		strStemmed = stemmed;
	}
	/**
	 * Create a StemmedString from one CharSequence (stemmed) and one String (original)
	 */
	public StemmedString(String orig, CharSequence stemmed){
		strOrig = orig;
		strStemmed = stemmed.toString();
	}

	/**
	 * Create a StemmedString from two CharSequences
	 */
	public StemmedString(CharSequence orig, CharSequence stemmed){
		strOrig = orig.toString();
		strStemmed = stemmed.toString();
	}

	/**
	 * Get the original string
	 */
	public String getOriginalString(){
		return strOrig;
	}

	/**
	 * Get the stemmed string
	 */
	public String getStemmedString(){
		return strStemmed;
	}

	@Override
	public String toString(){
		return strStemmed;
	}

	@Override
	public int compareTo(StemmedString s) {
		return strStemmed.compareTo(s.strStemmed);
	}

	@Override
	public boolean equals(Object o){
		if(o instanceof StemmedString || o instanceof String){
			return strStemmed.equals(o.toString());
		}

		return false;
	}

	@Override
	public int hashCode(){
		return strStemmed.hashCode();
	}
}