CommonKeywords.java example

Explorer
baleen-master
//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.misc;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.StringJoiner;
import java.util.TreeSet;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;

import com.google.common.base.Strings;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;

import opennlp.tools.stemmer.Stemmer;
import opennlp.tools.stemmer.snowball.SnowballStemmer;
import opennlp.tools.stemmer.snowball.SnowballStemmer.ALGORITHM;
import uk.gov.dstl.baleen.annotators.misc.helpers.AbstractKeywordsAnnotator;
import uk.gov.dstl.baleen.annotators.misc.helpers.NoOpStemmer;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.resources.utils.StopwordUtils;
import uk.gov.dstl.baleen.types.common.Buzzword;
import uk.gov.dstl.baleen.types.metadata.Metadata;

/**
 * This annotator attempts to identify keywords using the following process:
 * <ol>
 * <li>Lower-case the document text and split it on the stop word pattern</li>
 * <li>For each remaining word and/or phrase produce n-grams up to a maximum length</li>
 * <li>Stem each term of each n-gram</li>
 * <li>Count the occurrences of each stemmed n-gram, weighting the count based on n-gram length</li>
 * <li>Select the most commonly occurring stemmed n-grams</li>
 * <li>Convert each selected n-gram back to its most frequently seen original wording</li>
 * </ol>
 * 
 * @baleen.javadoc
 */
public class CommonKeywords extends AbstractKeywordsAnnotator {
	/** Punctuation stripped from the start of a term before it is recorded. */
	private static final Pattern LEADING_PUNCTUATION = Pattern.compile("^[-,\"\\(\\)':;]+");

	/** Punctuation stripped from the end of a term before it is recorded. */
	private static final Pattern TRAILING_PUNCTUATION = Pattern.compile("[-,\"\\(\\)':;]+$");

	/** Everything that is not a lower-case ASCII letter; removed from a term before stemming. */
	private static final Pattern NON_LOWERCASE_LETTER = Pattern.compile("[^a-z]");

	/** Separator used to break a phrase into individual terms. */
	private static final Pattern WHITESPACE = Pattern.compile("\\s+");

	/**
	 * The maximum n-gram length
	 * 
	 * @baleen.config 3
	 */
	public static final String PARAM_NGRAM_LENGTH = "ngram";
	@ConfigurationParameter(name = PARAM_NGRAM_LENGTH, defaultValue = "3")
	protected Integer maxLength;
	
	/**
	 * The stemming algorithm to use, as defined in OpenNLP's SnowballStemmer.ALGORITHM enum, e.g. ENGLISH.
	 * If not set, or set to an undefined value, then no stemming will be used
	 * 
	 * @baleen.config ENGLISH
	 */
	public static final String PARAM_STEMMING = "stemming";
	@ConfigurationParameter(name = PARAM_STEMMING, defaultValue = "ENGLISH")
	protected String stemming;
	
	/** Stemmer applied to every term; a {@link NoOpStemmer} when stemming is disabled or misconfigured. */
	private Stemmer stemmer;

	/** Compiled once in {@link #doInitialize(UimaContext)} and reused to split every document into phrases. */
	private Pattern stopwordPattern;
	
	/**
	 * Creates the stemmer selected by {@link #PARAM_STEMMING} and compiles the stop word pattern.
	 * 
	 * @param aContext the UIMA context, passed on to the superclass initialisation
	 * @throws ResourceInitializationException if the superclass initialisation fails
	 */
	@Override
	public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
		super.doInitialize(aContext);
		
		if (Strings.isNullOrEmpty(stemming)) {
			stemmer = new NoOpStemmer();
		} else {
			try {
				ALGORITHM algo = ALGORITHM.valueOf(stemming);
				stemmer = new SnowballStemmer(algo);
			} catch (IllegalArgumentException iae) {
				// An unknown algorithm name is not fatal: warn and carry on without stemming
				getMonitor().warn("Value of {} does not match pre-defined list, no stemming will be used.", PARAM_STEMMING, iae);
				stemmer = new NoOpStemmer();
			}
		}
		
		// Compile from the regex source with default flags, which is exactly what
		// String.split(String) would do, but only once rather than once per document
		stopwordPattern = Pattern.compile(
				StopwordUtils.buildStopwordPattern(stopwords, true, "[-.!?0-9]").pattern());
	}
	
	/**
	 * Extracts keywords from the text blocks of the document and hands them to
	 * {@code addKeywordsToJCas} via {@link #unstemAndAddKeywords}.
	 * 
	 * @param jCas the document being processed
	 * @throws AnalysisEngineProcessException declared by the overridden method
	 */
	@Override
	protected void doProcess(JCas jCas) throws AnalysisEngineProcessException {
		// Locale.ROOT keeps the lower-casing independent of the JVM default locale; the
		// term clean-up only retains [a-z], so e.g. a Turkish default locale (I -> dotless i)
		// would otherwise cause letters to be silently dropped
		String text = getTextInTextBlocks(jCas).toLowerCase(Locale.ROOT);
		
		List<String> phrases = Arrays.stream(stopwordPattern.split(text))
				.filter(s -> !s.isEmpty())
				.collect(Collectors.toList());
		
		Map<String, Double> stemCount = new HashMap<>();
		Map<String, Integer> wordCount = new HashMap<>();
		Multimap<String, String> stemToWord = HashMultimap.create();
		
		for (String phrase : phrases) {
			countNgrams(WHITESPACE.split(phrase), stemCount, wordCount, stemToWord);
		}
		
		unstemAndAddKeywords(jCas, selectTopStems(stemCount), stemToWord, wordCount);
	}

	/**
	 * Declares the annotation types this annotator produces: {@link Metadata}, plus
	 * {@link Buzzword} when {@code addBuzzwords} is set. No inputs are declared.
	 * 
	 * @return the action describing the inputs and outputs of this annotator
	 */
	@Override
	public AnalysisEngineAction getAction() {
		Set<Class<? extends Annotation>> outputs = new HashSet<>();
		outputs.add(Metadata.class);
		if (addBuzzwords) {
			outputs.add(Buzzword.class);
		}
		
		return new AnalysisEngineAction(Collections.emptySet(), outputs);
	}
	
	/**
	 * Records every n-gram (up to {@link #maxLength} terms) that starts at each position of a phrase.
	 * 
	 * @param terms the whitespace-separated terms of a single phrase
	 * @param stemCount weighted occurrence count per stemmed n-gram; updated in place
	 * @param wordCount occurrence count per original (unstemmed) n-gram; updated in place
	 * @param stemToWord the original n-grams seen for each stemmed n-gram; updated in place
	 */
	private void countNgrams(String[] terms, Map<String, Double> stemCount, Map<String, Integer> wordCount, Multimap<String, String> stemToWord) {
		// Guarded so that a maximum length of 1 does not divide by zero
		double weightDivisor = Math.max(1.0, maxLength - 1.0);
		
		for (int start = 0; start < terms.length; start++) {
			StringJoiner stemJoiner = new StringJoiner(" ");
			StringJoiner origJoiner = new StringJoiner(" ");
			
			for (int j = 0; j < maxLength && start + j < terms.length; j++) {
				String origTerm = stripPunctuation(terms[start + j]);
				String term = stemmer.stem(NON_LOWERCASE_LETTER.matcher(origTerm).replaceAll("")).toString();
				
				// A term with no letters ends the n-grams starting here, rather than being skipped over
				if (term.isEmpty()) {
					break;
				}
				
				stemJoiner.add(term);
				origJoiner.add(origTerm);
				
				// Boost the score of longer n-grams: 1.0 for a single term, rising to 2.0 at maxLength terms
				double weight = 1.0 + j / weightDivisor;
				
				String key = stemJoiner.toString();
				String origKey = origJoiner.toString();
				
				stemCount.merge(key, weight, Double::sum);
				wordCount.merge(origKey, 1, Integer::sum);
				stemToWord.put(key, origKey);
			}
		}
	}
	
	/**
	 * Removes leading and trailing punctuation from a term.
	 * 
	 * @param term the raw term
	 * @return the term without the surrounding punctuation; may be empty
	 */
	private static String stripPunctuation(String term) {
		String withoutLeading = LEADING_PUNCTUATION.matcher(term).replaceAll("");
		return TRAILING_PUNCTUATION.matcher(withoutLeading).replaceAll("");
	}
	
	/**
	 * Selects the stemmed n-grams with the highest weighted counts.
	 * All n-grams sharing a count are taken together, so the result may contain more than
	 * {@code maxKeywords} entries when there are ties at the cut-off.
	 * 
	 * @param stemCount weighted occurrence count per stemmed n-gram
	 * @return the selected stemmed n-grams, highest count first
	 */
	private List<String> selectTopStems(Map<String, Double> stemCount) {
		Multimap<Double, String> countToStem = HashMultimap.create();
		Set<Double> countValues = new TreeSet<>(Collections.reverseOrder());
		
		for (Entry<String, Double> e : stemCount.entrySet()) {
			countToStem.put(e.getValue(), e.getKey());	//(Count, Key)
			countValues.add(e.getValue());
		}
		
		List<String> stemmedKeywords = new ArrayList<>();
		for (Double count : countValues) {
			stemmedKeywords.addAll(countToStem.get(count));
			
			if (stemmedKeywords.size() >= maxKeywords) {
				break;
			}
		}
		
		return stemmedKeywords;
	}
	
	/**
	 * Maps each stemmed keyword back to its original wording and adds the results to the document.
	 * The most frequent original form of each stem becomes a selected keyword; the remaining
	 * forms are passed on as additional keywords.
	 * 
	 * @param jCas the document being processed
	 * @param stemmedKeywords the stemmed n-grams chosen as keywords
	 * @param stemToWord the original n-grams seen for each stemmed n-gram
	 * @param wordCount occurrence count per original n-gram
	 */
	private void unstemAndAddKeywords(JCas jCas, List<String> stemmedKeywords, Multimap<String, String> stemToWord, Map<String, Integer> wordCount){
		List<String> selectedKeywords = new ArrayList<>();
		List<String> additionalKeywords = new ArrayList<>();
		
		for (String stemmed : stemmedKeywords) {
			Collection<String> keywords = stemToWord.get(stemmed);
			String bestKeyword = selectBestUnstemmedWord(keywords, wordCount);
			
			additionalKeywords.addAll(keywords);
			
			selectedKeywords.add(bestKeyword);
			additionalKeywords.remove(bestKeyword);
		}
		
		addKeywordsToJCas(jCas, selectedKeywords, additionalKeywords);
	}
	
	/**
	 * Picks the candidate with the highest occurrence count; on a tie the first one encountered wins.
	 * 
	 * @param keywords the candidate original n-grams
	 * @param wordCount occurrence count per original n-gram; a missing entry counts as zero
	 * @return the most frequent candidate, or an empty string if no candidate has a positive count
	 */
	private String selectBestUnstemmedWord(Collection<String> keywords, Map<String, Integer> wordCount){
		String bestKeyword = "";
		int bestCount = 0;
		
		for (String keyword : keywords) {
			// getOrDefault avoids a NullPointerException from unboxing a missing count
			int count = wordCount.getOrDefault(keyword, 0);
			if (count > bestCount) {
				bestCount = count;
				bestKeyword = keyword;
			}
		}
		
		return bestKeyword;
	}
	
}