package uk.ac.shef.dcs.jate.algorithm;

import org.apache.log4j.Logger;
import uk.ac.shef.dcs.jate.JATEException;
import uk.ac.shef.dcs.jate.feature.AbstractFeature;
import uk.ac.shef.dcs.jate.feature.FrequencyTermBased;
import uk.ac.shef.dcs.jate.feature.TermComponentIndex;
import uk.ac.shef.dcs.jate.model.JATETerm;

import java.util.*;
import java.util.concurrent.ForkJoinPool;

/**
 * RAKE term scoring algorithm.
 *
 * <p>Reference: Rose, S., Engel, D., Cramer, N., &amp; Cowley, W. (2010).
 * Automatic Keyword Extraction from Individual Documents. In M. W. Berry &amp; J. Kogan (Eds.),
 * Text Mining: Theory and Applications: John Wiley &amp; Sons.
 *
 * <p>The algorithm looks up three features from the registered feature map:
 * two {@link FrequencyTermBased} features, keyed by the class name plus
 * {@link #SUFFIX_WORD} and {@link #SUFFIX_TERM} respectively, and one
 * {@link TermComponentIndex} feature keyed by its class name.
 */
public class RAKE extends Algorithm {
    private static final Logger LOG = Logger.getLogger(RAKE.class.getName());

    /** Key suffix used to look up the {@link FrequencyTermBased} feature passed to the worker as the "words" feature. */
    public static final String SUFFIX_WORD = "_WORD";

    /** Key suffix used to look up the {@link FrequencyTermBased} feature passed to the worker as the "terms" feature. */
    public static final String SUFFIX_TERM = "_TERM";

    /**
     * Computes RAKE values for the given candidates and returns them sorted.
     *
     * <p>The work is delegated to a single {@code RAKEWorker} task that is invoked on a
     * dedicated {@link ForkJoinPool} sized to the number of available processors. The pool
     * is shut down before this method returns, whether or not the computation succeeds.
     *
     * @param candidates candidate term strings to score; copied into a new list before
     *                   being handed to the worker
     * @return the worker's result, sorted in place with {@link Collections#sort(List)}
     * @throws JATEException declared by the signature; presumably raised by feature
     *                       validation when a required feature is missing or of the wrong
     *                       type (verify against {@code Algorithm#validateFeature})
     */
    @Override
    public List<JATETerm> execute(Collection<String> candidates) throws JATEException {
        AbstractFeature wordFeature = features.get(FrequencyTermBased.class.getName() + SUFFIX_WORD);
        validateFeature(wordFeature, FrequencyTermBased.class);
        FrequencyTermBased fFeatureWords = (FrequencyTermBased) wordFeature;

        AbstractFeature termFeature = features.get(FrequencyTermBased.class.getName() + SUFFIX_TERM);
        validateFeature(termFeature, FrequencyTermBased.class);
        FrequencyTermBased fFeatureTerms = (FrequencyTermBased) termFeature;

        AbstractFeature tciFeature = features.get(TermComponentIndex.class.getName());
        validateFeature(tciFeature, TermComponentIndex.class);
        TermComponentIndex fFeatureTermCompIndex = (TermComponentIndex) tciFeature;

        int cores = Runtime.getRuntime().availableProcessors();
        int maxPerWorker = candidates.size() / cores;
        if (maxPerWorker == 0) {
            // Fewer candidates than cores: fall back to a fixed value for the log line.
            maxPerWorker = 50;
        }

        StringBuilder msg = new StringBuilder("Beginning computing RAKE values, cores=");
        msg.append(cores)
                .append(" total terms=").append(candidates.size()).append(",")
                .append(" max terms per worker thread=").append(maxPerWorker);
        LOG.info(msg.toString());

        ForkJoinPool forkJoinPool = new ForkJoinPool(cores);
        List<JATETerm> result;
        try {
            // NOTE(review): the worker receives Integer.MAX_VALUE, not maxPerWorker. If that
            // argument is the per-worker batch limit, the value logged above does not reflect
            // the setting actually used -- confirm against RAKEWorker before changing.
            RAKEWorker worker = new RAKEWorker(new ArrayList<>(candidates), Integer.MAX_VALUE,
                    fFeatureWords, fFeatureTerms, fFeatureTermCompIndex);
            result = forkJoinPool.invoke(worker);
        } finally {
            // A fresh pool is created per call; release its worker threads once done so
            // repeated calls do not accumulate idle pools.
            forkJoinPool.shutdown();
        }

        Collections.sort(result);
        LOG.info("Complete");
        return result;
    }
}