package uk.ac.shef.dcs.jate.algorithm; import uk.ac.shef.dcs.jate.JATEException; import uk.ac.shef.dcs.jate.feature.AbstractFeature; import uk.ac.shef.dcs.jate.feature.FrequencyTermBased; import uk.ac.shef.dcs.jate.model.JATETerm; import java.util.*; import org.apache.log4j.Logger; /** * An implementation of the GlossEx term recognition algorithm. See Park, et. al 2002, <i> * Automatic Glossary Extraction: beyond terminology identification</i> *. This is the implementation of the scoring formula <b>only</b>, and does not include the filtering algorithm as mentioned * in the paper. * <p> * In the equation C(T) = a* TD(T) + B*TC(T), default a=0.2, B = 0.8. * </p> * * You might need to modify the value of B by increasing it substaintially when the reference corpus is relatively * much bigger than the target corpus, such as the BNC corpus. For details, please refer to the paper. * */ public class GlossEx extends ReferenceBased { protected final double alpha; protected final double beta; private static final Logger LOG = Logger.getLogger(GlossEx.class.getName()); public static final String SUFFIX_REF ="_REF"; public static final String SUFFIX_WORD ="_WORD"; public GlossEx(){ this(0.2,0.8,true); } public GlossEx(double alpha, double beta, boolean matchOOM){ super(matchOOM); this.alpha = alpha; this.beta = beta; } public List<JATETerm> execute(Collection<String> candidates) throws JATEException { AbstractFeature feature = features.get(FrequencyTermBased.class.getName()); validateFeature(feature, FrequencyTermBased.class); FrequencyTermBased fFeatureTerms = (FrequencyTermBased) feature; AbstractFeature feature2 = features.get(FrequencyTermBased.class.getName() + SUFFIX_WORD); validateFeature(feature2, FrequencyTermBased.class); FrequencyTermBased fFeatureWords = (FrequencyTermBased) feature2; AbstractFeature feature3 = features.get(FrequencyTermBased.class.getName() + SUFFIX_REF); validateFeature(feature3, FrequencyTermBased.class); FrequencyTermBased fFeatureRef = (FrequencyTermBased) feature3; nullWordProbInReference = setNullWordProbInReference(fFeatureRef); double refScalar = matchOrdersOfMagnitude(fFeatureWords, fFeatureRef); List<JATETerm> result = new ArrayList<>(); double totalWordsInCorpus = fFeatureWords.getCorpusTotal(); LOG.info("Calculating GlossEx for "+candidates.size()+" candidate terms."); for (String tString : candidates) { int ttf = fFeatureTerms.getTTF(tString); double score; String[] elements = tString.split(" "); double T = (double) elements.length; double SUMwi = 0.0; double SUMfwi = 0.0; //some terms are artificially incremented by 1 to avoid division by 0 and also to for (int i = 0; i < T; i++) { String wi = elements[i]; double pc_wi = fFeatureRef.getTTFNorm(wi); if (pc_wi == 0) pc_wi = nullWordProbInReference; // if(matchOOM) pc_wi*=refScalar; SUMwi += /*Math.log(*/(double) fFeatureWords.getTTF(wi) / totalWordsInCorpus / pc_wi/*)*/; SUMfwi += (double) fFeatureWords.getTTF(wi); } double TD = SUMwi / T; double TC = (T * Math.log10(ttf + 1) * ttf) / (SUMfwi + 1); if (T == 1) score = 0.9 * TD + 0.1 * TC; else score = alpha * TD + beta * TC; JATETerm term = new JATETerm(tString, score); result.add(term); } Collections.sort(result); LOG.info("Complete"); return result; } }