package uk.ac.shef.dcs.jate.algorithm; import uk.ac.shef.dcs.jate.JATEException; import uk.ac.shef.dcs.jate.feature.AbstractFeature; import uk.ac.shef.dcs.jate.feature.ContextWindow; import uk.ac.shef.dcs.jate.feature.FrequencyCtxBased; import uk.ac.shef.dcs.jate.feature.FrequencyTermBased; import uk.ac.shef.dcs.jate.model.JATETerm; import java.util.*; import org.apache.log4j.Logger; /** * An implementation of the TermEx term recognition algorithm. See Sclano e. al 2007, <i> * TermExtractor: a Web application to learn the shared terminology of emergent web communities</i> * <p> * In the formula w(t,Di ) =a* DR + B* DC + Y* LC, default values of a, B, and Y are 0.33. * </p> * <p> * This is the implementation of the scoring formula <b>only</b> and does not include the analysis of document structure * as discussed in the paper. */ public class TermEx extends ReferenceBased { private static final Logger LOG = Logger.getLogger(TermEx.class.getName()); private final double alpha; private final double beta; private final double zeta; public static final String SUFFIX_REF = "_REF"; public static final String SUFFIX_WORD = "_WORD"; public static final String SUFFIX_DOC = "_DOC"; public TermEx() { this(0.33, 0.33, 0.34, true); } public TermEx(double alpha, double beta, double zeta, boolean matchOOM) { super(matchOOM); this.alpha = alpha; this.beta = beta; this.zeta = zeta; } @Override public List<JATETerm> execute(Collection<String> candidates) throws JATEException { AbstractFeature feature = features.get(FrequencyTermBased.class.getName()); validateFeature(feature, FrequencyTermBased.class); FrequencyTermBased fFeatureTerms = (FrequencyTermBased) feature; AbstractFeature feature2 = features.get(FrequencyTermBased.class.getName() + SUFFIX_WORD); validateFeature(feature2, FrequencyTermBased.class); FrequencyTermBased fFeatureWords = (FrequencyTermBased) feature2; AbstractFeature feature4 = features.get(FrequencyCtxBased.class.getName() + SUFFIX_DOC); validateFeature(feature4, FrequencyCtxBased.class); FrequencyCtxBased fFeatureDocs = (FrequencyCtxBased) feature4; List<FrequencyTermBased> referenceFeatures = new ArrayList<>(); Map<FrequencyTermBased, Double> mapNullWordProbInReference = new HashMap<>(); Map<FrequencyTermBased, Double> mapRefScalars = new HashMap<>(); for (Map.Entry<String, AbstractFeature> en : features.entrySet()) { if (en.getKey().startsWith(FrequencyTermBased.class.getName() + SUFFIX_REF)) { validateFeature(en.getValue(), FrequencyTermBased.class); FrequencyTermBased fFeatureRef = (FrequencyTermBased) en.getValue(); referenceFeatures.add(fFeatureRef); mapNullWordProbInReference.put(fFeatureRef, setNullWordProbInReference(fFeatureRef)); mapRefScalars.put(fFeatureRef, matchOrdersOfMagnitude(fFeatureWords, fFeatureRef)); } } List<JATETerm> result = new ArrayList<>(); double totalWordsInCorpus = fFeatureWords.getCorpusTotal(); StringBuilder msg = new StringBuilder("Beginning computing TermEx values,"); msg.append(", total terms=" + candidates.size()); LOG.info(msg.toString()); for (String tString : candidates) { String[] elements = tString.split(" "); double T = (double) elements.length; double SUMfwi = 0.0; //the original paper looks up the term directly (tString). But in many case, technical terms //are unlikely to be found in reference corpus. So we break term into component words and look up words //then combine the scores double DP_upper=0.0, DP_lower=0.0, totalWordsInSelectedRefCorpus=0.0; for (int i = 0; i < T; i++) { String wi = elements[i]; SUMfwi += (double) fFeatureWords.getTTF(wi); DP_upper+=(double) fFeatureWords.getTTF(wi); double max_freq_t_dj = 0; FrequencyTermBased selectedRef = referenceFeatures.get(0); for (FrequencyTermBased refFeature : referenceFeatures) { double freqNorm = refFeature.getTTF(wi); if (freqNorm > max_freq_t_dj) { max_freq_t_dj = freqNorm; selectedRef = refFeature; totalWordsInSelectedRefCorpus=refFeature.getCorpusTotal(); } } if (max_freq_t_dj == 0) max_freq_t_dj = mapNullWordProbInReference.get(selectedRef); double theRefScalar = mapRefScalars.get(selectedRef); if(matchOOM) max_freq_t_dj *= theRefScalar; DP_lower+=max_freq_t_dj; } //calc DC Set<Integer> docs = fFeatureTerms.getTermFrequencyInDocument(tString).keySet(); double sum = 0; for (int i : docs) { //do not query for features using this context window. but query to get the //real context window object that matches this id ContextWindow c = new ContextWindow(); c.setDocId(i); c = fFeatureDocs.getContextWindow(c.toString()); if (c == null) LOG.error(String.format("TermEx error: expected context window does not exist in doc [%s]", i)); int tfid = fFeatureDocs.getTFIC(c).get(tString); int ttfid = fFeatureDocs.getMapCtx2TTF().get(c); double norm = tfid == 0 ? 0 : (double) tfid / ttfid; if (norm == 0) sum += 0; else { sum += norm * Math.log(norm/* + 0.1*/); } } double DP = (DP_upper/totalWordsInCorpus)/(DP_lower/totalWordsInSelectedRefCorpus); //double DP = SUMwi; //this term has been changed to ensure they are in the range of 0 and 1 double DC = 0-sum; double LC = SUMfwi == 0 ? 0 : (T * Math.log(fFeatureTerms.getTTF(tString) + 0.000001) * fFeatureTerms.getTTF(tString)) / SUMfwi; double score = alpha * DP + beta * DC + zeta * LC; JATETerm term = new JATETerm(tString, score); result.add(term); } Collections.sort(result); LOG.info("Complete"); return result; } }