package uk.ac.shef.dcs.jate.algorithm; import uk.ac.shef.dcs.jate.JATEException; import uk.ac.shef.dcs.jate.feature.AbstractFeature; import uk.ac.shef.dcs.jate.feature.FrequencyTermBased; import uk.ac.shef.dcs.jate.model.JATETerm; import java.util.*; import org.apache.log4j.Logger; /** * An implementation of the word weirdness algorithm applied to term recognition algorithm. See * Ahmad et al 1999, <i>Surrey Participation in TREC8: Weirdness Indexing for Logical Document Extrapolation * and Retrieval</i> */ public class Weirdness extends ReferenceBased { private static final Logger LOG = Logger.getLogger(Weirdness.class.getName()); public static final String SUFFIX_REF ="_REF"; public static final String SUFFIX_WORD ="_WORD"; public Weirdness(){ super(true); } public Weirdness(boolean matchOOM){ super(matchOOM); } @Override public List<JATETerm> execute(Collection<String> candidates) throws JATEException { AbstractFeature feature1 = features.get(FrequencyTermBased.class.getName()+SUFFIX_WORD); validateFeature(feature1, FrequencyTermBased.class); FrequencyTermBased fFeatureWords = (FrequencyTermBased) feature1; AbstractFeature feature2 = features.get(FrequencyTermBased.class.getName()+ SUFFIX_REF); validateFeature(feature2, FrequencyTermBased.class); FrequencyTermBased fFeatureRef = (FrequencyTermBased) feature2; List<JATETerm> result = new ArrayList<>(); double totalWordsInCorpus = fFeatureWords.getCorpusTotal(); StringBuilder msg = new StringBuilder("Beginning computing Weirdness values,"); msg.append(", total terms=" + candidates.size()); LOG.info(msg.toString()); nullWordProbInReference = setNullWordProbInReference(fFeatureRef); double refScalar = matchOrdersOfMagnitude(fFeatureWords, fFeatureRef); for(String tString: candidates) { JATETerm term = new JATETerm(tString); String[] elements = tString.split(" "); double T = (double) elements.length; double SUMwi = 0.0; for (int i = 0; i < T; i++) { String wi = elements[i]; double pc_wi = fFeatureRef.getTTFNorm(wi); if (pc_wi == 0) pc_wi = nullWordProbInReference; // if(matchOOM) pc_wi*=refScalar; int freq=fFeatureWords.getTTF(wi); if(freq==0) continue;//composing words can be stopwords and no frequency will be recorded double v = (double) freq / totalWordsInCorpus / pc_wi; //SUMwi += Math.log((double) gFeatureStore.getWordFreq(wi) / (double) gFeatureStore.getTotalCorpusWordFreq() / gFeatureStore.getRefWordFreqNorm(wi)); SUMwi += Math.log(v); } double TD = SUMwi / T; term.setScore(TD); result.add(term); } Collections.sort(result); LOG.info("Complete"); return result; } }