package uk.ac.shef.dcs.jate.feature;
import uk.ac.shef.dcs.jate.JATERecursiveTaskWorker;
import java.util.*;
import org.apache.log4j.Logger;
/**
* AN IMPORTANT ASSUMPTION IS THAT contexts are disjoint, e.g., sentences. In window-based contexts, co-occurrence can be
* double-counted. This is corrected by the CooccurrenceFBMaster class as a post-process.
*
* @see CooccurrenceFBMaster
*/
public class CooccurrenceFBWorker extends JATERecursiveTaskWorker<ContextWindow, Integer> {

    private static final long serialVersionUID = 2618520228983802927L;
    private static final Logger LOG = Logger.getLogger(CooccurrenceFBWorker.class.getName());

    /** Per-context frequencies of the target terms. */
    private final FrequencyCtxBased frequencyCtxBased;
    /** Source of the TTF value compared against {@link #minTTF}. */
    private final FrequencyTermBased frequencyTermBased;
    /** Per-context frequencies of the reference terms. */
    private final FrequencyCtxBased ref_frequencyCtxBased;
    /** Minimum TTF a target term must reach; the check is disabled when this is not positive. */
    private final int minTTF;
    /** Minimum number of contexts a target term must appear in; disabled when not positive. */
    private final int minTCF;
    /** Co-occurrence feature that this worker, and every worker split from it, writes into. */
    protected final Cooccurrence feature;

    /**
     * Creates a worker over the given context windows.
     *
     * @param feature               co-occurrence feature that receives the counts
     * @param contextWindowIds      context windows handed to the superclass as this worker's tasks
     * @param frequencyTermBased    queried for the TTF of each target term
     * @param minTTF                minimum TTF for a target term; not applied when {@code <= 0}
     * @param frequencyCtxBased     per-context frequencies of target terms
     * @param ref_frequencyCtxBased per-context frequencies of reference terms
     * @param minTCF                minimum context count for a target term; not applied when {@code <= 0}
     * @param maxTasksPerWorker     passed through to the superclass constructor
     */
    public CooccurrenceFBWorker(Cooccurrence feature, List<ContextWindow> contextWindowIds,
                                FrequencyTermBased frequencyTermBased,
                                int minTTF,
                                FrequencyCtxBased frequencyCtxBased,
                                FrequencyCtxBased ref_frequencyCtxBased,
                                int minTCF,
                                int maxTasksPerWorker) {
        super(contextWindowIds, maxTasksPerWorker);
        this.feature = feature;
        this.frequencyTermBased = frequencyTermBased;
        this.minTTF = minTTF;
        this.frequencyCtxBased = frequencyCtxBased;
        this.ref_frequencyCtxBased = ref_frequencyCtxBased;
        this.minTCF = minTCF;
    }

    /**
     * Builds a worker for a subset of the context windows that shares this worker's feature,
     * frequency sources and thresholds.
     *
     * @param contextWindowIdSplit the context windows assigned to the new worker
     * @return a new {@code CooccurrenceFBWorker} configured like this one
     */
    @Override
    protected JATERecursiveTaskWorker<ContextWindow, Integer> createInstance(List<ContextWindow> contextWindowIdSplit) {
        return new CooccurrenceFBWorker(
                feature,
                contextWindowIdSplit,
                frequencyTermBased,
                minTTF,
                frequencyCtxBased,
                ref_frequencyCtxBased,
                minTCF,
                maxTasksPerThread);
    }

    /**
     * Joins every given worker and adds up their results.
     *
     * @param jateRecursiveTaskWorkers the workers whose results are combined
     * @return the sum of the values returned by {@code join()} on each worker
     */
    @Override
    protected Integer mergeResult(List<JATERecursiveTaskWorker<ContextWindow, Integer>> jateRecursiveTaskWorkers) {
        int joinedTotal = 0;
        for (JATERecursiveTaskWorker<ContextWindow, Integer> subWorker : jateRecursiveTaskWorkers) {
            joinedTotal = joinedTotal + subWorker.join();
        }
        return joinedTotal;
    }

    /**
     * Walks the given context windows and, for every target term that passes the frequency
     * thresholds, adds its co-occurrence with each reference term found in the same context
     * to {@link #feature}.
     *
     * @param contextWindows the context windows to process
     * @return the number of context windows processed
     */
    @Override
    protected Integer computeSingleWorker(List<ContextWindow> contextWindows) {
        LOG.info("Total ctx to process=" + contextWindows.size()
                + ", total ref terms=" + ref_frequencyCtxBased.getMapTerm2Ctx().size());

        int processed = 0;
        // Running sizes reported in, and reset by, the periodic progress log below.
        int targetTermsSeen = 0;
        int refTermsSeen = 0;

        for (ContextWindow ctx : contextWindows) {
            // Reference terms found in this context, with their in-context frequency.
            // A context without reference terms is not skipped: target terms that pass the
            // thresholds still go through feature.lookupAndIndexTerm.
            Map<String, Integer> refTermFreqs = ref_frequencyCtxBased.getTFIC(ctx);
            // Target terms found in this context, with their in-context frequency.
            Map<String, Integer> targetTermFreqs = frequencyCtxBased.getTFIC(ctx);

            // NOTE: a context may contain no target terms at all because of target term
            // filtering. Consequently the reference terms actually indexed in the feature
            // may differ from ref_frequencyCtxBased.getMapTerm2Ctx().
            List<String> targetTerms = new ArrayList<>(targetTermFreqs.keySet());
            targetTermsSeen += targetTerms.size();
            refTermsSeen += refTermFreqs.size();

            for (String targetTerm : targetTerms) {
                if (isBelowThresholds(targetTerm)) {
                    continue;
                }
                recordCooccurrences(targetTerm, targetTermFreqs.get(targetTerm), refTermFreqs);
            }

            processed++;
            // Progress report every 100000 contexts; the counters restart after each report.
            if (processed % 100000 == 0) {
                LOG.info(processed + "/" + contextWindows.size() + " (t=" + targetTermsSeen + " x reft=" + refTermsSeen + ")");
                targetTermsSeen = 0;
                refTermsSeen = 0;
            }
        }

        LOG.info("complete calculation for one worker: total:" + processed);
        return processed;
    }

    /**
     * Tells whether a target term fails an enabled threshold. The TTF check runs first; the
     * context-count check is only evaluated when the TTF check did not already reject the term.
     */
    private boolean isBelowThresholds(String targetTerm) {
        if (minTTF > 0 && frequencyTermBased.getTTF(targetTerm) < minTTF) {
            return true;
        }
        return minTCF > 0 && frequencyCtxBased.getContexts(targetTerm).size() < minTCF;
    }

    /**
     * Indexes the target term in {@link #feature} and increments its co-occurrence with every
     * reference term of the context other than itself. The amount added for a pair is the
     * smaller of the two in-context frequencies.
     */
    private void recordCooccurrences(String targetTerm, int targetFIC, Map<String, Integer> refTermFreqs) {
        int targetIdx = feature.lookupAndIndexTerm(targetTerm);
        for (Map.Entry<String, Integer> refEntry : refTermFreqs.entrySet()) {
            String refTerm = refEntry.getKey();
            if (!refTerm.equals(targetTerm)) {
                int refFIC = refEntry.getValue();
                int refIdx = feature.lookupAndIndexRefTerm(refTerm);
                feature.increment(targetIdx, refIdx, Math.min(targetFIC, refFIC));
            }
        }
    }
}