package uk.ac.shef.dcs.jate.feature;
import org.apache.solr.search.SolrIndexSearcher;
import uk.ac.shef.dcs.jate.JATEException;
import uk.ac.shef.dcs.jate.JATEProperties;
import java.util.*;
import java.util.concurrent.ForkJoinPool;
import org.apache.log4j.Logger;
/**
* This class counts pair-wise co-occurrence between two lists of candidates: one called the <b>target</b> candidate terms
* (the terms we want to consider as real domain terms), the other called the <b>reference</b> candidate terms
* (lexical items whose co-occurrence behaviour with the target candidates we are interested in). The two
* lists can be identical.
*
* <p>The candidates are provided in the form of FrequencyCtxBased objects. A FrequencyCtxBased object stores a set of
* candidate terms, and the contexts where they appear. An example scenario is described below.
*
* <p>In Chi-Square, we need to calculate co-occurrence between every target candidate term and the most frequent n candidate
* terms (the reference terms). To do so, we create two FrequencyCtxBased objects. The first stores all candidate terms and the
* contexts they appear in. The second is a subset of the first, and only stores the most frequent n candidates and the
* contexts they appear in.
*
* <p><b>NOTE 1</b>: We must ensure that the first and second FrequencyCtxBased objects use the same context windows.
*
* <p>Next, to calculate co-occurrence, we take a target candidate term (t) from the first FrequencyCtxBased object and pair it with a
* reference term (rt) from the second FrequencyCtxBased object. We then find the context windows that both t and rt appear in, denoted
* by X. For each x in X, we look up the frequency of t in x and the frequency of rt in x. The co-occurrence frequency is
* therefore the smaller of the two frequencies.
*
* <p><b>NOTE 2</b>: This method works if contexts are mutually exclusive, i.e., they do not overlap (e.g., a document or
* a sentence as a context). When contexts are likely to overlap (e.g., a context window of size n around candidate terms),
* co-occurrences of terms appearing in an overlap are double-counted. This is corrected by deducting the frequency of the pair
* in the overlap from their total co-occurrence frequency calculated using the above method.
*
*
* @see FrequencyCtxBased
* @see FrequencyCtxDocBasedFBMaster
* @see FrequencyCtxSentenceBasedFBMaster
* @see FrequencyCtxWindowBasedFBMaster
*/
public class CooccurrenceFBMaster extends AbstractFeatureBuilder {
    private static final Logger LOG = Logger.getLogger(CooccurrenceFBMaster.class.getName());

    /** Frequency-in-context of the target terms. */
    private final FrequencyCtxBased frequencyCtxBased;
    /** Frequency-in-context of the reference terms, i.e. the terms that co-occur with the target terms. */
    private final FrequencyCtxBased ref_frequencyCtxBased;
    /** Frequency information of the target terms. */
    private final FrequencyTermBased frequencyTermBased;
    /** Lower bound on {@code frequencyTermBased.getTTF(term)} for a target term to pass the prefilter. */
    private final int minTTF;
    /** Lower bound on the number of contexts a target term must appear in to pass the prefilter. */
    private final int minTCF;

    /**
     * Setting MAX_TASKS_PER_WORKER (or SEQUENTIAL_THRESHOLD) to a good-in-practice value is a trade-off.
     * The documentation for the ForkJoin framework suggests creating parallel subtasks until
     * the number of basic computation steps is somewhere over 100 and less than 10,000.
     *
     * <p>The exact number is not crucial provided you avoid extremes.
     *
     * <p>NOTE(review): this constant is not referenced anywhere in this class; the per-worker limit is
     * derived in {@code getMaxPerThread} from MIN_SEQUENTIAL_THRESHOLD / MAX_SEQUENTIAL_THRESHOLD, which
     * are not declared here (presumably inherited from AbstractFeatureBuilder -- confirm).
     *
     * @see <a href="http://homes.cs.washington.edu/~djg/teachingMaterials/grossmanSPAC_forkJoinFramework.html">A
     * beginner's introduction to the ForkJoin framework</a>
     * @see <a href="http://stackoverflow.com/questions/19925820/fork-join-collecting-results/19926423#19926423">Fork
     * join: collecting results</a>
     */
    private static final int MAX_TASKS_PER_WORKER = 10000;

    /**
     * Creates a builder of pair-wise co-occurrence statistics between target and reference terms.
     *
     * @param solrIndexSearcher     passed through to the super class
     * @param properties            passed through to the super class; also supplies the number of CPU cores to use
     * @param termFeature           frequency information of the target terms
     * @param minTTF                minimum {@code termFeature.getTTF(term)} a target term must reach to be kept;
     *                              must not be null (it is unboxed)
     * @param contextFeature        frequency-in-context of the target terms
     * @param ref_frequencyCtxBased frequency-in-context of the reference terms
     * @param minTCF                minimum number of contexts a target term must appear in to be kept; only
     *                              applies to target terms, not reference terms; must not be null (it is unboxed)
     */
    public CooccurrenceFBMaster(SolrIndexSearcher solrIndexSearcher, JATEProperties properties,
                                FrequencyTermBased termFeature,
                                Integer minTTF,
                                FrequencyCtxBased contextFeature,
                                FrequencyCtxBased ref_frequencyCtxBased,
                                Integer minTCF) {
        super(solrIndexSearcher, properties);
        this.frequencyCtxBased = contextFeature;
        this.frequencyTermBased = termFeature;
        this.ref_frequencyCtxBased = ref_frequencyCtxBased;
        this.minTTF = minTTF;
        this.minTCF = minTCF; //only applies to target terms, not reference terms
    }

    /**
     * Builds the co-occurrence feature: prefilters the target terms, runs the fork/join worker over the
     * sorted context windows of the target terms, then corrects double counting in overlapping contexts.
     *
     * @return the {@code Cooccurrence} feature that was handed to the worker and post-processed here
     * @throws JATEException if feature building fails
     */
    @Override
    public AbstractFeature build() throws JATEException {
        // Context windows where target candidate terms appear. It is possible that many reference terms
        // do not appear in these context windows, because reference terms are not an identical set to target terms.
        List<ContextWindow> contextWindows = new ArrayList<>(frequencyCtxBased.getMapCtx2TTF().keySet());
        Collections.sort(contextWindows);

        // A non-positive core count would make both the division in getMaxPerThread and the
        // ForkJoinPool constructor fail, so fall back to a single worker.
        int cores = Math.max(1, properties.getMaxCPUCores());
        int maxPerThread = getMaxPerThread(contextWindows, cores);
        LOG.info("Building features using cpu cores=" + cores
                + ", total ctx where target terms appear =" + contextWindows.size()
                + ", max per worker=" + maxPerThread);
        LOG.info("Filtering candidates with min.ttf=" + minTTF + " min.tcf=" + minTCF);

        Set<String> termsPassingPrefilter = selectTargetTerms(contextWindows);

        // The reference dimension is sized by all reference terms, even though many of them may never
        // appear in the target context windows.
        Cooccurrence feature = new Cooccurrence(termsPassingPrefilter.size(),
                ref_frequencyCtxBased.getMapTerm2Ctx().size());
        LOG.info("Beginning building features. Total terms=" + termsPassingPrefilter.size()
                + ", total contexts=" + contextWindows.size());

        CooccurrenceFBWorker worker = new CooccurrenceFBWorker(feature, contextWindows,
                frequencyTermBased, minTTF, frequencyCtxBased, ref_frequencyCtxBased,
                minTCF, maxPerThread);

        ForkJoinPool forkJoinPool = new ForkJoinPool(cores);
        int total;
        try {
            total = forkJoinPool.invoke(worker);
        } finally {
            // the pool is private to this call; release its worker threads whether or not the task failed
            forkJoinPool.shutdown();
        }

        correctOverlapDoubleCounting(feature);

        LOG.info("Complete building features, total contexts processed=" + total
                + "; total indexed candidate terms=" + feature.termCounter + ";"
                + " total indexed reference terms=" + feature.ctxTermCounter);
        return feature;
    }

    /**
     * Collects the target terms found in the given context windows that satisfy the minTTF / minTCF thresholds.
     * When both thresholds are 0 every term found in the contexts is kept.
     */
    private Set<String> selectTargetTerms(List<ContextWindow> contextWindows) {
        boolean keepAll = minTTF == 0 && minTCF == 0;
        Set<String> selected = new HashSet<>();
        for (ContextWindow ctx : contextWindows) {
            Map<String, Integer> termsInContext = frequencyCtxBased.getTFIC(ctx);
            if (keepAll) {
                selected.addAll(termsInContext.keySet());
                continue;
            }
            for (String term : termsInContext.keySet()) {
                if (frequencyTermBased.getTTF(term) >= minTTF
                        && frequencyCtxBased.getContexts(term).size() >= minTCF) {
                    selected.add(term);
                }
            }
        }
        return selected;
    }

    /**
     * Post-processes the feature to correct co-occurrences that were counted twice because the pair sits in
     * a zone shared by overlapping contexts. Target and reference terms use the same context objects, so
     * their overlap zones are looked up under the same keys.
     */
    private void correctOverlapDoubleCounting(Cooccurrence feature) {
        Map<String, ContextOverlap> overlaps = frequencyCtxBased.getCtxOverlapZones();
        if (overlaps.isEmpty()) {
            return;
        }
        LOG.info("Correcting double counted co-occurrences in context overlapping zones, total zones=" + overlaps.size());

        Map<String, ContextOverlap> refOverlaps = ref_frequencyCtxBased.getCtxOverlapZones();
        // both maps are reused across zones; tallyTerms clears them before refilling
        Map<String, Integer> freq = new HashMap<>();
        Map<String, Integer> refFreq = new HashMap<>();

        for (Map.Entry<String, ContextOverlap> zone : overlaps.entrySet()) {
            // NOTE(review): zones holding at most one distinct target term (or at most one distinct
            // reference term) are skipped. That loses nothing when the reference terms are the same set as
            // the target terms, because the only possible pair is a term with itself. When the two sets
            // differ, a lone reference term paired with a different target term in the same zone is left
            // uncorrected -- confirm this is intended.

            // unique target terms found in this overlap zone and their frequencies
            tallyTerms(zone.getValue(), freq);
            if (freq.size() <= 1) {
                continue;
            }
            // unique reference terms found in the corresponding overlap zone and their frequencies
            tallyTerms(refOverlaps.get(zone.getKey()), refFreq);
            if (refFreq.size() <= 1) {
                continue;
            }

            for (Map.Entry<String, Integer> target : freq.entrySet()) {
                int tid = feature.lookupTerm(target.getKey()); //index of target term
                if (tid == -1) {
                    // after terms in overlapping zones were tracked, the co-occurrence workers may have
                    // filtered some terms, so they do not exist in the index
                    continue;
                }
                int f = target.getValue();
                for (Map.Entry<String, Integer> ref : refFreq.entrySet()) {
                    if (target.getKey().equals(ref.getKey())) {
                        continue; // a term is not paired with itself
                    }
                    int refId = feature.lookupRefTerm(ref.getKey()); //index of reference term
                    if (refId == -1) {
                        continue;
                    }
                    // the pair's co-occurrence inside the zone is the smaller of the two frequencies
                    feature.deduce(tid, refId, Math.min(f, ref.getValue()));
                }
            }
        }
    }

    /**
     * Clears {@code counts} and refills it with the number of occurrences of each term in the overlap zone.
     * A null overlap leaves {@code counts} empty.
     */
    private static void tallyTerms(ContextOverlap overlap, Map<String, Integer> counts) {
        counts.clear();
        if (overlap == null) {
            return;
        }
        for (String term : overlap.getTerms()) {
            Integer seen = counts.get(term);
            counts.put(term, seen == null ? 1 : seen + 1);
        }
    }

    /**
     * Computes how many context windows a single worker may process sequentially: an even share per core,
     * clamped to the range [MIN_SEQUENTIAL_THRESHOLD, MAX_SEQUENTIAL_THRESHOLD].
     *
     * @param contextWindows the context windows to be distributed
     * @param cores          number of workers; must be positive
     */
    private int getMaxPerThread(List<ContextWindow> contextWindows, int cores) {
        int maxPerThread = contextWindows.size() / cores;
        if (maxPerThread < MIN_SEQUENTIAL_THRESHOLD) {
            maxPerThread = MIN_SEQUENTIAL_THRESHOLD;
        } else if (maxPerThread > MAX_SEQUENTIAL_THRESHOLD) {
            maxPerThread = MAX_SEQUENTIAL_THRESHOLD;
        }
        return maxPerThread;
    }
}