package uk.ac.shef.dcs.jate.feature;
import org.apache.commons.lang.exception.ExceptionUtils;
import org.apache.solr.search.SolrIndexSearcher;
import uk.ac.shef.dcs.jate.JATEException;
import uk.ac.shef.dcs.jate.JATEProperties;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.ForkJoinPool;
import org.apache.log4j.Logger;
/**
* <p>This class creates context windows of a given size around candidate terms or words, and counts
* the frequency of candidate terms/words appearing in the context windows. Such frequencies can later be
* used to create co-occurrence statistics. (This is the master class that triggers the multi-threaded workers
* that actually perform the computation.)</p>
*
* <p>Due to the nature of term/word order in a sentence, it is very likely that context windows
* can <b>overlap</b>, or in other words, a candidate can appear in multiple contexts and its frequency is double-counted.</p>
*
* <p>For example:
* <br/>George was born in the <u>city</u> of <u>Leeds</u> to a rich <u>banker's</u> <u>family</u>.
* Four candidate terms are found in the above sentence: city, Leeds, banker's, family. Given a window size of 6, i.e.,
* the left 5 plus right 5 tokens around a term, and given that we create windows for each term, we get four windows:</p>
*
* <ul>
* <li>start: George; end: banker's; contains: city, leeds, banker's</li>
* <li>start: was; end: family; contains: city, leeds, banker's, family</li>
* <li>start: of; end: family; contains: city, leeds, banker's, family</li>
* <li>start: of; end: family; contains: leeds, banker's, family</li>
* </ul>
*
* <p>"leeds" and "banker's" co-occur with each other in all four contexts, but in fact, their true co-occurrence count
* should be 1, because the four contexts have substantial overlap.</p>
*
* <p><b>This class instead generates 'lesser overlapping' context windows as follows:</b> generate a context window
* for the first candidate (order of appearance in a sentence) in the sentence. Then continue to generate a context
* window for the next candidate in the same sentence that is NOT already included in the previous context window. As an
* example, the following windows are generated for the above sentence:</p>
*
* <ul>
* <li>start: George; end: banker's; contains: city, leeds, banker's</li>
* <li>start: of; end: family; contains: leeds, banker's, family</li>
* </ul>
*
* <p>In the meantime, we also generate a ContextOverlap object that keeps details of the previous and the
* following contexts, and the list of terms (multi-set) appearing in the overlapping zone.</p>
*
* <p>This design ensures that:</p>
* <ul>
* <li>only two adjacent contexts can overlap with each other</li>
* <li>the terms appearing in an overlap, and their frequency in the overlapping zone, are tracked</li>
* </ul>
*
* <p>Thus, to calculate the co-occurrence of "leeds" and "banker's", we count the pair's frequency in each context
* they appear in, then deduct their frequency in the overlapping zones, if any.
*
* </p>
* @see ContextOverlap
* @see CooccurrenceFBMaster
* @see FrequencyCtxWindowBasedFBWorker
*/
public class FrequencyCtxWindowBasedFBMaster extends AbstractFeatureBuilder {

    private static final Logger LOG = Logger.getLogger(FrequencyCtxWindowBasedFBMaster.class.getName());

    /** Candidate selector: 0 means term; any other value (conventionally 1) means word. */
    private final int termOrWord;

    /** Context window size; forwarded unchanged to the worker. */
    private final int window;

    /**
     * Pre-existing context windows grouped by their document id, or {@code null} when no
     * existing windows were supplied to the constructor.
     */
    private final Map<Integer, List<ContextWindow>> contextLookup;

    /**
     * Creates the master feature builder.
     *
     * @param solrIndexSearcher      searcher over the index whose documents are processed
     * @param properties             JATE configuration; the maximum number of CPU cores is read from it
     * @param existingContextWindows if we want to use context windows already generated by another process
     *                               of FrequencyCtxWindowBasedFBMaster, pass them here. In that case, the
     *                               existing context windows are used as references, within which candidate
     *                               terms/words are searched. Otherwise context windows will be generated;
     *                               in this case pass {@code null} or an empty set (both are treated
     *                               identically: no lookup is handed to the worker)
     * @param window                 the context window size
     * @param termOrWord             0 to work on candidate terms; any other value to work on words
     */
    public FrequencyCtxWindowBasedFBMaster(SolrIndexSearcher solrIndexSearcher,
                                           JATEProperties properties,
                                           Set<ContextWindow> existingContextWindows,
                                           int window, int termOrWord) {
        super(solrIndexSearcher, properties);
        this.termOrWord = termOrWord;
        this.window = window;

        if (existingContextWindows == null || existingContextWindows.isEmpty()) {
            // The documented contract is "null or an empty set" => generate windows.
            // Normalise both to null so the worker sees a single "no existing windows" signal.
            this.contextLookup = null;
        } else {
            Map<Integer, List<ContextWindow>> lookup = new HashMap<>();
            for (ContextWindow ctx : existingContextWindows) {
                List<ContextWindow> container = lookup.get(ctx.getDocId());
                if (container == null) {
                    container = new ArrayList<>();
                    lookup.put(ctx.getDocId(), container);
                }
                container.add(ctx);
            }
            this.contextLookup = lookup;
        }
    }

    /**
     * Builds the context-window based frequency feature over every document id in
     * {@code [0, solrIndexSearcher.maxDoc())} by running a
     * {@link FrequencyCtxWindowBasedFBWorker} on a dedicated {@link ForkJoinPool}.
     *
     * @return the populated {@link FrequencyCtxBased} feature
     * @throws JATEException if an {@link IOException} occurs while collecting the candidates or
     *                       building the feature; the message contains the original stack trace
     */
    @Override
    public AbstractFeature build() throws JATEException {
        FrequencyCtxBased feature = new FrequencyCtxBased();

        // Read maxDoc once instead of on every loop iteration.
        int maxDoc = solrIndexSearcher.maxDoc();
        List<Integer> allDocs = new ArrayList<>(Math.max(0, maxDoc));
        for (int docId = 0; docId < maxDoc; docId++) {
            allDocs.add(docId);
        }

        try {
            Set<String> allCandidates = termOrWord == 0 ? getUniqueTerms() : getUniqueWords();

            // A zero or negative configured value falls back to a single core; ForkJoinPool
            // rejects a non-positive parallelism with IllegalArgumentException.
            int cores = Math.max(1, properties.getMaxCPUCores());
            int maxPerThread = getMaxPerThread(allDocs.size() / cores);

            FrequencyCtxWindowBasedFBWorker worker = new FrequencyCtxWindowBasedFBWorker(
                    feature, properties, allDocs, allCandidates,
                    solrIndexSearcher,
                    contextLookup,
                    window, maxPerThread);

            LOG.info("Building features using cpu cores=" + cores
                    + ", total docs=" + allDocs.size()
                    + ", max per worker=" + maxPerThread);

            ForkJoinPool forkJoinPool = new ForkJoinPool(cores);
            int total;
            try {
                total = forkJoinPool.invoke(worker);
            } finally {
                // Release the pool's worker threads whether or not the task succeeded.
                forkJoinPool.shutdown();
            }

            LOG.info("Complete building features. Total sentence ctx=" + feature.getMapCtx2TTF().size()
                    + ", from total processed docs=" + total);
        } catch (IOException ioe) {
            StringBuilder sb = new StringBuilder("Failed to build features!");
            sb.append("\n").append(ExceptionUtils.getFullStackTrace(ioe));
            LOG.error(sb.toString());
            throw new JATEException(sb.toString());
        }
        return feature;
    }

    /**
     * Clamps the per-worker document count: values below {@code MIN_SEQUENTIAL_THRESHOLD} are
     * raised to it, values above {@code MAX_SEQUENTIAL_THRESHOLD} are lowered to it.
     *
     * @param maxPerThread the unclamped number of documents per worker
     * @return the clamped number of documents per worker
     */
    private int getMaxPerThread(int maxPerThread) {
        if (maxPerThread < MIN_SEQUENTIAL_THRESHOLD) {
            return MIN_SEQUENTIAL_THRESHOLD;
        }
        if (maxPerThread > MAX_SEQUENTIAL_THRESHOLD) {
            return MAX_SEQUENTIAL_THRESHOLD;
        }
        return maxPerThread;
    }
}