package uk.ac.shef.dcs.jate.feature;
import org.apache.commons.lang.exception.ExceptionUtils;
import org.apache.lucene.index.*;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.search.SolrIndexSearcher;
import uk.ac.shef.dcs.jate.JATEProperties;
import uk.ac.shef.dcs.jate.JATERecursiveTaskWorker;
import java.io.IOException;
import java.util.List;
import org.apache.log4j.Logger;
/**
 * Fork-join worker that computes term-frequency features for a partition of candidate
 * terms. For each term it looks up the pre-indexed n-gram information field via a
 * {@link TermsEnum}, then records both the corpus-wide frequency and the per-document
 * frequency in the shared {@link FrequencyTermBased} feature object.
 * <p>
 * Work is split recursively by {@link JATERecursiveTaskWorker}; each worker returns
 * {@code int[]{successCount, totalCount}} which parents aggregate in
 * {@link #mergeResult(List)}.
 */
class FrequencyTermBasedFBWorker extends JATERecursiveTaskWorker<String, int[]> {

    private static final long serialVersionUID = -5304721004951728503L;
    private static final Logger LOG = Logger.getLogger(FrequencyTermBasedFBWorker.class.getName());

    // Immutable collaborators shared across the recursive split; the feature object is
    // the mutable sink all workers write into (assumed thread-safe by the framework).
    private final JATEProperties properties;
    private final SolrIndexSearcher solrIndexSearcher;
    private final FrequencyTermBased feature;
    private final Terms ngramInfo;

    /**
     * @param properties        global JATE configuration (carried for sub-workers)
     * @param luceneTerms       the candidate terms this worker (or its children) must process
     * @param solrIndexSearcher searcher over the target index (carried for sub-workers)
     * @param feature           shared accumulator for term/document frequencies
     * @param maxTasksPerWorker split threshold passed to the fork-join base class
     * @param ngramInfo         Lucene terms of the n-gram information field to look candidates up in
     */
    FrequencyTermBasedFBWorker(JATEProperties properties, List<String> luceneTerms, SolrIndexSearcher solrIndexSearcher,
                               FrequencyTermBased feature, int maxTasksPerWorker,
                               Terms ngramInfo) {
        super(luceneTerms, maxTasksPerWorker);
        this.properties = properties;
        this.feature = feature;
        this.solrIndexSearcher = solrIndexSearcher;
        this.ngramInfo = ngramInfo;
    }

    /**
     * Creates a child worker for a sub-partition of terms, sharing all collaborators.
     */
    @Override
    protected JATERecursiveTaskWorker<String, int[]> createInstance(List<String> termSplit) {
        return new FrequencyTermBasedFBWorker(properties, termSplit, solrIndexSearcher, feature, maxTasksPerThread,
                ngramInfo);
    }

    /**
     * Sums the {successes, total} pairs produced by completed child workers.
     *
     * @return {@code int[]{totalSuccess, total}} aggregated over all children
     */
    @Override
    protected int[] mergeResult(List<JATERecursiveTaskWorker<String, int[]>> jateRecursiveTaskWorkers) {
        int totalSuccess = 0, total = 0;
        for (JATERecursiveTaskWorker<String, int[]> worker : jateRecursiveTaskWorkers) {
            int[] rs = worker.join();
            totalSuccess += rs[0];
            total += rs[1];
        }
        return new int[]{totalSuccess, total};
    }

    /**
     * Processes this worker's share of candidate terms sequentially. A term not found in
     * the n-gram field is logged as a warning (its score may be mis-computed) but does
     * not abort the batch; per-term I/O failures are likewise logged and skipped.
     *
     * @return {@code int[]{successCount, terms.size()}}
     */
    @Override
    protected int[] computeSingleWorker(List<String> terms) {
        int totalSuccess = 0;
        TermsEnum ngramInfoIterator;
        try {
            ngramInfoIterator = ngramInfo.iterator();
            for (String term : terms) {
                try {
                    // BytesRef(CharSequence) encodes the term as UTF-8; avoids the
                    // checked-exception dance of String.getBytes("UTF-8").
                    if (ngramInfoIterator.seekExact(new BytesRef(term))) {
                        PostingsEnum docEnum = ngramInfoIterator.postings(null);
                        int doc;
                        while ((doc = docEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
                            //tf in document
                            int tfid = docEnum.freq();
                            feature.increment(term, tfid);
                            feature.incrementTermFrequencyInDocument(term, doc, tfid);
                        }
                        totalSuccess++;
                    } else {
                        String warning = String.format("'%s' is a candidate term, but not indexed in the n-gram " +
                                "information field. It's score may be mis-computed. You may have used different text " +
                                "analysis process (e.g., different tokenizers, different analysis order, limited " +
                                "n-gram range) for the text-2-candidate-term and text-2-ngram fields.) ", term);
                        LOG.warn(warning);
                    }
                } catch (IOException ioe) {
                    // %n is the portable line separator; the previous "\\n" printed a literal backslash-n.
                    String error = String.format("Unable to build feature for candidate: '%s'. %n Exception: %s",
                            term, ExceptionUtils.getFullStackTrace(ioe));
                    LOG.error(error);
                }
            }
        } catch (IOException ioe) {
            String error = String.format("Unable to read ngram information field:. %n Exception: %s",
                    ExceptionUtils.getFullStackTrace(ioe));
            LOG.error(error);
        }
        LOG.debug("progress : " + totalSuccess + "/" + terms.size());
        return new int[]{totalSuccess, terms.size()};
    }
}