package uk.ac.shef.dcs.jate.feature; import org.apache.commons.lang.exception.ExceptionUtils; import org.apache.lucene.index.*; import org.apache.lucene.util.BytesRef; import org.apache.solr.search.SolrIndexSearcher; import uk.ac.shef.dcs.jate.JATEProperties; import uk.ac.shef.dcs.jate.JATERecursiveTaskWorker; import java.io.IOException; import java.util.List; import org.apache.log4j.Logger; /** * */ public class FrequencyCtxDocBasedFBWorker extends JATERecursiveTaskWorker<String, Integer> { private static final long serialVersionUID = 8978235926472578074L; private static final Logger LOG = Logger.getLogger(FrequencyCtxDocBasedFBWorker.class.getName()); private JATEProperties properties; private SolrIndexSearcher solrIndexSearcher; private Terms ngramInfo; private FrequencyCtxBased feature; FrequencyCtxDocBasedFBWorker(FrequencyCtxBased feature, JATEProperties properties, List<String> luceneTerms, SolrIndexSearcher solrIndexSearcher, int maxTasksPerWorker, Terms ngramInfo) { super(luceneTerms, maxTasksPerWorker); this.feature=feature; this.properties = properties; this.solrIndexSearcher = solrIndexSearcher; this.ngramInfo =ngramInfo; } @Override protected JATERecursiveTaskWorker<String, Integer> createInstance(List<String> termSplits) { return new FrequencyCtxDocBasedFBWorker(feature, properties, termSplits, solrIndexSearcher, maxTasksPerThread, ngramInfo); } @Override protected Integer mergeResult(List<JATERecursiveTaskWorker<String, Integer>> jateRecursiveTaskWorkers) { Integer total=0; for (JATERecursiveTaskWorker<String, Integer> worker : jateRecursiveTaskWorkers) { total+= worker.join(); } return total; } @Override protected Integer computeSingleWorker(List<String> terms) { int total=0; TermsEnum ngramInfoIterator; try { ngramInfoIterator = ngramInfo.iterator(); for (String termStr : terms) { try { if (ngramInfoIterator.seekExact(new BytesRef(termStr.getBytes("UTF-8")))) { PostingsEnum docEnum = ngramInfoIterator.postings(null); int doc = 0; while ((doc = docEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) { int tfid = docEnum.freq(); //tf in document ContextWindow ctx = new ContextWindow(); ctx.setDocId(doc); feature.increment(ctx, tfid); feature.increment(ctx, termStr, tfid); } total++; }else { StringBuilder msg = new StringBuilder(termStr); msg.append(" is a candidate term, but not indexed in the n-gram information field. It's score may be mis-computed."); msg.append(" Reasons can be: different analysis chains for the two fields; cross-sentence-boundary MWEs"); LOG.warn(msg.toString()); } } catch (IOException ioe) { StringBuilder sb = new StringBuilder("Unable to build feature for candidate:"); sb.append(termStr).append("\n"); sb.append(ExceptionUtils.getFullStackTrace(ioe)); LOG.error(sb.toString()); } } } catch (IOException e) { StringBuilder sb = new StringBuilder("Unable to read ngram information field:"); sb.append(ExceptionUtils.getFullStackTrace(e)); LOG.error(sb.toString()); } return total; } }