package uk.ac.shef.dcs.jate.feature;

import org.apache.commons.lang.exception.ExceptionUtils;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.jate.MWEMetadata;
import org.apache.lucene.analysis.jate.SentenceContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.search.SolrIndexSearcher;
import uk.ac.shef.dcs.jate.JATEException;
import uk.ac.shef.dcs.jate.JATEProperties;
import uk.ac.shef.dcs.jate.JATERecursiveTaskWorker;
import uk.ac.shef.dcs.jate.util.SolrUtil;

import java.io.IOException;
import java.util.*;

/**
 * A fork/join worker that builds the sentence-based context frequency feature. For each document id
 * assigned to it, the worker reads the term vector of the JATE n-gram field, keeps only the terms
 * that are candidate terms, and counts every occurrence against the {@link ContextWindow} defined by
 * the document id and the sentence id carried in the token payload. The result is the number of
 * documents processed.
 */
public class FrequencyCtxSentenceBasedFBWorker extends JATERecursiveTaskWorker<Integer, Integer> {

    private static final long serialVersionUID = -9172128488678036098L;
    private static final Logger LOG = Logger.getLogger(FrequencyCtxSentenceBasedFBWorker.class.getName());

    private JATEProperties properties;
    private SolrIndexSearcher solrIndexSearcher;
    private Set<String> allCandidates;
    private FrequencyCtxBased feature;

    public FrequencyCtxSentenceBasedFBWorker(FrequencyCtxBased feature, JATEProperties properties,
                                             List<Integer> docIds,
                                             Set<String> allCandidates,
                                             SolrIndexSearcher solrIndexSearcher,
                                             int maxTasksPerWorker) {
        super(docIds, maxTasksPerWorker);
        this.properties = properties;
        this.solrIndexSearcher = solrIndexSearcher;
        this.allCandidates = allCandidates;
        this.feature = feature;
    }

    @Override
    protected JATERecursiveTaskWorker<Integer, Integer> createInstance(List<Integer> docIdSplit) {
        return new FrequencyCtxSentenceBasedFBWorker(feature, properties, docIdSplit,
                allCandidates, solrIndexSearcher, maxTasksPerThread);
    }

    @Override
    protected Integer mergeResult(List<JATERecursiveTaskWorker<Integer, Integer>> jateRecursiveTaskWorkers) {
        Integer total = 0;
        for (JATERecursiveTaskWorker<Integer, Integer> worker : jateRecursiveTaskWorkers) {
            total += worker.join();
        }
        return total;
    }

    @Override
    protected Integer computeSingleWorker(List<Integer> docIds) {
        LOG.info("Total docs to process=" + docIds.size());
        int count = 0;
        Set<Integer> sentenceIds = new HashSet<>();
        for (int docId : docIds) {
            count++;
            try {
                Terms lookupVector = SolrUtil.getTermVector(docId,
                        properties.getSolrFieldNameJATENGramInfo(), solrIndexSearcher);
                if (lookupVector == null) {
                    LOG.error("Term vector for document id=" + docId + " is null. The document may be empty.");
                    System.err.println("Term vector for document id=" + docId + " is null. The document may be empty.");
                    continue;
                }

                List<MWESentenceContext> terms = collectTermOffsets(lookupVector);
                for (MWESentenceContext term : terms) {
                    //each (docId, sentenceId) pair defines one context window
                    ContextWindow ctx = new ContextWindow();
                    ctx.setDocId(docId);
                    ctx.setSentenceId(term.sentenceId);
                    feature.increment(ctx, 1);
                    feature.increment(ctx, term.string, 1);
                    sentenceIds.add(term.sentenceId);
                }
            } catch (IOException | JATEException e) {
                StringBuilder sb = new StringBuilder("Unable to build feature for document id:");
                sb.append(docId).append("\n");
                sb.append(ExceptionUtils.getFullStackTrace(e));
                LOG.error(sb.toString());
            }
        }
        if (sentenceIds.size() == 1) {
            try {
                LOG.error("Among " + docIds.size() + " documents processed, only one distinct sentence id was found. " +
                        "If this is not expected, check the analyzer chain of your Solr field " +
                        properties.getSolrFieldNameJATENGramInfo() +
                        " (OpenNLPTokenizerFactory) to verify that SentenceContext is produced correctly.");
            } catch (JATEException e) {
                //the field name could not be read from the properties; nothing more to report
            }
        }
        return count;
    }

    private List<MWESentenceContext> collectTermOffsets(Terms termVectorLookup) throws IOException {
        List<MWESentenceContext> result = new ArrayList<>();

        TermsEnum tiRef = termVectorLookup.iterator();
        BytesRef luceneTerm = tiRef.next();
        while (luceneTerm != null) {
            if (luceneTerm.length == 0) {
                luceneTerm = tiRef.next();
                continue;
            }
            String tString = luceneTerm.utf8ToString();
            //skip terms that are not candidate terms
            if (!allCandidates.contains(tString)) {
                luceneTerm = tiRef.next();
                continue;
            }

            PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL);
            //PostingsEnum postingsEnum = ti.postings(null, PostingsEnum.OFFSETS);

            int doc = postingsEnum.nextDoc(); //this should be just 1 doc, i.e., the constraint for getting this TV
            if (doc != PostingsEnum.NO_MORE_DOCS) {
                int totalOccurrence = postingsEnum.freq();
                for (int i = 0; i < totalOccurrence; i++) {
                    postingsEnum.nextPosition();
                    int start = postingsEnum.startOffset();
                    int end = postingsEnum.endOffset();
                    BytesRef payload = postingsEnum.getPayload();
                    int sentenceId = -1;
                    if (payload != null) {
                        //the sentence id is carried in the token payload as serialized MWEMetadata
                        sentenceId = new SentenceContext(
                                MWEMetadata.deserialize(payload.utf8ToString())).getSentenceId();
                    }
                    result.add(new MWESentenceContext(tString, sentenceId, start, end));
                }
            }
            luceneTerm = tiRef.next();
        }
        Collections.sort(result);
        return result;
    }

    /**
     * A candidate term occurrence: its surface string, the id of the sentence it appears in, and its
     * character offsets. Occurrences are ordered by start offset, then end offset.
     */
    private class MWESentenceContext implements Comparable<MWESentenceContext> {
        public String string;
        public int sentenceId;
        public int start;
        public int end;

        public MWESentenceContext(String string, int sentenceId, int start, int end) {
            this.string = string;
            this.sentenceId = sentenceId;
            this.start = start;
            this.end = end;
        }

        @Override
        public int compareTo(MWESentenceContext o) {
            int compare = Integer.compare(start, o.start);
            if (compare == 0) {
                return Integer.compare(end, o.end);
            }
            return compare;
        }

        public String toString() {
            return sentenceId + "," + start + "," + end;
        }
    }
}
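/*
 * Usage sketch (not part of the original source, added for illustration): because
 * JATERecursiveTaskWorker is a fork/join task (mergeResult() calls join() on sub-workers), a feature
 * builder would typically hand the index's document ids to this worker through a ForkJoinPool. The
 * names below (props, searcher, docIds, candidates, cores) and the max-task size of 100 are
 * hypothetical assumptions; in JATE the FrequencyCtxBased feature and these inputs are normally
 * supplied by the corresponding feature builder rather than constructed ad hoc.
 *
 *     FrequencyCtxBased feature = new FrequencyCtxBased();
 *     FrequencyCtxSentenceBasedFBWorker worker = new FrequencyCtxSentenceBasedFBWorker(
 *             feature, props, docIds, candidates, searcher, 100);
 *     java.util.concurrent.ForkJoinPool pool = new java.util.concurrent.ForkJoinPool(cores);
 *     int docsProcessed = pool.invoke(worker); //returns the number of documents processed
 */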