package uk.ac.shef.dcs.jate.feature; import org.apache.commons.lang.exception.ExceptionUtils; import org.apache.log4j.Logger; import org.apache.lucene.analysis.jate.MWEMetadata; import org.apache.lucene.analysis.jate.MWEMetadataType; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.BytesRef; import org.apache.solr.search.SolrIndexSearcher; import uk.ac.shef.dcs.jate.JATEException; import uk.ac.shef.dcs.jate.JATEProperties; import uk.ac.shef.dcs.jate.JATERecursiveTaskWorker; import uk.ac.shef.dcs.jate.util.SolrUtil; import java.io.IOException; import java.util.HashSet; import java.util.List; import java.util.Set; /** * */ public class PositionFeatureWorker extends JATERecursiveTaskWorker<Integer, int[]> { private static final long serialVersionUID = -5304728799852736303L; private static final Logger LOG = Logger.getLogger(PositionFeatureWorker.class.getName()); private JATEProperties properties; private SolrIndexSearcher solrIndexSearcher; private PositionFeature feature; private Set<String> allCandidates; PositionFeatureWorker(JATEProperties properties, List<Integer> docIds, Set<String> allCandidates, SolrIndexSearcher solrIndexSearcher, PositionFeature feature, int maxTasksPerWorker ) { super(docIds, maxTasksPerWorker); this.properties = properties; this.feature = feature; this.solrIndexSearcher = solrIndexSearcher; this.allCandidates = allCandidates; } @Override protected JATERecursiveTaskWorker<Integer, int[]> createInstance(List<Integer> docIds) { return new PositionFeatureWorker(properties, docIds, allCandidates, solrIndexSearcher, feature, maxTasksPerThread ); } @Override protected int[] mergeResult(List<JATERecursiveTaskWorker<Integer, int[]>> jateRecursiveTaskWorkers) { int totalSuccess = 0, total = 0; for (JATERecursiveTaskWorker<Integer, int[]> worker : jateRecursiveTaskWorkers) { int[] rs = worker.join(); totalSuccess += rs[0]; total += rs[1]; } return new int[]{totalSuccess, total}; } @Override protected int[] computeSingleWorker(List<Integer> docIds) { LOG.info("Total docs to process=" + docIds.size()); int count = 0; for (int docId : docIds) { try { Terms lookupVector = SolrUtil.getTermVector(docId, properties.getSolrFieldNameJATENGramInfo(), solrIndexSearcher); TermsEnum ngramEnum = lookupVector.iterator(); BytesRef luceneTerm = ngramEnum.next(); while (luceneTerm != null) { if (luceneTerm.length == 0) { luceneTerm = ngramEnum.next(); continue; } String tString = luceneTerm.utf8ToString(); if (!allCandidates.contains(tString)) { luceneTerm = ngramEnum.next(); continue; } PostingsEnum postingsEnum = ngramEnum.postings(null, PostingsEnum.ALL); if (postingsEnum.nextDoc() != PostingsEnum.NO_MORE_DOCS) { //tf in document int totalOccurrence = postingsEnum.freq(); for (int i = 0; i < totalOccurrence; i++) { int pos = postingsEnum.nextPosition(); BytesRef payload = postingsEnum.getPayload(); MWEMetadata metadata = MWEMetadata.deserialize(payload.utf8ToString()); populateFeature(metadata, tString, feature); /*if (totalOccurrence > 1 && tString.equals("1,25-dihydroxy vitamin")) { System.out.println("pos=" + pos + ", " + postingsEnum.startOffset() + "-" + postingsEnum.endOffset() + ", " + metadata.getMetaData(MWEMetadataType.SOURCE_SENTENCE_ID_IN_DOC)); }*/ } //postingsEnum.nextDoc(); } luceneTerm=ngramEnum.next(); } count++; } catch (IOException ioe) { StringBuilder sb = new StringBuilder("Unable to build feature for document id:"); sb.append(docId).append("\n"); sb.append(ExceptionUtils.getFullStackTrace(ioe)); LOG.error(sb.toString()); } catch (JATEException je) { StringBuilder sb = new StringBuilder("Unable to build feature for document id:"); sb.append(docId).append("\n"); sb.append(ExceptionUtils.getFullStackTrace(je)); LOG.error(sb.toString()); } } LOG.debug("progress : " + count + "/" + docIds.size()); return new int[]{count, docIds.size()}; } private void populateFeature(MWEMetadata metadata, String term, PositionFeature feature) { int sourceParIdInDoc = Integer.valueOf(metadata.getMetaData(MWEMetadataType.SOURCE_PARAGRAPH_ID_IN_DOC)); int sourceSentIdInDoc = Integer.valueOf(metadata.getMetaData(MWEMetadataType.SOURCE_SENTENCE_ID_IN_DOC)); int sourceSentIdInPar = Integer.valueOf(metadata.getMetaData(MWEMetadataType.SOURCE_SENTENCE_ID_IN_PARAGRAPH)); int totalParsInDoc = Integer.valueOf(metadata.getMetaData(MWEMetadataType.PARAGRAPHS_IN_DOC)); int totalSentsInDoc = Integer.valueOf(metadata.getMetaData(MWEMetadataType.SENTENCES_IN_DOC)); int totalSentsInPar = Integer.valueOf(metadata.getMetaData(MWEMetadataType.SENTENCES_IN_PARAGRAPH)); if (sourceParIdInDoc == 0) feature.incrementFoundInDocTitles(term); double parDistFromTitle = calculateDistance(sourceParIdInDoc, totalParsInDoc); feature.addParDistFromTitle(term, parDistFromTitle); double sentDistFromTitle = calculateDistance(sourceSentIdInDoc, totalSentsInDoc); feature.addSentDistFromTitle(term, sentDistFromTitle); double sentDistFromPar = calculateDistance(sourceSentIdInPar, totalSentsInPar); feature.addSentDistFromPar(term, sentDistFromPar); } private double calculateDistance(int index, int total) { return index / (double) total; } //this method can be implemented to check if the term contains any elements in the gazetteer. protected void applyGazetteer(Set<String> gazetteer, String term) { //must update with feature.mweHasIndicative... } }