package uk.ac.shef.dcs.jate.feature; import org.apache.commons.lang.exception.ExceptionUtils; import org.apache.lucene.analysis.jate.MWEMetadata; import org.apache.lucene.analysis.jate.SentenceContext; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.BytesRef; import org.apache.solr.search.SolrIndexSearcher; import uk.ac.shef.dcs.jate.JATEException; import uk.ac.shef.dcs.jate.JATEProperties; import uk.ac.shef.dcs.jate.JATERecursiveTaskWorker; import uk.ac.shef.dcs.jate.util.SolrUtil; import java.io.IOException; import java.util.*; import org.apache.log4j.Logger; /** * A containment relationship between a candidate term and a context window can be partial. I.e., as long as a candidate term's * start of end token is included in the context window the candidate term is considered to be 'contained' by the context window. * * Frequencies in context can be calculated in two different modes. * * </p><b>Mode 1</b>: context windows are generated based on candidate terms. The method these are generated is described * in FrequencyCtxWindowBasedFBMaster. * * </p><Mode 2</b>: context windows are provided (possibly generated by another process already), and the goal is to use * these windows as-is and count candidate term frequencies in these windows. One scenario that this mode is useful * is when you want to compare the co-occurrences of candidate terms (e.g., phrases) with 'reference' words (e.g., adjectives), * such as in the case of NC-value (Frantzi 2000). In this case you want to generate context windows based on candidate terms, * but are also interested in what reference words appear in which candidate term contexts (and you do not want to * generate context windows around reference words, as they will be different). * * * </p>Reference: Katerina Frantzi, Sophia Ananiadou, Hideki Mima. 2000. Automatic recognition of multi-word terms: * the C-value/NC-value method. Natural Language Processing For Digital Libraries International Journal on Digital * Libraries August 2000, Volume 3, Issue 2, pp 115-130 */ class FrequencyCtxWindowBasedFBWorker extends JATERecursiveTaskWorker<Integer, Integer> { private static final long serialVersionUID = -9172128488678036089L; private static final Logger LOG = Logger.getLogger(FrequencyCtxWindowBasedFBWorker.class.getName()); private JATEProperties properties; private SolrIndexSearcher solrIndexSearcher; private Set<String> allCandidates; private FrequencyCtxBased feature; private int window; private Map<Integer, List<ContextWindow>> contextLookup;//set of contexts in which we should count term frequencies /** * @param feature * @param properties * @param docIds * @param allCandidates * @param solrIndexSearcher * @param contextLookup set of contexts in which we should count term frequencies. key:docid+","+sentenceid; * value: MWEMetadata objects found in that doc and sentence pair. If the contexts * should be generated, used null or an empty map * @param window * @param maxTasksPerWorker */ public FrequencyCtxWindowBasedFBWorker(FrequencyCtxBased feature, JATEProperties properties, List<Integer> docIds, Set<String> allCandidates, SolrIndexSearcher solrIndexSearcher, Map<Integer, List<ContextWindow>> contextLookup, int window, int maxTasksPerWorker) { super(docIds, maxTasksPerWorker); this.properties = properties; this.solrIndexSearcher = solrIndexSearcher; this.allCandidates = allCandidates; this.feature = feature; this.window = window; this.contextLookup = contextLookup; } @Override protected JATERecursiveTaskWorker<Integer, Integer> createInstance(List<Integer> docIdSplit) { return new FrequencyCtxWindowBasedFBWorker(feature, properties, docIdSplit, allCandidates, solrIndexSearcher, contextLookup, window, maxTasksPerThread); } @Override protected Integer mergeResult(List<JATERecursiveTaskWorker<Integer, Integer>> jateRecursiveTaskWorkers) { Integer total = 0; for (JATERecursiveTaskWorker<Integer, Integer> worker : jateRecursiveTaskWorkers) { total += worker.join(); } return total; } @Override protected Integer computeSingleWorker(List<Integer> docIds) { LOG.info("Total docs to process=" + docIds.size()); if (contextLookup == null || contextLookup.size() == 0) return generateNewContexts(docIds); else { return useExistingContexts(docIds); } } /** * Use existing context windows to count term/word frequency within contexts. * MWEMetadata overlap zones are generated for adjacent context windows. * * @param docIds * @return */ private int useExistingContexts(List<Integer> docIds) { int count = 0; //Set<Integer> firstTokenIndexes = new HashSet<>(); for (int docId : docIds) { count++; try { //get the terms in this document, ordered by sentence id, then its index position in sentence Terms lookupVector = SolrUtil.getTermVector(docId, properties.getSolrFieldNameJATENGramInfo(), solrIndexSearcher); List<MWEInSentence> terms = collectTermSentenceContext( lookupVector, new HashMap<>()); List<ContextWindow> contexts_in_doc = contextLookup.get(docId); if (contexts_in_doc == null || contexts_in_doc.size() == 0) continue; //context windows are now should be sorted by sentence id, then start tok index, then end tok index Collections.sort(contexts_in_doc); //mwecontext also sorted by sentence id, then start tok index, then end tok index int cursor = 0;//cursor to point to the position in the list of terms that have been processed ContextWindow prevCtx = null; //go thru each context window, compute term frequency within that window for (ContextWindow ctx : contexts_in_doc) { ContextOverlap co = null; if (prevCtx != null && prevCtx.getSentenceId() == ctx.getSentenceId()) {//does current context overlap with previous //calculate context overlap if (prevCtx.getLastTok() >= ctx.getFirstTok()) { co = new ContextOverlap(prevCtx, ctx, new ArrayList<>()); } } /*if(ctx.getDocId()==399&&ctx.getSentenceId()==1&&ctx.getFirstTok()==13&&ctx.getLastTok()==23) System.out.println("stop");*/ int indexFirstIncludedTermByContext = -1; for (int i = cursor; i < terms.size(); i++) {//starting from the term pointed by the cursor //window to check MWEInSentence t = terms.get(i); if (ctx.getSentenceId() < t.sentenceId) {//term is in the next sentence to the context's containing sentence //no terms will be found in the current context, so move on to the next context cursor = indexFirstIncludedTermByContext; break; } else if (ctx.getSentenceId() > t.sentenceId) {//term is in the previous sentence to the context's sentence //should move on to the first term that is in the same sentence of the context continue; } //term is in the same context of the sentence; next, check is t within this context? boolean outOfContext = false; if ((t.firstTokenIndex >= ctx.getFirstTok() && t.firstTokenIndex <= ctx.getLastTok()) || t.lastTokenIndex >= ctx.getFirstTok() && t.lastTokenIndex <= ctx.getLastTok()) { //containment can be partial feature.increment(ctx, 1); feature.increment(ctx, t.string, 1); if (indexFirstIncludedTermByContext == -1) indexFirstIncludedTermByContext = i; } else if (t.lastTokenIndex < ctx.getFirstTok()) { //term to the left of the context, continue the term list to search //for term included in this context continue; } else { outOfContext = true; if (indexFirstIncludedTermByContext != -1) cursor = indexFirstIncludedTermByContext; } //is t within a context overlap? if (co != null) { if ((co.getPrevContext().getLastTok() >= t.firstTokenIndex && co.getNextContext().getFirstTok() <= t.firstTokenIndex) || (co.getPrevContext().getLastTok() >= t.lastTokenIndex && co.getNextContext().getFirstTok() <= t.lastTokenIndex) || (co.getPrevContext().getFirstTok() <= t.firstTokenIndex && co.getPrevContext().getLastTok() >= t.firstTokenIndex && co.getNextContext().getLastTok() >= t.lastTokenIndex && co.getNextContext().getFirstTok() <= t.lastTokenIndex)) { co.getTerms().add(t.string); } } if (outOfContext) break; } prevCtx = ctx; if (co != null && co.getTerms().size() > 0) feature.addCtxOverlapZone(co); } } catch (IOException | JATEException ioe) { StringBuilder sb = new StringBuilder("Unable to build feature for document id:"); sb.append(docId).append("\n"); sb.append(ExceptionUtils.getFullStackTrace(ioe)); LOG.error(sb.toString()); } } //LOG.info("debug---finished"); return count; } private int generateNewContexts(List<Integer> docIds) { int count = 0; Set<Integer> firstTokenIndexes = new HashSet<>(); for (int docId : docIds) { count++; try { //get all terms in the document Terms lookupVector = SolrUtil.getTermVector(docId, properties.getSolrFieldNameJATENGramInfo(), solrIndexSearcher); Map<Integer, Integer> sentenceBoundaries = new HashMap<>(); //terms are now sorted by sentence id, then first tok index in sentence, then last tok index in sentence List<MWEInSentence> terms = collectTermSentenceContext( lookupVector, sentenceBoundaries); int lastToken = -1; int currSentenceId = -1, currWindowStart = -1, currWindowEnd = -1; ContextWindow prevCtx = null; List<Integer> prevWindowRight = new ArrayList<>(); //to keep indexes of terms that appear on the right half // of the window context for (int i = 0; i < terms.size(); i++) { MWEInSentence term = terms.get(i); firstTokenIndexes.add(term.firstTokenIndex); //init for a sentence if (currSentenceId == -1 || (currSentenceId != -1 && term.sentenceId != currSentenceId)) {//if new sentence, reset window parameters currSentenceId = term.sentenceId; currWindowStart = -1; currWindowEnd = -1; lastToken = sentenceBoundaries.get(currSentenceId); } if (term.firstTokenIndex >= currWindowStart && term.firstTokenIndex <= currWindowEnd) continue;//the term is included in the current window, it should have been counted //create window based on this term, and check its context currWindowStart = term.firstTokenIndex - window; if (currWindowStart < 0) currWindowStart = 0; currWindowEnd = term.lastTokenIndex + window; if (currWindowEnd >= lastToken) currWindowEnd = lastToken; /*if (currWindowStart > currWindowEnd) System.out.println();*/ ContextWindow ctx = new ContextWindow(); ctx.setDocId(docId); ctx.setSentenceId(currSentenceId); ctx.setFirstTok(currWindowStart); ctx.setLastTok(currWindowEnd); /*if (docId == 399 && currSentenceId == 1 && currWindowStart == 5 && currWindowEnd == 17) System.out.println("stop"); if (docId == 399 && currSentenceId == 1 && currWindowStart == 13 && currWindowEnd == 23) System.out.println("stop");*/ feature.increment(ctx, 1); feature.increment(ctx, term.string, 1); //previous j tokens List<String> termsInOverlap = new ArrayList<>(); List<Integer> currentWindowRight=new ArrayList<>(); for (int j = i - 1; j > -1; j--) { MWEInSentence prevTerm = terms.get(j); if (prevWindowRight.size() > 0) { //if we have moved back passing the the leftmost term in the //// previous window's right half, stop. This is to ensure minimum overlap if (j < prevWindowRight.get(0)) break; } else if (prevTerm.lastTokenIndex < currWindowStart || prevTerm.sentenceId != ctx.getSentenceId()) break; if ((prevTerm.firstTokenIndex >= ctx.getFirstTok() && prevTerm.firstTokenIndex <= ctx.getLastTok()) || (prevTerm.lastTokenIndex >= ctx.getFirstTok() && prevTerm.lastTokenIndex <= ctx.getLastTok())) { feature.increment(ctx, 1); feature.increment(ctx, prevTerm.string, 1); if (prevWindowRight.contains(j)) { //if any term in the left half of current term's context window is also found in //the previous term's right half of context window, they are in overlap termsInOverlap.add(prevTerm.string); } } if(prevTerm.sentenceId==term.sentenceId&&prevTerm.lastTokenIndex>term.lastTokenIndex) //update terms that appear in the right half of //current term's context. A term appearing to the left of the current term can span //across the current term to finish on the right of the current term currentWindowRight.add(j); } if (prevCtx != null && prevCtx.getSentenceId()==ctx.getSentenceId()&& termsInOverlap.size() > 0 && prevCtx.getLastTok() >= ctx.getFirstTok()) { ContextOverlap co = new ContextOverlap(prevCtx, ctx, termsInOverlap); feature.addCtxOverlapZone(co); } //following j tokens for (int j = i + 1; j < terms.size(); j++) { i = j - 1; MWEInSentence nextTerm = terms.get(j); if (nextTerm.firstTokenIndex > currWindowEnd || nextTerm.sentenceId != ctx.getSentenceId()) break; feature.increment(ctx, 1); feature.increment(ctx, nextTerm.string, 1); currentWindowRight.add(j);////update terms that appear in the right half of current term's context } prevWindowRight=currentWindowRight; prevCtx = ctx; } } catch (IOException ioe) { StringBuilder sb = new StringBuilder("Unable to build feature for document id:"); sb.append(docId).append("\n"); sb.append(ExceptionUtils.getFullStackTrace(ioe)); LOG.error(sb.toString()); } catch (JATEException je) { StringBuilder sb = new StringBuilder("Unable to build feature for document id:"); sb.append(docId).append("\n"); sb.append(ExceptionUtils.getFullStackTrace(je)); LOG.error(sb.toString()); } } if (firstTokenIndexes.size() / docIds.size() <= 1) try { LOG.warn("Check your analyzer chain for your Solr field " + properties.getSolrFieldNameJATENGramInfo() + " if each token's position in a sentence has been produced."); } catch (JATEException e) { } //LOG.info("debug---finished"); return count; } private List<MWEInSentence> collectTermSentenceContext(Terms termVectorLookup, Map<Integer, Integer> sentenceBoundaries) throws IOException { List<MWEInSentence> result = new ArrayList<>(); TermsEnum tiRef = termVectorLookup.iterator(); BytesRef luceneTerm = tiRef.next(); while (luceneTerm != null) { if (luceneTerm.length == 0) { luceneTerm = tiRef.next(); continue; } String tString = luceneTerm.utf8ToString(); if (!allCandidates.contains(tString)) { luceneTerm = tiRef.next(); continue; } PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL); //PostingsEnum postingsEnum = ti.postings(null, PostingsEnum.OFFSETS); int doc = postingsEnum.nextDoc(); //this should be just 1 doc, i.e., the constraint for getting this TV if (doc != PostingsEnum.NO_MORE_DOCS) { int totalOccurrence = postingsEnum.freq(); for (int i = 0; i < totalOccurrence; i++) { postingsEnum.nextPosition(); int start = postingsEnum.startOffset(); int end = postingsEnum.endOffset(); BytesRef payload = postingsEnum.getPayload(); SentenceContext sentenceContextInfo = null; if (payload != null) { sentenceContextInfo = new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString())); } if (sentenceContextInfo == null) result.add(new MWEInSentence(tString, start, end, 0, 0, 0)); else { result.add(new MWEInSentence(tString, start, end, sentenceContextInfo.getFirstTokenIdx(), sentenceContextInfo.getLastTokenIdx(), sentenceContextInfo.getSentenceId())); Integer endBound = sentenceBoundaries.get(sentenceContextInfo.getSentenceId()); if (endBound == null || endBound < sentenceContextInfo.getLastTokenIdx()) sentenceBoundaries.put(sentenceContextInfo.getSentenceId(), sentenceContextInfo.getLastTokenIdx()); } } } luceneTerm = tiRef.next(); } Collections.sort(result); return result; } private class MWEInSentence implements Comparable<MWEInSentence> { public String string; public int sentenceId; public int firstTokenIndex; public int lastTokenIndex; public int start; public int end; public MWEInSentence(String string, int start, int end, int firstTokenIndex, int lastTokenIndex, int sentenceId) { this.string = string; this.sentenceId = sentenceId; this.start = start; this.end = end; this.firstTokenIndex = firstTokenIndex; this.lastTokenIndex = lastTokenIndex; } @Override public int compareTo(MWEInSentence o) { int compare = Integer.valueOf(sentenceId).compareTo(o.sentenceId); if(compare==0) compare=Integer.valueOf(firstTokenIndex).compareTo(o.firstTokenIndex); if (compare == 0) { return Integer.valueOf(lastTokenIndex).compareTo(o.lastTokenIndex); } return compare; } public String toString() { return "st=" + sentenceId + ",f=" + firstTokenIndex + ",l=" + lastTokenIndex + ",so=" + start + ",se=" + end; } } }