package uk.ac.shef.dcs.jate.feature;
import org.apache.commons.lang.exception.ExceptionUtils;
import org.apache.lucene.analysis.jate.MWEMetadata;
import org.apache.lucene.analysis.jate.SentenceContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.search.SolrIndexSearcher;
import uk.ac.shef.dcs.jate.JATEException;
import uk.ac.shef.dcs.jate.JATEProperties;
import uk.ac.shef.dcs.jate.JATERecursiveTaskWorker;
import uk.ac.shef.dcs.jate.util.SolrUtil;
import java.io.IOException;
import java.util.*;
import org.apache.log4j.Logger;
/**
* A containment relationship between a candidate term and a context window can be partial. I.e., as long as a candidate term's
* start or end token is included in the context window, the candidate term is considered to be 'contained' by the context window.
*
* Frequencies in context can be calculated in two different modes.
*
* </p><b>Mode 1</b>: context windows are generated based on candidate terms. The method by which these are generated is described
* in FrequencyCtxWindowBasedFBMaster.
*
* </p><b>Mode 2</b>: context windows are provided (possibly generated by another process already), and the goal is to use
* these windows as-is and count candidate term frequencies in these windows. One scenario that this mode is useful
* is when you want to compare the co-occurrences of candidate terms (e.g., phrases) with 'reference' words (e.g., adjectives),
* such as in the case of NC-value (Frantzi 2000). In this case you want to generate context windows based on candidate terms,
* but are also interested in what reference words appear in which candidate term contexts (and you do not want to
* generate context windows around reference words, as they will be different).
*
*
* </p>Reference: Katerina Frantzi, Sophia Ananiadou, Hideki Mima. 2000. Automatic recognition of multi-word terms:
* the C-value/NC-value method. Natural Language Processing For Digital Libraries International Journal on Digital
* Libraries August 2000, Volume 3, Issue 2, pp 115-130
*/
class FrequencyCtxWindowBasedFBWorker extends JATERecursiveTaskWorker<Integer, Integer> {

    private static final long serialVersionUID = -9172128488678036089L;
    private static final Logger LOG = Logger.getLogger(FrequencyCtxWindowBasedFBWorker.class.getName());

    private final JATEProperties properties;
    private final SolrIndexSearcher solrIndexSearcher;
    private final Set<String> allCandidates;
    private final FrequencyCtxBased feature;
    private final int window;
    //pre-computed context windows, keyed by doc id, in which term frequencies should be counted;
    //null or empty means windows are generated on the fly from the candidate terms themselves
    private final Map<Integer, List<ContextWindow>> contextLookup;

    /**
     * @param feature           the feature object this worker populates with frequency counts
     * @param properties        JATE properties; provides the Solr n-gram info field name
     * @param docIds            the Solr document ids this worker is responsible for
     * @param allCandidates     the set of candidate term strings whose frequencies are counted
     * @param solrIndexSearcher searcher used to fetch per-document term vectors
     * @param contextLookup     pre-existing context windows in which to count term frequencies,
     *                          keyed by doc id; value: the ContextWindow objects found in that doc.
     *                          If the contexts should be generated instead, use null or an empty map
     * @param window            window size (number of tokens on each side of a term) used when
     *                          generating new contexts
     * @param maxTasksPerWorker max number of documents a single worker processes before splitting
     */
    public FrequencyCtxWindowBasedFBWorker(FrequencyCtxBased feature, JATEProperties properties,
                                           List<Integer> docIds,
                                           Set<String> allCandidates,
                                           SolrIndexSearcher solrIndexSearcher,
                                           Map<Integer, List<ContextWindow>> contextLookup,
                                           int window,
                                           int maxTasksPerWorker) {
        super(docIds, maxTasksPerWorker);
        this.properties = properties;
        this.solrIndexSearcher = solrIndexSearcher;
        this.allCandidates = allCandidates;
        this.feature = feature;
        this.window = window;
        this.contextLookup = contextLookup;
    }

    @Override
    protected JATERecursiveTaskWorker<Integer, Integer> createInstance(List<Integer> docIdSplit) {
        return new FrequencyCtxWindowBasedFBWorker(feature, properties, docIdSplit,
                allCandidates,
                solrIndexSearcher,
                contextLookup,
                window, maxTasksPerThread);
    }

    @Override
    protected Integer mergeResult(List<JATERecursiveTaskWorker<Integer, Integer>> jateRecursiveTaskWorkers) {
        //sum of documents processed by all sub-workers
        Integer total = 0;
        for (JATERecursiveTaskWorker<Integer, Integer> worker : jateRecursiveTaskWorkers) {
            total += worker.join();
        }
        return total;
    }

    @Override
    protected Integer computeSingleWorker(List<Integer> docIds) {
        LOG.info("Total docs to process=" + docIds.size());
        //Mode 2 (existing contexts) when a non-empty lookup was supplied; otherwise Mode 1
        if (contextLookup == null || contextLookup.isEmpty())
            return generateNewContexts(docIds);
        else {
            return useExistingContexts(docIds);
        }
    }

    /**
     * Use existing context windows to count term/word frequency within contexts.
     * ContextOverlap zones are generated for adjacent (overlapping) context windows
     * in the same sentence.
     *
     * @param docIds documents to process
     * @return the number of documents processed (including documents that failed and were logged)
     */
    private int useExistingContexts(List<Integer> docIds) {
        int count = 0;
        for (int docId : docIds) {
            count++;
            try {
                //get the terms in this document, ordered by sentence id, then index position in sentence
                Terms lookupVector = SolrUtil.getTermVector(docId, properties.getSolrFieldNameJATENGramInfo(), solrIndexSearcher);
                List<MWEInSentence> terms = collectTermSentenceContext(
                        lookupVector, new HashMap<>());
                List<ContextWindow> contexts_in_doc = contextLookup.get(docId);
                if (contexts_in_doc == null || contexts_in_doc.size() == 0)
                    continue;
                //context windows are now sorted by sentence id, then start tok index, then end tok index
                Collections.sort(contexts_in_doc);
                //terms are sorted the same way (see collectTermSentenceContext)
                int cursor = 0;//points to the position in 'terms' up to which terms have been processed
                ContextWindow prevCtx = null;
                //go thru each context window, compute term frequency within that window
                for (ContextWindow ctx : contexts_in_doc) {
                    ContextOverlap co = null;
                    //does the current context overlap with the previous one (same sentence, ranges touch)?
                    if (prevCtx != null && prevCtx.getSentenceId() == ctx.getSentenceId()) {
                        if (prevCtx.getLastTok() >= ctx.getFirstTok()) {
                            co = new ContextOverlap(prevCtx, ctx, new ArrayList<>());
                        }
                    }
                    int indexFirstIncludedTermByContext = -1;
                    for (int i = cursor; i < terms.size(); i++) {//starting from the term pointed by the cursor
                        MWEInSentence t = terms.get(i);
                        if (ctx.getSentenceId() < t.sentenceId) {
                            //term is in a later sentence: no further terms can match this context.
                            //Rewind the cursor to the first term this context included so an
                            //overlapping next context can re-visit those terms.
                            //FIX: rewind only when such a term exists; the original code assigned
                            //-1 unconditionally here, making terms.get(-1) throw
                            //IndexOutOfBoundsException on the next context window.
                            if (indexFirstIncludedTermByContext != -1)
                                cursor = indexFirstIncludedTermByContext;
                            break;
                        } else if (ctx.getSentenceId() > t.sentenceId) {
                            //term is in an earlier sentence than the context's; skip forward to the
                            //first term that is in the same sentence as the context
                            continue;
                        }
                        //term is in the same sentence as the context; next, check: is t within this context?
                        boolean outOfContext = false;
                        if ((t.firstTokenIndex >= ctx.getFirstTok() && t.firstTokenIndex <= ctx.getLastTok()) ||
                                t.lastTokenIndex >= ctx.getFirstTok() && t.lastTokenIndex <= ctx.getLastTok()) {
                            //containment can be partial: either end of the term inside the window counts
                            feature.increment(ctx, 1);
                            feature.increment(ctx, t.string, 1);
                            if (indexFirstIncludedTermByContext == -1)
                                indexFirstIncludedTermByContext = i;
                        } else if (t.lastTokenIndex < ctx.getFirstTok()) {
                            //term lies entirely to the left of the context; continue down the term
                            //list to search for terms included in this context
                            continue;
                        } else {
                            //term lies entirely to the right of the context; no later term can match
                            outOfContext = true;
                            if (indexFirstIncludedTermByContext != -1)
                                cursor = indexFirstIncludedTermByContext;
                        }
                        //is t within the overlap zone of the previous and current contexts?
                        if (co != null) {
                            if ((co.getPrevContext().getLastTok() >= t.firstTokenIndex &&
                                    co.getNextContext().getFirstTok() <= t.firstTokenIndex) ||
                                    (co.getPrevContext().getLastTok() >= t.lastTokenIndex &&
                                            co.getNextContext().getFirstTok() <= t.lastTokenIndex) ||
                                    (co.getPrevContext().getFirstTok() <= t.firstTokenIndex && co.getPrevContext().getLastTok() >= t.firstTokenIndex &&
                                            co.getNextContext().getLastTok() >= t.lastTokenIndex && co.getNextContext().getFirstTok() <= t.lastTokenIndex)) {
                                co.getTerms().add(t.string);
                            }
                        }
                        if (outOfContext)
                            break;
                    }
                    prevCtx = ctx;
                    if (co != null && co.getTerms().size() > 0)
                        feature.addCtxOverlapZone(co);
                }
            } catch (IOException | JATEException ioe) {
                //best-effort per document: log and carry on with the remaining documents
                StringBuilder sb = new StringBuilder("Unable to build feature for document id:");
                sb.append(docId).append("\n");
                sb.append(ExceptionUtils.getFullStackTrace(ioe));
                LOG.error(sb.toString());
            }
        }
        return count;
    }

    /**
     * Generate a context window around each candidate term occurrence ('window' tokens to each
     * side, clipped to the sentence boundary) and count term frequencies within each window.
     * ContextOverlap zones are recorded when consecutive windows in the same sentence overlap
     * and share at least one term.
     *
     * @param docIds documents to process
     * @return the number of documents processed (including documents that failed and were logged)
     */
    private int generateNewContexts(List<Integer> docIds) {
        int count = 0;
        Set<Integer> firstTokenIndexes = new HashSet<>();
        for (int docId : docIds) {
            count++;
            try {
                //get all terms in the document
                Terms lookupVector = SolrUtil.getTermVector(docId, properties.getSolrFieldNameJATENGramInfo(), solrIndexSearcher);
                Map<Integer, Integer> sentenceBoundaries = new HashMap<>();
                //terms are now sorted by sentence id, then first tok index in sentence, then last tok index in sentence
                List<MWEInSentence> terms = collectTermSentenceContext(
                        lookupVector, sentenceBoundaries);
                int lastToken = -1;
                int currSentenceId = -1, currWindowStart = -1, currWindowEnd = -1;
                ContextWindow prevCtx = null;
                List<Integer> prevWindowRight = new ArrayList<>(); //indexes (into 'terms') of terms that appear
                //on the right half of the previous window context
                for (int i = 0; i < terms.size(); i++) {
                    MWEInSentence term = terms.get(i);
                    firstTokenIndexes.add(term.firstTokenIndex);
                    //if new sentence, reset window parameters
                    if (currSentenceId == -1 || (currSentenceId != -1 && term.sentenceId != currSentenceId)) {
                        currSentenceId = term.sentenceId;
                        currWindowStart = -1;
                        currWindowEnd = -1;
                        lastToken = sentenceBoundaries.get(currSentenceId);
                    }
                    if (term.firstTokenIndex >= currWindowStart && term.firstTokenIndex <= currWindowEnd)
                        continue;//the term is included in the current window; it has already been counted
                    //create a window around this term, clipped to [0, lastToken]
                    currWindowStart = term.firstTokenIndex - window;
                    if (currWindowStart < 0)
                        currWindowStart = 0;
                    currWindowEnd = term.lastTokenIndex + window;
                    if (currWindowEnd >= lastToken)
                        currWindowEnd = lastToken;
                    ContextWindow ctx = new ContextWindow();
                    ctx.setDocId(docId);
                    ctx.setSentenceId(currSentenceId);
                    ctx.setFirstTok(currWindowStart);
                    ctx.setLastTok(currWindowEnd);
                    feature.increment(ctx, 1);
                    feature.increment(ctx, term.string, 1);
                    //scan backwards over preceding terms that may fall into the left half of this window
                    List<String> termsInOverlap = new ArrayList<>();
                    List<Integer> currentWindowRight = new ArrayList<>();
                    for (int j = i - 1; j > -1; j--) {
                        MWEInSentence prevTerm = terms.get(j);
                        if (prevWindowRight.size() > 0) {
                            //if we have moved back past the leftmost term in the previous window's
                            //right half, stop. This is to ensure minimum overlap
                            if (j < prevWindowRight.get(0))
                                break;
                        } else if (prevTerm.lastTokenIndex < currWindowStart || prevTerm.sentenceId != ctx.getSentenceId())
                            break;
                        if ((prevTerm.firstTokenIndex >= ctx.getFirstTok() && prevTerm.firstTokenIndex <= ctx.getLastTok()) ||
                                (prevTerm.lastTokenIndex >= ctx.getFirstTok() && prevTerm.lastTokenIndex <= ctx.getLastTok())) {
                            feature.increment(ctx, 1);
                            feature.increment(ctx, prevTerm.string, 1);
                            if (prevWindowRight.contains(j)) {
                                //a term in the left half of the current window that was also in the
                                //previous window's right half lies in the overlap zone
                                termsInOverlap.add(prevTerm.string);
                            }
                        }
                        //update terms that appear in the right half of the current term's context:
                        //a term starting to the left of the current term can span across it and
                        //finish on its right
                        if (prevTerm.sentenceId == term.sentenceId && prevTerm.lastTokenIndex > term.lastTokenIndex)
                            currentWindowRight.add(j);
                    }
                    if (prevCtx != null && prevCtx.getSentenceId() == ctx.getSentenceId() &&
                            termsInOverlap.size() > 0 && prevCtx.getLastTok() >= ctx.getFirstTok()) {
                        ContextOverlap co = new ContextOverlap(prevCtx, ctx, termsInOverlap);
                        feature.addCtxOverlapZone(co);
                    }
                    //scan forwards over following terms contained in this window; note that the
                    //outer loop index i is advanced so those terms are not re-windowed
                    for (int j = i + 1; j < terms.size(); j++) {
                        i = j - 1;
                        MWEInSentence nextTerm = terms.get(j);
                        if (nextTerm.firstTokenIndex > currWindowEnd || nextTerm.sentenceId != ctx.getSentenceId())
                            break;
                        feature.increment(ctx, 1);
                        feature.increment(ctx, nextTerm.string, 1);
                        currentWindowRight.add(j);//update terms that appear in the right half of the current window
                    }
                    prevWindowRight = currentWindowRight;
                    prevCtx = ctx;
                }
            } catch (IOException | JATEException e) {
                //best-effort per document: log and carry on with the remaining documents
                StringBuilder sb = new StringBuilder("Unable to build feature for document id:");
                sb.append(docId).append("\n");
                sb.append(ExceptionUtils.getFullStackTrace(e));
                LOG.error(sb.toString());
            }
        }
        //sanity check: if on average at most one distinct first-token index was seen per document,
        //the analyzer chain probably did not emit token positions.
        //FIX: guard against division by zero when docIds is empty, and do not silently swallow
        //a JATEException thrown by getSolrFieldNameJATENGramInfo()
        if (!docIds.isEmpty() && firstTokenIndexes.size() / docIds.size() <= 1) {
            try {
                LOG.warn("Check your analyzer chain for your Solr field "
                        + properties.getSolrFieldNameJATENGramInfo() + " if each token's position in a sentence has been produced.");
            } catch (JATEException e) {
                LOG.error(ExceptionUtils.getFullStackTrace(e));
            }
        }
        return count;
    }

    /**
     * Walk the document's term vector and collect, for each candidate term, one MWEInSentence
     * record per occurrence, carrying its sentence id and token indexes (taken from the
     * MWEMetadata payload; zeros when no payload is present).
     *
     * @param termVectorLookup   the term vector of a single document
     * @param sentenceBoundaries out-parameter: per sentence id, the largest last-token index seen
     * @return occurrences sorted by sentence id, then first token index, then last token index
     * @throws IOException on term vector access failure
     */
    private List<MWEInSentence> collectTermSentenceContext(Terms termVectorLookup,
                                                           Map<Integer, Integer> sentenceBoundaries) throws IOException {
        List<MWEInSentence> result = new ArrayList<>();
        TermsEnum tiRef = termVectorLookup.iterator();
        BytesRef luceneTerm = tiRef.next();
        while (luceneTerm != null) {
            if (luceneTerm.length == 0) {
                luceneTerm = tiRef.next();
                continue;
            }
            String tString = luceneTerm.utf8ToString();
            //only candidate terms are of interest
            if (!allCandidates.contains(tString)) {
                luceneTerm = tiRef.next();
                continue;
            }
            PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL);
            //this should be just 1 doc, i.e., the constraint for getting this term vector
            int doc = postingsEnum.nextDoc();
            if (doc != PostingsEnum.NO_MORE_DOCS) {
                int totalOccurrence = postingsEnum.freq();
                for (int i = 0; i < totalOccurrence; i++) {
                    postingsEnum.nextPosition();
                    int start = postingsEnum.startOffset();
                    int end = postingsEnum.endOffset();
                    BytesRef payload = postingsEnum.getPayload();
                    SentenceContext sentenceContextInfo = null;
                    if (payload != null) {
                        sentenceContextInfo = new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString()));
                    }
                    if (sentenceContextInfo == null)
                        //no sentence metadata available: default sentence id and token indexes to 0
                        result.add(new MWEInSentence(tString, start, end, 0, 0, 0));
                    else {
                        result.add(new MWEInSentence(tString, start, end,
                                sentenceContextInfo.getFirstTokenIdx(),
                                sentenceContextInfo.getLastTokenIdx(),
                                sentenceContextInfo.getSentenceId()));
                        //track the rightmost token index per sentence (used as the sentence boundary)
                        Integer endBound = sentenceBoundaries.get(sentenceContextInfo.getSentenceId());
                        if (endBound == null || endBound < sentenceContextInfo.getLastTokenIdx())
                            sentenceBoundaries.put(sentenceContextInfo.getSentenceId(),
                                    sentenceContextInfo.getLastTokenIdx());
                    }
                }
            }
            luceneTerm = tiRef.next();
        }
        Collections.sort(result);
        return result;
    }

    /**
     * One occurrence of a candidate term (possibly a multi-word expression) within a sentence,
     * with its sentence id, token-index span within the sentence, and character offsets.
     * Static: holds no reference to the enclosing worker. Note: compareTo is not consistent
     * with equals (equals/hashCode are inherited from Object); instances are only used for sorting.
     */
    private static class MWEInSentence implements Comparable<MWEInSentence> {
        public String string;           //the term string
        public int sentenceId;          //id of the containing sentence
        public int firstTokenIndex;     //index of the term's first token within the sentence
        public int lastTokenIndex;      //index of the term's last token within the sentence
        public int start;               //character start offset
        public int end;                 //character end offset

        public MWEInSentence(String string, int start, int end,
                             int firstTokenIndex, int lastTokenIndex, int sentenceId) {
            this.string = string;
            this.sentenceId = sentenceId;
            this.start = start;
            this.end = end;
            this.firstTokenIndex = firstTokenIndex;
            this.lastTokenIndex = lastTokenIndex;
        }

        /**
         * Orders by sentence id, then first token index, then last token index.
         */
        @Override
        public int compareTo(MWEInSentence o) {
            int compare = Integer.compare(sentenceId, o.sentenceId);
            if (compare == 0)
                compare = Integer.compare(firstTokenIndex, o.firstTokenIndex);
            if (compare == 0) {
                return Integer.compare(lastTokenIndex, o.lastTokenIndex);
            }
            return compare;
        }

        public String toString() {
            return "st=" + sentenceId + ",f=" + firstTokenIndex + ",l=" + lastTokenIndex + ",so=" + start + ",se=" + end;
        }
    }
}