FrequencyCtxWindowBasedFBWorker.java example

Explorer

jat-master
- jate-master
  - src
    - main
      - java
        org
        apache
        lucene
        analysis
        jate
        ComplexShingleFilter.java
        ComplexShingleFilterFactory.java
        EnglishLemmatisationFilter.java
        EnglishLemmatisationFilterFactory.java
        MWEFeatureFilter.java
        MWEFeatureFilterFactory.java
        MWEFilter.java
        MWEFilterFactory.java
        MWEMetadata.java
        MWEMetadataType.java
        OpenNLPMWEFilter.java
        OpenNLPNounPhraseFilter.java
        OpenNLPNounPhraseFilterFactory.java
        OpenNLPPOSTaggerFactory.java
        OpenNLPPOSTaggerFilter.java
        OpenNLPRegexChunker.java
        OpenNLPRegexChunkerFactory.java
        OpenNLPTokenizer.java
        OpenNLPTokenizerFactory.java
        Paragraph.java
        ParagraphChunker.java
        ParagraphChunkerRawText.java
        PunctuationRemover.java
        PunctuationRemoverFactory.java
        SentenceContext.java
        SentenceContextAware.java
        WordShapeTagger.java
        uk
        ac
        shef
        dcs
        jate
        JATEException.java
        JATEProperties.java
        JATERecursiveTaskWorker.java
        algorithm
        ATTF.java
        Algorithm.java
        CValue.java
        CValueWorker.java
        ChiSquare.java
        ChiSquareWorker.java
        GlossEx.java
        RAKE.java
        RAKEWorker.java
        RIDF.java
        ReferenceBased.java
        TFIDF.java
        TTF.java
        TermEx.java
        TermInfoCollector.java
        Weirdness.java
        app
        App.java
        AppATTF.java
        AppCValue.java
        AppChiSquare.java
        AppGlossEx.java
        AppParams.java
        AppRAKE.java
        AppRIDF.java
        AppTFIDF.java
        AppTTF.java
        AppTermEx.java
        AppWeirdness.java
        Indexing.java
        Voting.java
        eval
        ATEResultLoader.java
        GSLoader.java
        Scorer.java
        feature
        AbstractFeature.java
        AbstractFeatureBuilder.java
        ChiSquareFrequentTerms.java
        ChiSquareFrequentTermsFBMaster.java
        ChiSquareFrequentTermsFBWorker.java
        Containment.java
        ContainmentFBMaster.java
        ContainmentFBWorker.java
        ContextOverlap.java
        ContextWindow.java
        Cooccurrence.java
        CooccurrenceFBMaster.java
        CooccurrenceFBWorker.java
        FrequencyCtxBased.java
        FrequencyCtxBasedCopier.java
        FrequencyCtxDocBasedFBMaster.java
        FrequencyCtxDocBasedFBWorker.java
        FrequencyCtxSentenceBasedFBMaster.java
        FrequencyCtxSentenceBasedFBWorker.java
        FrequencyCtxWindowBasedFBMaster.java
        FrequencyCtxWindowBasedFBWorker.java
        FrequencyTermBased.java
        FrequencyTermBasedFBMaster.java
        FrequencyTermBasedFBWorker.java
        PositionFeature.java
        PositionFeatureMaster.java
        PositionFeatureWorker.java
        TTFReferenceFeatureFileBuilder.java
        TermComponentIndex.java
        TermComponentIndexFBMaster.java
        TermComponentIndexFBWorker.java
        WordShapeFBMaster.java
        WordShapeFBWorker.java
        WordShapeFeature.java
        indexing
        IndexingHandler.java
        io
        CSVFileOutputReader.java
        ContentExtractor.java
        DocumentCreator.java
        FileBasedOutputWriter.java
        FileOutputReader.java
        JSONFileOutputReader.java
        TikaMultiFieldDocumentCreator.java
        TikaSimpleDocumentCreator.java
        model
        JATEDocument.java
        JATETerm.java
        TermInfo.java
        nlp
        Chunker.java
        InstanceCreator.java
        Lemmatiser.java
        POSTagger.java
        SentenceSplitter.java
        opennlp
        ChunkerOpenNLP.java
        POSTaggerOpenNLP.java
        SentenceSplitterOpenNLP.java
        solr
        ATTFProcessor.java
        CValueProcessor.java
        ChiSquareProcessor.java
        CompositeTermRecognitionProcessor.java
        GlossExProcessor.java
        RAKEProcessor.java
        RIDFProcessor.java
        TFIDFProcessor.java
        TTFProcessor.java
        TermExProcessor.java
        TermRecognitionProcessor.java
        TermRecognitionProcessorFactory.java
        TermRecognitionRequestHandler.java
        WeirdnessProcessor.java
        util
        ACLRDCorpusParser.java
        GENIACorpusParser.java
        IOUtil.java
        JATEUtil.java
        RegressionFeatureGenerator.java
        ScienceIECorpusParser.java
        SolrUtil.java
    - test
      - java
        uk
        ac
        shef
        dcs
        jate
        app
        ACLRDTECTest.java
        AppATEACLRDTECTest.java
        AppATEGENIATest.java
        BaseEmbeddedSolrTest.java
        TestRAKE.java
        nlp
        opennlp
        SentenceSplitterOpenNLPTest.java
        util
        JATEUtilTest.java

package uk.ac.shef.dcs.jate.feature;

import org.apache.commons.lang.exception.ExceptionUtils;
import org.apache.lucene.analysis.jate.MWEMetadata;
import org.apache.lucene.analysis.jate.SentenceContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.search.SolrIndexSearcher;
import uk.ac.shef.dcs.jate.JATEException;
import uk.ac.shef.dcs.jate.JATEProperties;
import uk.ac.shef.dcs.jate.JATERecursiveTaskWorker;
import uk.ac.shef.dcs.jate.util.SolrUtil;

import java.io.IOException;
import java.util.*;
import org.apache.log4j.Logger;

/**
 * Worker that counts candidate term frequencies inside context windows for a subset of documents.
 *
 * <p>A containment relationship between a candidate term and a context window can be partial. I.e., as long as a
 * candidate term's start or end token is included in the context window, the candidate term is considered to be
 * 'contained' by the context window.
 *
 * <p>Frequencies in context can be calculated in two different modes.
 *
 * <p><b>Mode 1</b>: context windows are generated based on candidate terms. The way these are generated is described
 * in FrequencyCtxWindowBasedFBMaster.
 *
 * <p><b>Mode 2</b>: context windows are provided (possibly generated by another process already), and the goal is to
 * use these windows as-is and count candidate term frequencies in these windows. One scenario in which this mode is
 * useful is when you want to compare the co-occurrences of candidate terms (e.g., phrases) with 'reference' words
 * (e.g., adjectives), such as in the case of NC-value (Frantzi 2000). In this case you want to generate context
 * windows based on candidate terms, but are also interested in what reference words appear in which candidate term
 * contexts (and you do not want to generate context windows around reference words, as they will be different).
 *
 * <p>Reference: Katerina Frantzi, Sophia Ananiadou, Hideki Mima. 2000. Automatic recognition of multi-word terms:
 * the C-value/NC-value method. Natural Language Processing For Digital Libraries International Journal on Digital
 * Libraries August 2000, Volume 3, Issue 2, pp 115-130
 */
class FrequencyCtxWindowBasedFBWorker extends JATERecursiveTaskWorker<Integer, Integer> {
    private static final long serialVersionUID = -9172128488678036089L;
    private static final Logger LOG = Logger.getLogger(FrequencyCtxWindowBasedFBWorker.class.getName());
    private final JATEProperties properties;
    private final SolrIndexSearcher solrIndexSearcher;
    private final Set<String> allCandidates;
    private final FrequencyCtxBased feature;
    private final int window;
    //contexts in which we should count term frequencies, looked up by document id
    private final Map<Integer, List<ContextWindow>> contextLookup;

    /**
     * @param feature           the feature object that receives the frequency counts and context overlap zones
     * @param properties        used to obtain the name of the Solr field holding the n-gram information
     * @param docIds            ids of the documents to be processed by this worker
     * @param allCandidates     the candidate term strings to count; any other term in the term vector is ignored
     * @param solrIndexSearcher searcher used to read the term vector of each document
     * @param contextLookup     set of contexts in which we should count term frequencies. key: document id;
     *                          value: the context windows of that document. If the contexts
     *                          should be generated, use null or an empty map
     * @param window            number of tokens by which a generated context window extends to the left of a
     *                          term's first token and to the right of its last token
     * @param maxTasksPerWorker passed on to the superclass
     */
    public FrequencyCtxWindowBasedFBWorker(FrequencyCtxBased feature, JATEProperties properties,
                                           List<Integer> docIds,
                                           Set<String> allCandidates,
                                           SolrIndexSearcher solrIndexSearcher,
                                           Map<Integer, List<ContextWindow>> contextLookup,
                                           int window,
                                           int maxTasksPerWorker) {
        super(docIds, maxTasksPerWorker);
        this.properties = properties;
        this.solrIndexSearcher = solrIndexSearcher;
        this.allCandidates = allCandidates;
        this.feature = feature;
        this.window = window;
        this.contextLookup = contextLookup;
    }

    /**
     * Creates a worker for a sub-list of document ids that shares every other setting with this worker.
     */
    @Override
    protected JATERecursiveTaskWorker<Integer, Integer> createInstance(List<Integer> docIdSplit) {
        return new FrequencyCtxWindowBasedFBWorker(feature, properties, docIdSplit,
                allCandidates,
                solrIndexSearcher,
                contextLookup,
                window, maxTasksPerThread);
    }

    /**
     * Sums the results of the given workers.
     */
    @Override
    protected Integer mergeResult(List<JATERecursiveTaskWorker<Integer, Integer>> jateRecursiveTaskWorkers) {
        int total = 0;
        for (JATERecursiveTaskWorker<Integer, Integer> worker : jateRecursiveTaskWorkers) {
            total += worker.join();
        }
        return total;
    }

    /**
     * Processes the given documents: generates new context windows when no context lookup was supplied (mode 1),
     * otherwise counts within the supplied windows (mode 2).
     *
     * @return the number of documents visited
     */
    @Override
    protected Integer computeSingleWorker(List<Integer> docIds) {
        LOG.info("Total docs to process=" + docIds.size());
        if (contextLookup == null || contextLookup.isEmpty()) {
            return generateNewContexts(docIds);
        }
        return useExistingContexts(docIds);
    }

    /**
     * Use existing context windows to count term/word frequency within contexts.
     * Context overlap zones are generated for adjacent context windows of the same sentence that share tokens.
     *
     * @param docIds ids of the documents to process
     * @return the number of documents visited (including those that failed or had no context windows)
     */
    private int useExistingContexts(List<Integer> docIds) {
        int count = 0;
        for (int docId : docIds) {
            count++;
            try {
                //get the terms in this document, ordered by sentence id, then index position in sentence
                Terms lookupVector = SolrUtil.getTermVector(docId, properties.getSolrFieldNameJATENGramInfo(), solrIndexSearcher);
                List<MWEInSentence> terms = collectTermSentenceContext(
                        lookupVector, new HashMap<>());
                List<ContextWindow> contextsInDoc = contextLookup.get(docId);
                if (contextsInDoc == null || contextsInDoc.isEmpty())
                    continue;

                //the scan below relies on the windows following their natural order (expected to be
                //sentence id, then start tok index, then end tok index), matching the order of the terms
                Collections.sort(contextsInDoc);
                int cursor = 0;//index in the term list from which the scan for the next context starts
                ContextWindow prevCtx = null;
                //go thru each context window, compute term frequency within that window
                for (ContextWindow ctx : contextsInDoc) {
                    ContextOverlap co = null;
                    //does the current context overlap with the previous one in the same sentence?
                    if (prevCtx != null && prevCtx.getSentenceId() == ctx.getSentenceId()
                            && prevCtx.getLastTok() >= ctx.getFirstTok()) {
                        co = new ContextOverlap(prevCtx, ctx, new ArrayList<>());
                    }

                    int indexFirstIncludedTermByContext = -1;
                    for (int i = cursor; i < terms.size(); i++) {//starting from the term pointed to by the cursor
                        MWEInSentence t = terms.get(i);

                        if (ctx.getSentenceId() < t.sentenceId) {//term is in a later sentence than the context
                            //no more terms will be found in the current context, so move on to the next context.
                            //Only rewind the cursor when a term was actually included: assigning the -1 sentinel
                            //would make the scan for the next context start at index -1 and fail.
                            if (indexFirstIncludedTermByContext != -1)
                                cursor = indexFirstIncludedTermByContext;
                            break;
                        } else if (ctx.getSentenceId() > t.sentenceId) {//term is in an earlier sentence
                            //move on towards the first term that is in the same sentence as the context
                            continue;
                        }

                        //term is in the same sentence as the context; next, check is t within this context?
                        boolean outOfContext = false;
                        if (isPartiallyContained(t, ctx)) {
                            feature.increment(ctx, 1);
                            feature.increment(ctx, t.string, 1);
                            if (indexFirstIncludedTermByContext == -1)
                                indexFirstIncludedTermByContext = i;
                        } else if (t.lastTokenIndex < ctx.getFirstTok()) {
                            //term to the left of the context, keep searching the term list
                            continue;
                        } else {
                            //NOTE(review): a term that starts before the window and ends after it also lands
                            //here and terminates the scan for this context - confirm this is intended
                            outOfContext = true;
                            if (indexFirstIncludedTermByContext != -1)
                                cursor = indexFirstIncludedTermByContext;
                        }

                        //is t within the context overlap? (its first token, or its last token, lies in the
                        //shared zone; or it starts in the previous context and ends in the next one)
                        if (co != null) {
                            if ((co.getPrevContext().getLastTok() >= t.firstTokenIndex &&
                                    co.getNextContext().getFirstTok() <= t.firstTokenIndex) ||
                                    (co.getPrevContext().getLastTok() >= t.lastTokenIndex &&
                                            co.getNextContext().getFirstTok() <= t.lastTokenIndex) ||
                                    (co.getPrevContext().getFirstTok() <= t.firstTokenIndex && co.getPrevContext().getLastTok() >= t.firstTokenIndex &&
                                            co.getNextContext().getLastTok() >= t.lastTokenIndex && co.getNextContext().getFirstTok() <= t.lastTokenIndex)) {
                                co.getTerms().add(t.string);
                            }
                        }

                        if (outOfContext)
                            break;
                    }

                    prevCtx = ctx;

                    if (co != null && co.getTerms().size() > 0)
                        feature.addCtxOverlapZone(co);
                }
            } catch (IOException | JATEException e) {
                logFailure(docId, e);
            }
        }
        return count;
    }

    /**
     * Generates context windows around the candidate terms of each document and counts the terms that fall
     * into each window. A window spans {@code window} tokens to the left of a term's first token and to the right
     * of its last token, clipped to token 0 and to the largest last-token index seen in the sentence.
     * A term whose first token already lies in the current window does not start a new window.
     *
     * @param docIds ids of the documents to process
     * @return the number of documents visited (including those that failed)
     */
    private int generateNewContexts(List<Integer> docIds) {
        int count = 0;
        Set<Integer> firstTokenIndexes = new HashSet<>();
        for (int docId : docIds) {
            count++;
            try {
                //get all terms in the document
                Terms lookupVector = SolrUtil.getTermVector(docId, properties.getSolrFieldNameJATENGramInfo(), solrIndexSearcher);
                Map<Integer, Integer> sentenceBoundaries = new HashMap<>();
                //terms are now sorted by sentence id, then first tok index in sentence, then last tok index in sentence
                List<MWEInSentence> terms = collectTermSentenceContext(
                        lookupVector, sentenceBoundaries);

                int lastToken = -1;

                int currSentenceId = -1, currWindowStart = -1, currWindowEnd = -1;
                ContextWindow prevCtx = null;
                //indexes (into the term list) of terms that appear on the right half of the previous window
                List<Integer> prevWindowRight = new ArrayList<>();

                for (int i = 0; i < terms.size(); i++) {
                    MWEInSentence term = terms.get(i);
                    firstTokenIndexes.add(term.firstTokenIndex);

                    //if new sentence, reset window parameters
                    if (currSentenceId == -1 || term.sentenceId != currSentenceId) {
                        currSentenceId = term.sentenceId;
                        currWindowStart = -1;
                        currWindowEnd = -1;
                        //terms indexed without a sentence-context payload never register a boundary, so the
                        //lookup can come back empty; fall back to the term's own last token instead of
                        //failing on unboxing null
                        Integer boundary = sentenceBoundaries.get(currSentenceId);
                        lastToken = boundary != null ? boundary : term.lastTokenIndex;
                    }

                    if (term.firstTokenIndex >= currWindowStart && term.firstTokenIndex <= currWindowEnd)
                        continue;//the term is included in the current window, it should have been counted

                    //create window based on this term, and check its context
                    currWindowStart = Math.max(term.firstTokenIndex - window, 0);
                    currWindowEnd = Math.min(term.lastTokenIndex + window, lastToken);

                    ContextWindow ctx = new ContextWindow();
                    ctx.setDocId(docId);
                    ctx.setSentenceId(currSentenceId);
                    ctx.setFirstTok(currWindowStart);
                    ctx.setLastTok(currWindowEnd);

                    feature.increment(ctx, 1);
                    feature.increment(ctx, term.string, 1);

                    //previous j terms
                    List<String> termsInOverlap = new ArrayList<>();

                    List<Integer> currentWindowRight = new ArrayList<>();
                    for (int j = i - 1; j > -1; j--) {
                        MWEInSentence prevTerm = terms.get(j);
                        if (prevWindowRight.size() > 0) {
                            //if we have moved back past the first recorded term of the previous window's
                            //right half, stop. This is to ensure minimum overlap.
                            //NOTE(review): the sentence id of prevTerm is not checked on this path - confirm
                            if (j < prevWindowRight.get(0))
                                break;
                        } else if (prevTerm.lastTokenIndex < currWindowStart || prevTerm.sentenceId != ctx.getSentenceId())
                            break;

                        if (isPartiallyContained(prevTerm, ctx)) {
                            feature.increment(ctx, 1);
                            feature.increment(ctx, prevTerm.string, 1);
                            if (prevWindowRight.contains(j)) {
                                //if any term in the left half of current term's context window is also found in
                                //the previous term's right half of context window, they are in overlap
                                termsInOverlap.add(prevTerm.string);
                            }
                        }
                        //update terms that appear in the right half of current term's context. A term
                        //appearing to the left of the current term can span across the current term to
                        //finish on the right of the current term
                        if (prevTerm.sentenceId == term.sentenceId && prevTerm.lastTokenIndex > term.lastTokenIndex)
                            currentWindowRight.add(j);
                    }
                    if (prevCtx != null && prevCtx.getSentenceId() == ctx.getSentenceId() &&
                            termsInOverlap.size() > 0 && prevCtx.getLastTok() >= ctx.getFirstTok()) {
                        ContextOverlap co = new ContextOverlap(prevCtx, ctx, termsInOverlap);
                        feature.addCtxOverlapZone(co);
                    }

                    //following j terms
                    for (int j = i + 1; j < terms.size(); j++) {
                        //advance the outer loop so that it resumes at the term that ends this scan
                        i = j - 1;
                        MWEInSentence nextTerm = terms.get(j);
                        if (nextTerm.firstTokenIndex > currWindowEnd || nextTerm.sentenceId != ctx.getSentenceId())
                            break;
                        feature.increment(ctx, 1);
                        feature.increment(ctx, nextTerm.string, 1);
                        currentWindowRight.add(j);//update terms that appear in the right half of current term's context
                    }
                    prevWindowRight = currentWindowRight;

                    prevCtx = ctx;
                }
            } catch (IOException | JATEException e) {
                logFailure(docId, e);
            }
        }
        //very few distinct first-token indexes relative to the number of documents suggests that token
        //positions were not produced at indexing time. Guarded against an empty doc list (division by zero).
        if (!docIds.isEmpty() && firstTokenIndexes.size() / docIds.size() <= 1) {
            String fieldName;
            try {
                fieldName = properties.getSolrFieldNameJATENGramInfo();
            } catch (JATEException e) {
                //still emit the warning even when the field name cannot be resolved
                fieldName = "<unavailable: " + e.getMessage() + ">";
            }
            LOG.warn("Check your analyzer chain for your Solr field "
                    + fieldName + " if each token's position in a sentence has been produced.");
        }
        return count;
    }

    /**
     * Partial containment test: true when the term's first token or its last token lies inside the window
     * (both ends inclusive).
     */
    private static boolean isPartiallyContained(MWEInSentence t, ContextWindow ctx) {
        return (t.firstTokenIndex >= ctx.getFirstTok() && t.firstTokenIndex <= ctx.getLastTok())
                || (t.lastTokenIndex >= ctx.getFirstTok() && t.lastTokenIndex <= ctx.getLastTok());
    }

    /**
     * Logs a failure to process a document together with the full stack trace of its cause.
     */
    private static void logFailure(int docId, Exception cause) {
        StringBuilder sb = new StringBuilder("Unable to build feature for document id:");
        sb.append(docId).append("\n");
        sb.append(ExceptionUtils.getFullStackTrace(cause));
        LOG.error(sb.toString());
    }

    /**
     * Collects every occurrence of a candidate term from a term vector, sorted by sentence id, then first token
     * index, then last token index.
     *
     * @param termVectorLookup   the term vector to read; a null vector yields an empty list
     * @param sentenceBoundaries output parameter. For each sentence id it receives the largest last-token index
     *                           among the collected occurrences that carry a payload; occurrences without a
     *                           payload are recorded with sentence id 0 and token indexes 0 and do not update it
     * @return the sorted occurrences
     * @throws IOException if reading the term vector fails
     */
    private List<MWEInSentence> collectTermSentenceContext(Terms termVectorLookup,
                                                                Map<Integer, Integer> sentenceBoundaries) throws IOException {
        List<MWEInSentence> result = new ArrayList<>();
        if (termVectorLookup == null) {
            //defensive: nothing to collect when no term vector is available
            return result;
        }

        TermsEnum tiRef = termVectorLookup.iterator();
        for (BytesRef luceneTerm = tiRef.next(); luceneTerm != null; luceneTerm = tiRef.next()) {
            if (luceneTerm.length == 0)
                continue;
            String tString = luceneTerm.utf8ToString();
            if (!allCandidates.contains(tString))
                continue;

            PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL);

            int doc = postingsEnum.nextDoc(); //this should be just 1 doc, i.e., the constraint for getting this TV
            if (doc == PostingsEnum.NO_MORE_DOCS)
                continue;

            int totalOccurrence = postingsEnum.freq();
            for (int i = 0; i < totalOccurrence; i++) {
                postingsEnum.nextPosition();
                int start = postingsEnum.startOffset();
                int end = postingsEnum.endOffset();
                BytesRef payload = postingsEnum.getPayload();
                if (payload == null) {
                    result.add(new MWEInSentence(tString, start, end, 0, 0, 0));
                    continue;
                }

                SentenceContext sentenceContextInfo =
                        new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString()));
                result.add(new MWEInSentence(tString, start, end,
                        sentenceContextInfo.getFirstTokenIdx(),
                        sentenceContextInfo.getLastTokenIdx(),
                        sentenceContextInfo.getSentenceId()));

                Integer endBound = sentenceBoundaries.get(sentenceContextInfo.getSentenceId());
                if (endBound == null || endBound < sentenceContextInfo.getLastTokenIdx())
                    sentenceBoundaries.put(sentenceContextInfo.getSentenceId(),
                            sentenceContextInfo.getLastTokenIdx());
            }
        }
        Collections.sort(result);
        return result;
    }

    /**
     * One occurrence of a candidate term: its string, the start/end offsets reported by the postings, and its
     * sentence id and first/last token index within that sentence. Ordered by sentence id, then first token
     * index, then last token index.
     */
    private static final class MWEInSentence implements Comparable<MWEInSentence> {
        public final String string;
        public final int sentenceId;
        public final int firstTokenIndex;
        public final int lastTokenIndex;
        public final int start;
        public final int end;

        public MWEInSentence(String string, int start, int end,
                             int firstTokenIndex, int lastTokenIndex, int sentenceId) {
            this.string = string;
            this.sentenceId = sentenceId;
            this.start = start;
            this.end = end;
            this.firstTokenIndex = firstTokenIndex;
            this.lastTokenIndex = lastTokenIndex;
        }

        @Override
        public int compareTo(MWEInSentence o) {
            int compare = Integer.compare(sentenceId, o.sentenceId);
            if (compare == 0)
                compare = Integer.compare(firstTokenIndex, o.firstTokenIndex);
            if (compare == 0)
                compare = Integer.compare(lastTokenIndex, o.lastTokenIndex);
            return compare;
        }

        @Override
        public String toString() {
            return "st=" + sentenceId + ",f=" + firstTokenIndex + ",l=" + lastTokenIndex + ",so=" + start + ",se=" + end;
        }
    }
}