// ApplyPatternsMulti.java example
//
// Explorer
// CoreNLP-master
package edu.stanford.nlp.patterns.surface;

import java.util.*;
import java.util.Map.Entry;
import java.util.concurrent.Callable;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.tokensregex.MultiPatternMatcher;
import edu.stanford.nlp.ling.tokensregex.SequenceMatchResult;
import edu.stanford.nlp.ling.tokensregex.SequenceMatcher;
import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern;
import edu.stanford.nlp.patterns.*;
import edu.stanford.nlp.stats.TwoDimensionalCounter;
import edu.stanford.nlp.util.CollectionValuedMap;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.Triple;


/**
 * Applies a set of compiled {@link TokenSequencePattern}s to a batch of sentences in one pass
 * (via a {@link MultiPatternMatcher}) and collects the phrases captured by each pattern's
 * {@code $term} group.
 *
 * <p>{@link #call()} returns a pair of:
 * <ul>
 *   <li>a counter of {@code (phrase, phraseLemma)} pairs by the pattern that extracted them, and</li>
 *   <li>for each pattern, the {@code (sentence id, start, end)} spans of its raw {@code $term} matches.</li>
 * </ul>
 *
 * <p>Note that {@link #call()} also mutates the sentence tokens: every token inside a matched span
 * gets {@code PatternsAnnotations.MatchedPattern} set to {@code true} and the matching pattern added
 * to its {@code PatternsAnnotations.MatchedPatterns} set.
 *
 * @param <E> the pattern type associated with each compiled token-sequence pattern
 * @author Sonal Gupta
 */
public class ApplyPatternsMulti<E extends Pattern> implements Callable<Pair<TwoDimensionalCounter<Pair<String, String>, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>>> {

  private final String label;
  private final Map<TokenSequencePattern, E> patterns;
  private final List<String> sentids;
  private final boolean removeStopWordsFromSelectedPhrases;
  private final boolean removePhrasesWithStopWords;
  private final ConstantsAndVariables constVars;
  private final MultiPatternMatcher<CoreMap> multiPatternMatcher;
  private final Map<String, DataInstance> sents;

  /**
   * @param sents sentence id to sentence data; looked up for every id in {@code sentids}
   * @param sentids ids of the sentences this task should process
   * @param patterns compiled token-sequence patterns mapped to the pattern object they represent
   * @param label the label whose answer annotation and ignore-classes are consulted
   * @param removeStopWordsFromSelectedPhrases if true, stop-word tokens are dropped from the extracted phrase
   * @param removePhrasesWithStopWords if true, any phrase containing a stop-word token is discarded
   * @param cv shared configuration (answer classes, stop-word list, ignore classes, clubbing flag)
   */
  public ApplyPatternsMulti(Map<String, DataInstance> sents, List<String> sentids, Map<TokenSequencePattern, E> patterns, String label, boolean removeStopWordsFromSelectedPhrases, boolean removePhrasesWithStopWords, ConstantsAndVariables cv) {
    this.sents = sents;
    this.patterns = patterns;
    this.multiPatternMatcher = TokenSequencePattern.getMultiPatternMatcher(patterns.keySet());
    this.sentids = sentids;
    this.label = label;
    this.removeStopWordsFromSelectedPhrases = removeStopWordsFromSelectedPhrases;
    this.removePhrasesWithStopWords = removePhrasesWithStopWords;
    this.constVars = cv;
  }

  /**
   * Runs all patterns over all configured sentences.
   *
   * @return pair of (phrase/lemma-by-pattern counts, raw {@code $term} spans by pattern)
   * @throws Exception declared by {@link Callable}; nothing is thrown explicitly here
   */
  @Override
  public Pair<TwoDimensionalCounter<Pair<String, String>, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>> call() throws Exception {

    CollectionValuedMap<E, Triple<String, Integer, Integer>> matchedTokensByPat = new CollectionValuedMap<>();
    TwoDimensionalCounter<Pair<String, String>, E> allFreq = new TwoDimensionalCounter<>();

    for (String sentid : sentids) {
      List<CoreLabel> sent = sents.get(sentid).getTokens();

      //FIND_ALL is faster than FIND_NONOVERLAP
      Iterable<SequenceMatchResult<CoreMap>> matched = multiPatternMatcher.find(sent, SequenceMatcher.FindType.FIND_ALL);

      for (SequenceMatchResult<CoreMap> m : matched) {
        int start = m.start("$term");
        int end = m.end("$term");
        E matchedPat = patterns.get(m.pattern());

        // The recorded span is the raw $term span, i.e. before any clubbing below.
        matchedTokensByPat.add(matchedPat, new Triple<>(sentid, start, end));

        // Find if the neighboring words are labeled - if so - club them together.
        // Walking one token at a time also extends the span correctly when the labeled
        // run reaches the very beginning or end of the sentence.
        if (constVars.clubNeighboringLabeledWords) {
          while (start > 0 && isLabeled(sent.get(start - 1))) {
            start--;
          }
          while (end < sent.size() && isLabeled(sent.get(end))) {
            end++;
          }
        }

        StringBuilder phrase = new StringBuilder();
        StringBuilder phraseLemma = new StringBuilder();
        boolean useWordNotLabeled = false;
        boolean doNotUse = false;

        // kept[k] is true iff token (start + k) contributed to the phrase. Used to discard phrases
        // with stop words removed from the middle, while still allowing stop words to be trimmed
        // from the ends when removeStopWordsFromSelectedPhrases is true.
        boolean[] kept = new boolean[end - start];

        for (int i = start; i < end; i++) {
          CoreLabel l = sent.get(i);
          l.set(PatternsAnnotations.MatchedPattern.class, true);

          if (!l.containsKey(PatternsAnnotations.MatchedPatterns.class)) {
            l.set(PatternsAnnotations.MatchedPatterns.class, new HashSet<>());
          }
          l.get(PatternsAnnotations.MatchedPatterns.class).add(matchedPat);

          // A token carrying any configured (class, value) pair vetoes the whole phrase.
          for (Entry<Class, Object> ig : constVars.getIgnoreWordswithClassesDuringSelection().get(label).entrySet()) {
            if (l.containsKey(ig.getKey()) && l.get(ig.getKey()).equals(ig.getValue())) {
              doNotUse = true;
            }
          }

          boolean containsStop = containsStopWord(l, constVars.getCommonEngWords(), PatternFactory.ignoreWordRegex);
          if (removePhrasesWithStopWords && containsStop) {
            doNotUse = true;
          } else if (!containsStop || !removeStopWordsFromSelectedPhrases) {
            // The phrase is only worth counting if at least one kept token is not already labeled.
            if (!isLabeled(l)) {
              useWordNotLabeled = true;
            }
            phrase.append(' ').append(l.word());
            phraseLemma.append(' ').append(l.lemma());
            kept[i - start] = true;
          }
        }

        if (hasInteriorGap(kept)) {
          doNotUse = true;
        }

        if (!doNotUse && useWordNotLabeled) {
          allFreq.incrementCount(new Pair<>(phrase.toString().trim(), phraseLemma.toString().trim()), matchedPat, 1.0);
        }
      }
    }

    return new Pair<>(allFreq, matchedTokensByPat);
  }

  /**
   * Returns true iff the token's answer annotation for {@link #label} equals the label.
   * A null label or a missing annotation counts as "not labeled".
   */
  private boolean isLabeled(CoreLabel token) {
    return label != null && label.equals(token.get(constVars.getAnswerClass().get(label)));
  }

  /**
   * Returns true iff at least one dropped ({@code false}) position lies between two kept
   * ({@code true}) positions, regardless of how many consecutive positions were dropped.
   */
  private static boolean hasInteriorGap(boolean[] kept) {
    boolean seenKept = false;
    boolean gapOpen = false;
    for (boolean k : kept) {
      if (k) {
        if (gapOpen) {
          return true;
        }
        seenKept = true;
      } else if (seenKept) {
        gapOpen = true;
      }
    }
    return false;
  }

  /**
   * Returns true iff the token's lemma or word is in {@code commonEngWords}, or its lemma fully
   * matches {@code ignoreWordRegex} (when both the regex and the lemma are non-null).
   */
  private static boolean containsStopWord(CoreLabel l, Set<String> commonEngWords, java.util.regex.Pattern ignoreWordRegex) {
    String lemma = l.lemma();
    if (commonEngWords.contains(lemma) || commonEngWords.contains(l.word())) {
      return true;
    }
    // Guard the lemma: Pattern.matcher(null) throws a NullPointerException.
    return ignoreWordRegex != null && lemma != null && ignoreWordRegex.matcher(lemma).matches();
  }

}