package edu.stanford.nlp.patterns;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.*;
import java.util.Map.Entry;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import javax.json.Json;
import javax.json.JsonArray;
import javax.json.JsonArrayBuilder;
import javax.json.JsonObjectBuilder;
import javax.json.JsonReader;
import javax.json.JsonValue;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.tokensregex.Env;
import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern;
import edu.stanford.nlp.patterns.dep.ApplyDepPatterns;
import edu.stanford.nlp.patterns.surface.*;
import edu.stanford.nlp.patterns.GetPatternsFromDataMultiClass.WordScoring;
import edu.stanford.nlp.patterns.PhraseScorer.Normalization;
import edu.stanford.nlp.semgraph.semgrex.SemgrexPattern;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.stats.TwoDimensionalCounter;
import edu.stanford.nlp.util.*;
import edu.stanford.nlp.util.ArgumentParser.Option;
import edu.stanford.nlp.util.logging.Redwood;

/**
 * Applies learned patterns to the data, scores the candidate phrases they
 * extract, and selects the top-scoring phrases for a label.
 */
public class ScorePhrases<E extends Pattern> {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(ScorePhrases.class);

  Map<String, Boolean> writtenInJustification = new HashMap<>();

  ConstantsAndVariables constVars = null;

  @Option(name = "phraseScorerClass")
  Class<? extends PhraseScorer> phraseScorerClass = ScorePhrasesAverageFeatures.class;

  PhraseScorer phraseScorer = null;

  public ScorePhrases(Properties props, ConstantsAndVariables cv) {
    ArgumentParser.fillOptions(this, props);
    this.constVars = cv;
    try {
      // Instantiate the configured scorer reflectively; the class must expose
      // a (ConstantsAndVariables) constructor.
      phraseScorer = phraseScorerClass.getConstructor(ConstantsAndVariables.class).newInstance(constVars);
    } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException e) {
      throw new RuntimeException(e);
    }
    ArgumentParser.fillOptions(phraseScorer, props);
  }
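  /*
   * Usage sketch (illustrative only; nothing below is called from this class):
   * the scoring strategy is pluggable through the "phraseScorerClass" property
   * read by the @Option annotation above, so any PhraseScorer subclass with a
   * (ConstantsAndVariables) constructor can replace the default
   * ScorePhrasesAverageFeatures.
   *
   *   Properties props = new Properties();
   *   props.setProperty("phraseScorerClass", ScorePhrasesAverageFeatures.class.getName());
   *   ScorePhrases<SurfacePattern> scorer = new ScorePhrases<>(props, constVars);
   */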
  /**
   * Picks up to constVars.numWordsToAdd phrases from newdt in decreasing score
   * order, stopping at the first score below thresholdWordExtract and skipping
   * phrases with too few non-redundant patterns or a fuzzy match in ignoreWords.
   */
  public Counter<CandidatePhrase> chooseTopWords(Counter<CandidatePhrase> newdt,
      TwoDimensionalCounter<CandidatePhrase, E> terms,
      Counter<CandidatePhrase> useThresholdNumPatternsForTheseWords,
      Set<CandidatePhrase> ignoreWords, double thresholdWordExtract) {
    Iterator<CandidatePhrase> termIter = Counters.toPriorityQueue(newdt).iterator();
    Counter<CandidatePhrase> finalwords = new ClassicCounter<>();
    while (termIter.hasNext()) {
      if (finalwords.size() >= constVars.numWordsToAdd) {
        break;
      }
      CandidatePhrase w = termIter.next();
      if (newdt.getCount(w) < thresholdWordExtract) {
        Redwood.log(ConstantsAndVariables.extremedebug, "not adding word " + w
            + " and any later words because the score " + newdt.getCount(w)
            + " is less than the threshold of " + thresholdWordExtract);
        break;
      }
      assert (newdt.getCount(w) != Double.POSITIVE_INFINITY);
      if (useThresholdNumPatternsForTheseWords.containsKey(w)
          && numNonRedundantPatterns(terms, w) < constVars.thresholdNumPatternsApplied) {
        Redwood.log("extremePatDebug", "Not adding " + w
            + " because the number of non-redundant patterns is below the threshold of "
            + constVars.thresholdNumPatternsApplied + ":" + terms.getCounter(w).keySet());
        continue;
      }
      CandidatePhrase matchedFuzzy = null;
      if (constVars.minLen4FuzzyForPattern > 0 && ignoreWords != null)
        matchedFuzzy = ConstantsAndVariables.containsFuzzy(ignoreWords, w, constVars.minLen4FuzzyForPattern);
      if (matchedFuzzy == null) {
        Redwood.log("extremePatDebug", "adding word " + w);
        finalwords.setCount(w, newdt.getCount(w));
      } else {
        Redwood.log("extremePatDebug", "not adding " + w + " because it matched "
            + matchedFuzzy + ", a common English word");
        ignoreWords.add(w);
      }
    }
    // Log the next few candidates that just missed the cut, for debugging.
    StringBuilder nextTen = new StringBuilder();
    int n = 0;
    while (termIter.hasNext()) {
      n++;
      if (n > 10)
        break;
      CandidatePhrase w = termIter.next();
      nextTen.append(";\t").append(w).append(":").append(newdt.getCount(w));
    }
    Redwood.log(Redwood.DBG, "Next ten phrases were " + nextTen);
    return finalwords;
  }
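  /*
   * Worked example for chooseTopWords (hypothetical numbers): with
   * numWordsToAdd = 2 and thresholdWordExtract = 0.1, the descending scores
   * {Berlin: 0.9, Paris: 0.6, the: 0.05} yield {Berlin: 0.9, Paris: 0.6};
   * iteration would in any case stop at "the" because its score falls below
   * the threshold.
   */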
  public static <E, F> void removeKeys(TwoDimensionalCounter<E, F> counter,
      Collection<E> removeKeysCollection) {
    for (E key : removeKeysCollection)
      counter.remove(key);
  }

  /**
   * Counts the patterns extracting w whose string forms are not substrings of
   * one another (i.e., mutually non-redundant patterns).
   */
  private double numNonRedundantPatterns(TwoDimensionalCounter<CandidatePhrase, E> terms, CandidatePhrase w) {
    Object[] pats = terms.getCounter(w).keySet().toArray();
    int numPat = 0;
    for (int i = 0; i < pats.length; i++) {
      String pati = pats[i].toString();
      boolean contains = false;
      for (int j = i + 1; j < pats.length; j++) {
        String patj = pats[j].toString();
        if (patj.contains(pati) || pati.contains(patj)) {
          contains = true;
          break;
        }
      }
      if (!contains)
        numPat++;
    }
    return numPat;
  }

  public Counter<CandidatePhrase> learnNewPhrases(String label,
      PatternsForEachToken patternsForEachToken,
      Counter<E> patternsLearnedThisIter, Counter<E> allSelectedPatterns,
      CollectionValuedMap<E, Triple<String, Integer, Integer>> tokensMatchedPatterns,
      Counter<CandidatePhrase> scoreForAllWordsThisIteration,
      TwoDimensionalCounter<CandidatePhrase, E> terms,
      TwoDimensionalCounter<CandidatePhrase, E> wordsPatExtracted,
      TwoDimensionalCounter<E, CandidatePhrase> patternsAndWords4Label,
      String identifier, Set<CandidatePhrase> ignoreWords) throws IOException, ClassNotFoundException {

    boolean computeProcDataFreq = false;
    if (Data.processedDataFreq == null) {
      computeProcDataFreq = true;
      Data.processedDataFreq = new ClassicCounter<>();
      assert Data.rawFreq != null;
    }

    Set<CandidatePhrase> alreadyIdentifiedWords = new HashSet<>(constVars.getLearnedWords(label).keySet());
    alreadyIdentifiedWords.addAll(constVars.getSeedLabelDictionary().get(label));
    Counter<CandidatePhrase> words = learnNewPhrasesPrivate(label, patternsForEachToken,
        patternsLearnedThisIter, allSelectedPatterns, alreadyIdentifiedWords,
        tokensMatchedPatterns, scoreForAllWordsThisIteration, terms, wordsPatExtracted,
        patternsAndWords4Label, identifier, ignoreWords, computeProcDataFreq);
    return words;
  }

  void runParallelApplyPats(Map<String, DataInstance> sents, String label, E pattern,
      TwoDimensionalCounter<CandidatePhrase, E> wordsandLemmaPatExtracted,
      CollectionValuedMap<E, Triple<String, Integer, Integer>> matchedTokensByPat,
      Set<CandidatePhrase> alreadyLabeledWords) {

    Redwood.log(Redwood.DBG, "Applying pattern " + pattern + " to a total of " + sents.size() + " sentences ");
    List<String> notAllowedClasses = new ArrayList<>();
    List<String> sentids = CollectionUtils.toList(sents.keySet());

    if (constVars.doNotExtractPhraseAnyWordLabeledOtherClass) {
      for (String l : constVars.getAnswerClass().keySet()) {
        if (!l.equals(label)) {
          notAllowedClasses.add(l);
        }
      }
      notAllowedClasses.add("OTHERSEM");
    }

    Map<TokenSequencePattern, E> surfacePatternsLearnedThisIterConverted = null;
    Map<SemgrexPattern, E> depPatternsLearnedThisIterConverted = null;

    if (constVars.patternType.equals(PatternFactory.PatternType.SURFACE)) {
      surfacePatternsLearnedThisIterConverted = new HashMap<>();
      String patternStr = null;
      try {
        patternStr = pattern.toString(notAllowedClasses);
        TokenSequencePattern pat = TokenSequencePattern.compile(constVars.env.get(label), patternStr);
        surfacePatternsLearnedThisIterConverted.put(pat, pattern);
      } catch (Exception e) {
        log.info("Error applying pattern " + patternStr
            + ". Probably an ill-formed pattern (can be because of special symbols in label names). Contact the software developer.");
        throw e;
      }
    } else if (constVars.patternType.equals(PatternFactory.PatternType.DEP)) {
      depPatternsLearnedThisIterConverted = new HashMap<>();
      SemgrexPattern pat = SemgrexPattern.compile(pattern.toString(notAllowedClasses),
          new edu.stanford.nlp.semgraph.semgrex.Env(constVars.env.get(label).getVariables()));
      depPatternsLearnedThisIterConverted.put(pat, pattern);
    } else
      throw new UnsupportedOperationException();

    // Apply the patterns and extract candidate phrases.
    int num;
    int numThreads = constVars.numThreads;
    // If the number of sentences is small, do not create many threads.
    if (sents.size() < 50)
      numThreads = 1;
    if (numThreads == 1)
      num = sents.size();
    else
      num = sents.size() / (numThreads - 1);
    ExecutorService executor = Executors.newFixedThreadPool(numThreads);
    List<Future<Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>, Set<CandidatePhrase>>>> list = new ArrayList<>();
    for (int i = 0; i < numThreads; i++) {
      Callable<Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>, Set<CandidatePhrase>>> task;
      if (pattern.type.equals(PatternFactory.PatternType.SURFACE))
        task = new ApplyPatterns(sents,
            num == sents.size() ? sentids : sentids.subList(i * num, Math.min(sentids.size(), (i + 1) * num)),
            surfacePatternsLearnedThisIterConverted, label,
            constVars.removeStopWordsFromSelectedPhrases, constVars.removePhrasesWithStopWords, constVars);
      else
        task = new ApplyDepPatterns(sents,
            num == sents.size() ? sentids : sentids.subList(i * num, Math.min(sentids.size(), (i + 1) * num)),
            depPatternsLearnedThisIterConverted, label,
            constVars.removeStopWordsFromSelectedPhrases, constVars.removePhrasesWithStopWords, constVars);
      list.add(executor.submit(task));
    }

    // Now retrieve the results.
    for (Future<Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>, Set<CandidatePhrase>>> future : list) {
      try {
        Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>, Set<CandidatePhrase>> result = future.get();
        Redwood.log(ConstantsAndVariables.extremedebug, "Pattern " + pattern + " extracted phrases " + result.first());
        wordsandLemmaPatExtracted.addAll(result.first());
        matchedTokensByPat.addAll(result.second());
        alreadyLabeledWords.addAll(result.third());
      } catch (Exception e) {
        executor.shutdownNow();
        throw new RuntimeException(e);
      }
    }
    executor.shutdown();
  }
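  /*
   * Partitioning example for runParallelApplyPats (hypothetical numbers): with
   * 1000 sentences and numThreads = 4, num = 1000 / 3 = 333, so the threads
   * receive the sentence-id sublists [0, 333), [333, 666), [666, 999) and
   * [999, 1000); the Math.min guard keeps the last sublist within bounds.
   */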
  /** Collects, for each pattern, the sentences (by id) in which it may match. */
  protected Map<E, Map<String, DataInstance>> getSentences(Map<E, Set<String>> sentids) {
    try {
      Set<File> files = new HashSet<>();
      Map<E, Map<String, DataInstance>> sentsAll = new HashMap<>();
      CollectionValuedMap<String, E> sentIds2Pats = new CollectionValuedMap<>();
      for (Map.Entry<E, Set<String>> setEn : sentids.entrySet()) {
        if (!sentsAll.containsKey(setEn.getKey()))
          sentsAll.put(setEn.getKey(), new HashMap<>());
        for (String s : setEn.getValue()) {
          sentIds2Pats.add(s, setEn.getKey());
          if (constVars.batchProcessSents) {
            File f = Data.sentId2File.get(s);
            assert f != null : "No file found for sentence " + s;
            files.add(f);
          }
        }
      }
      if (constVars.batchProcessSents) {
        for (File f : files) {
          Map<String, DataInstance> sentsf = IOUtils.readObjectFromFile(f);
          for (Map.Entry<String, DataInstance> s : sentsf.entrySet()) {
            for (E pat : sentIds2Pats.get(s.getKey()))
              sentsAll.get(pat).put(s.getKey(), s.getValue());
          }
        }
      } else {
        for (Map.Entry<String, DataInstance> s : Data.sents.entrySet()) {
          for (E pat : sentIds2Pats.get(s.getKey()))
            sentsAll.get(pat).put(s.getKey(), s.getValue());
        }
      }
      return sentsAll;
    } catch (ClassNotFoundException | IOException e) {
      throw new RuntimeException(e);
    }
  }

  public void applyPats(Counter<E> patterns, String label,
      TwoDimensionalCounter<CandidatePhrase, E> wordsandLemmaPatExtracted,
      CollectionValuedMap<E, Triple<String, Integer, Integer>> matchedTokensByPat,
      Set<CandidatePhrase> alreadyLabeledWords) {
    for (Map.Entry<String, Env> en : constVars.env.entrySet()) {
      en.getValue().getVariables().putAll(ConstantsAndVariables.globalEnv.getVariables());
    }
    Map<E, Map<String, DataInstance>> sentencesForPatterns = getSentences(constVars.invertedIndex.queryIndex(patterns.keySet()));
    for (Map.Entry<E, Map<String, DataInstance>> en : sentencesForPatterns.entrySet()) {
      runParallelApplyPats(en.getValue(), label, en.getKey(), wordsandLemmaPatExtracted,
          matchedTokensByPat, alreadyLabeledWords);
    }
    Redwood.log(Redwood.DBG, "# words/lemma and pattern pairs are " + wordsandLemmaPatExtracted.size());
  }
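  /*
   * Flow sketch for applyPats: it first merges the global tokensregex
   * variables into each label's Env, queries the inverted index for the
   * sentence ids that could match each pattern, loads those sentences via
   * getSentences (from serialized files when batchProcessSents is set), and
   * then applies each pattern in parallel through runParallelApplyPats.
   */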
  private void statsWithoutApplyingPatterns(Map<String, DataInstance> sents,
      PatternsForEachToken patternsForEachToken, Counter<E> patternsLearnedThisIter,
      TwoDimensionalCounter<CandidatePhrase, E> wordsandLemmaPatExtracted) {
    for (Entry<String, DataInstance> sentEn : sents.entrySet()) {
      Map<Integer, Set<E>> pat4Sent = patternsForEachToken.getPatternsForAllTokens(sentEn.getKey());
      if (pat4Sent == null) {
        throw new RuntimeException("No patterns found for sentence " + sentEn.getKey());
      }
      for (Entry<Integer, Set<E>> en : pat4Sent.entrySet()) {
        CoreLabel token = null;
        Set<E> p1 = en.getValue();
        for (E index : patternsLearnedThisIter.keySet()) {
          if (p1.contains(index)) {
            if (token == null)
              token = sentEn.getValue().getTokens().get(en.getKey());
            wordsandLemmaPatExtracted.incrementCount(CandidatePhrase.createOrGet(token.word(), token.lemma()), index);
          }
        }
      }
    }
  }

  private Counter<CandidatePhrase> learnNewPhrasesPrivate(String label,
      PatternsForEachToken patternsForEachToken, Counter<E> patternsLearnedThisIter,
      Counter<E> allSelectedPatterns, Set<CandidatePhrase> alreadyIdentifiedWords,
      CollectionValuedMap<E, Triple<String, Integer, Integer>> matchedTokensByPat,
      Counter<CandidatePhrase> scoreForAllWordsThisIteration,
      TwoDimensionalCounter<CandidatePhrase, E> terms,
      TwoDimensionalCounter<CandidatePhrase, E> wordsPatExtracted,
      TwoDimensionalCounter<E, CandidatePhrase> patternsAndWords4Label,
      String identifier, Set<CandidatePhrase> ignoreWords, boolean computeProcDataFreq)
      throws IOException, ClassNotFoundException {

    Set<CandidatePhrase> alreadyLabeledWords = new HashSet<>();
    if (constVars.doNotApplyPatterns) {
      // Get the statistics the lossy way: just count the pattern matches
      // recorded per token, without actually applying the patterns.
      ConstantsAndVariables.DataSentsIterator sentsIter = new ConstantsAndVariables.DataSentsIterator(constVars.batchProcessSents);
      while (sentsIter.hasNext()) {
        Pair<Map<String, DataInstance>, File> sentsf = sentsIter.next();
        this.statsWithoutApplyingPatterns(sentsf.first(), patternsForEachToken, patternsLearnedThisIter, wordsPatExtracted);
      }
    } else {
      if (patternsLearnedThisIter.size() > 0) {
        this.applyPats(patternsLearnedThisIter, label, wordsPatExtracted, matchedTokensByPat, alreadyLabeledWords);
      }
    }

    if (computeProcDataFreq) {
      if (!phraseScorer.wordFreqNorm.equals(Normalization.NONE)) {
        Redwood.log(Redwood.DBG, "computing processed freq");
        for (Entry<CandidatePhrase, Double> fq : Data.rawFreq.entrySet()) {
          Double in = fq.getValue();
          if (phraseScorer.wordFreqNorm.equals(Normalization.SQRT))
            in = Math.sqrt(in);
          else if (phraseScorer.wordFreqNorm.equals(Normalization.LOG))
            in = 1 + Math.log(in);
          else
            throw new RuntimeException("can't understand the normalization");
          assert !in.isNaN() : "Processed freq is NaN; raw freq was " + fq.getValue();
          Data.processedDataFreq.setCount(fq.getKey(), in);
        }
      } else
        Data.processedDataFreq = Data.rawFreq;
    }
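    // Worked example (hypothetical raw frequency of 100): SQRT normalization
    // stores sqrt(100) = 10, LOG stores 1 + ln(100), about 5.61, and NONE
    // reuses the raw counter unchanged.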
    if (constVars.wordScoring.equals(WordScoring.WEIGHTEDNORM)) {
      for (CandidatePhrase en : wordsPatExtracted.firstKeySet()) {
        if (!constVars.getOtherSemanticClassesWords().contains(en)
            && (en.getPhraseLemma() == null
                || !constVars.getOtherSemanticClassesWords().contains(CandidatePhrase.createOrGet(en.getPhraseLemma())))
            && !alreadyLabeledWords.contains(en)) {
          terms.addAll(en, wordsPatExtracted.getCounter(en));
        }
      }
      removeKeys(terms, constVars.getStopWords());

      Counter<CandidatePhrase> phraseScores = phraseScorer.scorePhrases(label, terms,
          wordsPatExtracted, allSelectedPatterns, alreadyIdentifiedWords, false);
      Redwood.log(ConstantsAndVariables.extremedebug,
          "count for word U.S. is " + phraseScores.getCount(CandidatePhrase.createOrGet("U.S.")));

      Set<CandidatePhrase> ignoreWordsAll;
      if (ignoreWords != null && !ignoreWords.isEmpty()) {
        ignoreWordsAll = CollectionUtils.unionAsSet(ignoreWords, constVars.getOtherSemanticClassesWords());
      } else
        ignoreWordsAll = new HashSet<>(constVars.getOtherSemanticClassesWords());
      ignoreWordsAll.addAll(constVars.getSeedLabelDictionary().get(label));
      ignoreWordsAll.addAll(constVars.getLearnedWords(label).keySet());
      Redwood.log(ConstantsAndVariables.extremedebug,
          "ignoreWordsAll contains word U.S. is " + ignoreWordsAll.contains(CandidatePhrase.createOrGet("U.S.")));

      Counter<CandidatePhrase> finalwords = chooseTopWords(phraseScores, terms, phraseScores,
          ignoreWordsAll, constVars.thresholdWordExtract);
      phraseScorer.printReasonForChoosing(finalwords);

      scoreForAllWordsThisIteration.clear();
      Counters.addInPlace(scoreForAllWordsThisIteration, phraseScores);

      Redwood.log(ConstantsAndVariables.minimaldebug,
          "\n\n## Selected Words for " + label + " : "
              + Counters.toSortedString(finalwords, finalwords.size(), "%1$s:%2$.2f", "\t"));

      if (constVars.goldEntities != null) {
        Map<String, Boolean> goldEntities4Label = constVars.goldEntities.get(label);
        if (goldEntities4Label != null) {
          StringBuilder s = new StringBuilder();
          finalwords.keySet().forEach(x -> s.append(x.getPhrase()
              + (goldEntities4Label.containsKey(x.getPhrase()) ? ":" + goldEntities4Label.get(x.getPhrase()) : ":UNKNOWN")
              + "\n"));
          Redwood.log(ConstantsAndVariables.minimaldebug,
              "\n\n## Gold labels for selected words for label " + label + " : " + s.toString());
        } else
          Redwood.log(Redwood.DBG, "No gold entities provided for label " + label);
      }
":"+goldEntities4Label.get(x.getPhrase()) : ":UKNOWN")+"\n")); Redwood.log(ConstantsAndVariables.minimaldebug, "\n\n## Gold labels for selected words for label " + label + " : " + s.toString()); } else Redwood.log(Redwood.DBG, "No gold entities provided for label " + label); } if (constVars.outDir != null && !constVars.outDir.isEmpty()) { String outputdir = constVars.outDir + "/" + identifier +"/"+ label; IOUtils.ensureDir(new File(outputdir)); TwoDimensionalCounter<CandidatePhrase, CandidatePhrase> reasonForWords = new TwoDimensionalCounter<>(); for (CandidatePhrase word : finalwords.keySet()) { for (E l : wordsPatExtracted.getCounter(word).keySet()) { for (CandidatePhrase w2 : patternsAndWords4Label.getCounter(l)) { reasonForWords.incrementCount(word, w2); } } } Redwood.log(ConstantsAndVariables.minimaldebug, "Saving output in " + outputdir); String filename = outputdir + "/words.json"; // the json object is an array corresponding to each iteration - of list // of objects, // each of which is a bean of entity and reasons JsonArrayBuilder obj = Json.createArrayBuilder(); if (writtenInJustification.containsKey(label) && writtenInJustification.get(label)) { JsonReader jsonReader = Json.createReader(new BufferedInputStream( new FileInputStream(filename))); JsonArray objarr = jsonReader.readArray(); for (JsonValue o : objarr) obj.add(o); jsonReader.close(); } JsonArrayBuilder objThisIter = Json.createArrayBuilder(); for (CandidatePhrase w : reasonForWords.firstKeySet()) { JsonObjectBuilder objinner = Json.createObjectBuilder(); JsonArrayBuilder l = Json.createArrayBuilder(); for (CandidatePhrase w2 : reasonForWords.getCounter(w).keySet()) { l.add(w2.getPhrase()); } JsonArrayBuilder pats = Json.createArrayBuilder(); for (E p : wordsPatExtracted.getCounter(w)) { pats.add(p.toStringSimple()); } objinner.add("reasonwords", l); objinner.add("patterns", pats); objinner.add("score", finalwords.getCount(w)); objinner.add("entity", w.getPhrase()); objThisIter.add(objinner.build()); } obj.add(objThisIter); // Redwood.log(ConstantsAndVariables.minimaldebug, channelNameLogger, // "Writing justification at " + filename); IOUtils.writeStringToFile(StringUtils.normalize(StringUtils.toAscii(obj.build().toString())), filename, "ASCII"); writtenInJustification.put(label, true); } if (constVars.justify) { Redwood.log(Redwood.DBG, "\nJustification for phrases:\n"); for (CandidatePhrase word : finalwords.keySet()) { Redwood.log( Redwood.DBG, "Phrase " + word + " extracted because of patterns: \t" + Counters.toSortedString(wordsPatExtracted.getCounter(word), wordsPatExtracted.getCounter(word).size(), "%1$s:%2$f", "\n")); } } // if (usePatternResultAsLabel) // if (answerLabel != null) // labelWords(sents, commonEngWords, finalwords.keySet(), // patterns.keySet(), outFile); // else // throw new RuntimeException("why is the answer label null?"); return finalwords; } else if (constVars.wordScoring.equals(WordScoring.BPB)) { Counters.addInPlace(terms, wordsPatExtracted); Counter<CandidatePhrase> maxPatWeightTerms = new ClassicCounter<>(); Map<CandidatePhrase, E> wordMaxPat = new HashMap<>(); for (Entry<CandidatePhrase, ClassicCounter<E>> en : terms.entrySet()) { Counter<E> weights = new ClassicCounter<>(); for (E k : en.getValue().keySet()) weights.setCount(k, patternsLearnedThisIter.getCount(k)); maxPatWeightTerms.setCount(en.getKey(), Counters.max(weights)); wordMaxPat.put(en.getKey(), Counters.argmax(weights)); } Counters.removeKeys(maxPatWeightTerms, alreadyIdentifiedWords); double maxvalue = 
      if (constVars.justify) {
        Redwood.log(Redwood.DBG, "\nJustification for phrases:\n");
        for (CandidatePhrase word : finalwords.keySet()) {
          Redwood.log(Redwood.DBG, "Phrase " + word + " extracted because of patterns: \t"
              + Counters.toSortedString(wordsPatExtracted.getCounter(word),
                  wordsPatExtracted.getCounter(word).size(), "%1$s:%2$f", "\n"));
        }
      }
      return finalwords;

    } else if (constVars.wordScoring.equals(WordScoring.BPB)) {
      Counters.addInPlace(terms, wordsPatExtracted);
      Counter<CandidatePhrase> maxPatWeightTerms = new ClassicCounter<>();
      Map<CandidatePhrase, E> wordMaxPat = new HashMap<>();
      for (Entry<CandidatePhrase, ClassicCounter<E>> en : terms.entrySet()) {
        Counter<E> weights = new ClassicCounter<>();
        for (E k : en.getValue().keySet())
          weights.setCount(k, patternsLearnedThisIter.getCount(k));
        maxPatWeightTerms.setCount(en.getKey(), Counters.max(weights));
        wordMaxPat.put(en.getKey(), Counters.argmax(weights));
      }
      Counters.removeKeys(maxPatWeightTerms, alreadyIdentifiedWords);
      double maxvalue = Counters.max(maxPatWeightTerms);
      Set<CandidatePhrase> words = Counters.keysAbove(maxPatWeightTerms, maxvalue - 1e-10);
      CandidatePhrase bestw = null;
      if (words.size() > 1) {
        // Break ties by the word's count under its highest-weighted pattern.
        double max = Double.NEGATIVE_INFINITY;
        for (CandidatePhrase w : words) {
          if (terms.getCount(w, wordMaxPat.get(w)) > max) {
            max = terms.getCount(w, wordMaxPat.get(w));
            bestw = w;
          }
        }
      } else if (words.size() == 1)
        bestw = words.iterator().next();
      else
        return new ClassicCounter<>();
      Redwood.log(ConstantsAndVariables.minimaldebug, "Selected Words: " + bestw);
      return Counters.asCounter(Arrays.asList(bestw));

    } else
      throw new RuntimeException("wordscoring " + constVars.wordScoring + " not identified");
  }

  Counter<String> getLearnedScores() {
    return phraseScorer.getLearnedScores();
  }
}