package edu.stanford.nlp.parser.lexparser;

import edu.stanford.nlp.util.logging.Redwood;
import edu.stanford.nlp.fsm.*;
import edu.stanford.nlp.io.NumberRangeFileFilter;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.util.*;
import edu.stanford.nlp.stats.ClassicCounter;

import java.util.*;

/**
 * @author Teg Grenager (grenager@cs.stanford.edu)
 */
public class GrammarCompactionTester {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(GrammarCompactionTester.class);

  // for debugging
  // public static MergeableGraph debugGraph = null;

  ExhaustivePCFGParser parser = null;
  ExhaustiveDependencyParser dparser = null;
  BiLexPCFGParser bparser = null;
  Scorer scorer = null;

  // initialized here so that option parsing in runTest() never dereferences a null op
  Options op = new Options();
  // TreebankLangParserParams tlpParams = new EnglishTreebankParserParams();
  // tlpParams may be changed to something else later, so don't use it till
  // after options are parsed.

  GrammarCompactor compactor = null;

  Map<String, List<List<String>>> allTestPaths = Generics.newHashMap();
  Map<String, List<List<String>>> allTrainPaths = Generics.newHashMap();

  String asciiOutputPath = null;
  String path = "/u/nlp/stuff/corpora/Treebank3/parsed/mrg/wsj";
  int trainLow = 200, trainHigh = 2199, testLow = 2200, testHigh = 2219;

  String suffixOrderString = null;
  String minArcNumString = null;
  String maxMergeCostString = null;
  String sizeCutoffString = null;
  String minPortionArcsString = null;
  String ignoreUnsupportedSuffixesString = "false";
  String splitParamString = null;
  String costModelString = null;
  String verboseString = null;
  String minArcCostString = null;
  String trainThresholdString = null;
  String heldoutThresholdString = null;
  int markovOrder = -1;
  String smoothParamString = null;
  String scoringData = null;
  String allowEpsilonsString = null;
  boolean saveGraphs = false;

  private int indexRangeLow;
  private int indexRangeHigh;
  private String outputFile = null;
  private String inputFile = null;
  private boolean toy = false;

  /**
   * Loads trees from the given section range of the treebank, optionally
   * annotates them, and extracts the per-category paths used to build automata.
   */
  public Map<String, List<List<String>>> extractPaths(String path, int low, int high, boolean annotate) {
    // set up tree transforms
    Treebank trainTreebank = op.tlpParams.memoryTreebank(); // this is a new one
    TreebankLanguagePack tlp = op.langpack();
    trainTreebank.loadPath(path, new NumberRangeFileFilter(low, high, true));
    if (op.trainOptions.selectiveSplit) {
      op.trainOptions.splitters = ParentAnnotationStats.getSplitCategories(trainTreebank, op.trainOptions.selectiveSplitCutOff, op.tlpParams.treebankLanguagePack());
    }
    if (op.trainOptions.selectivePostSplit) {
      TreeTransformer myTransformer = new TreeAnnotator(op.tlpParams.headFinder(), op.tlpParams, op);
      Treebank annotatedTB = trainTreebank.transform(myTransformer);
      op.trainOptions.postSplitters = ParentAnnotationStats.getSplitCategories(annotatedTB, op.trainOptions.selectivePostSplitCutOff, op.tlpParams.treebankLanguagePack());
    }

    List<Tree> trainTrees = new ArrayList<>();
    HeadFinder hf;
    if (op.trainOptions.leftToRight) {
      hf = new LeftHeadFinder();
    } else {
      hf = op.tlpParams.headFinder();
    }
    TreeTransformer annotator = new TreeAnnotator(hf, op.tlpParams, op);
    for (Tree tree : trainTreebank) {
      if (annotate) {
        tree = annotator.transformTree(tree);
      }
      trainTrees.add(tree);
    }
    Extractor<Map<String, List<List<String>>>> pExtractor = new PathExtractor(hf, op);
    return pExtractor.extract(trainTrees);
  }

  public static void main(String[] args) {
    new GrammarCompactionTester().runTest(args);
  }
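  // Illustrative helper, not part of the original class: a minimal sketch of
  // how the Map returned by extractPaths() is shaped. Keys are category names;
  // values are lists of paths, each path a list of labels. The summary format
  // and any example values are hypothetical.
  private void printPathSummary(Map<String, List<List<String>>> allPaths) {
    for (Map.Entry<String, List<List<String>>> entry : allPaths.entrySet()) {
      // e.g. "NP: 5000 paths, 23000 tokens" (hypothetical numbers)
      System.out.println(entry.getKey() + ": " + entry.getValue().size() +
          " paths, " + numTokens(entry.getValue()) + " tokens");
    }
  }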
  public void runTest(String[] args) {
    System.out.println("Currently " + new Date());
    System.out.print("Invoked with arguments:");
    for (String arg : args) {
      System.out.print(" " + arg);
    }
    System.out.println();

    int i = 0;
    while (i < args.length && args[i].startsWith("-")) {
      if (args[i].equalsIgnoreCase("-path") && (i + 1 < args.length)) {
        path = args[i + 1];
        i += 2;
      } else if (args[i].equalsIgnoreCase("-saveToAscii") && (i + 1 < args.length)) {
        asciiOutputPath = args[i + 1];
        i += 2;
      } else if (args[i].equalsIgnoreCase("-train") && (i + 2 < args.length)) {
        trainLow = Integer.parseInt(args[i + 1]);
        trainHigh = Integer.parseInt(args[i + 2]);
        i += 3;
      } else if (args[i].equalsIgnoreCase("-test") && (i + 2 < args.length)) {
        testLow = Integer.parseInt(args[i + 1]);
        testHigh = Integer.parseInt(args[i + 2]);
        i += 3;
      } else if (args[i].equalsIgnoreCase("-index") && (i + 2 < args.length)) {
        indexRangeLow = Integer.parseInt(args[i + 1]);
        indexRangeHigh = Integer.parseInt(args[i + 2]);
        i += 3;
      } else if (args[i].equalsIgnoreCase("-outputFile")) {
        outputFile = args[i + 1];
        i += 2;
      } else if (args[i].equalsIgnoreCase("-inputFile")) {
        inputFile = args[i + 1];
        i += 2;
      } else if (args[i].equalsIgnoreCase("-suffixOrder")) {
        suffixOrderString = args[i + 1];
        i += 2;
      } else if (args[i].equalsIgnoreCase("-minArcNum")) {
        minArcNumString = args[i + 1];
        i += 2;
      } else if (args[i].equalsIgnoreCase("-maxMergeCost")) {
        maxMergeCostString = args[i + 1];
        i += 2;
      } else if (args[i].equalsIgnoreCase("-sizeCutoff")) {
        sizeCutoffString = args[i + 1];
        i += 2;
      } else if (args[i].equalsIgnoreCase("-minPortionArcs")) {
        minPortionArcsString = args[i + 1];
        i += 2;
      } else if (args[i].equalsIgnoreCase("-ignoreUnsupportedSuffixes")) {
        ignoreUnsupportedSuffixesString = args[i + 1];
        i += 2;
      } else if (args[i].equalsIgnoreCase("-trainThreshold")) {
        trainThresholdString = args[i + 1];
        i += 2;
      } else if (args[i].equalsIgnoreCase("-heldoutThreshold")) {
        heldoutThresholdString = args[i + 1];
        i += 2;
      } else if (args[i].equalsIgnoreCase("-minArcCost")) {
        minArcCostString = args[i + 1];
        i += 2;
      } else if (args[i].equalsIgnoreCase("-splitParam")) {
        splitParamString = args[i + 1];
        i += 2;
      } else if (args[i].equalsIgnoreCase("-costModel")) {
        costModelString = args[i + 1];
        i += 2;
      } else if (args[i].equalsIgnoreCase("-scoringData")) {
        scoringData = args[i + 1];
        i += 2;
      } else if (args[i].equalsIgnoreCase("-verbose")) {
        verboseString = args[i + 1];
        i += 2;
      } else if (args[i].equalsIgnoreCase("-allowEpsilons")) {
        allowEpsilonsString = args[i + 1];
        i += 2;
      } else if (args[i].equalsIgnoreCase("-saveGraphs")) {
        saveGraphs = true;
        i++;
      } else if (args[i].equalsIgnoreCase("-toy")) {
        toy = true;
        i++;
      } else if (args[i].equalsIgnoreCase("-markovOrder")) {
        markovOrder = Integer.parseInt(args[i + 1]);
        i += 2;
      } else if (args[i].equalsIgnoreCase("-smoothParam")) {
        smoothParamString = args[i + 1];
        i += 2;
      } else {
        i = op.setOptionOrWarn(args, i);
      }
    }

    op.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(op.tlpParams.sisterSplitters()));

    if (op.trainOptions.compactGrammar() == 4) {
      System.out.println("Instantiating fsm.LossyGrammarCompactor");
      try {
        Class<?>[] argTypes = new Class<?>[13];
        Arrays.fill(argTypes, String.class);
        Object[] cArgs = { suffixOrderString, minArcNumString, trainThresholdString,
                           heldoutThresholdString, sizeCutoffString, minPortionArcsString,
                           splitParamString, ignoreUnsupportedSuffixesString, minArcCostString,
                           smoothParamString, costModelString, scoringData, verboseString };
        compactor = (GrammarCompactor) Class.forName("fsm.LossyGrammarCompactor").getConstructor(argTypes).newInstance(cArgs);
      } catch (Exception e) {
        log.info("Couldn't instantiate GrammarCompactor: " + e);
        e.printStackTrace();
      }
    } else if (op.trainOptions.compactGrammar() == 5) {
      System.out.println("Instantiating fsm.CategoryMergingGrammarCompactor");
      try {
        Class<?>[] argTypes = new Class<?>[6];
        Arrays.fill(argTypes, String.class);
        Object[] cArgs = { splitParamString, trainThresholdString, heldoutThresholdString,
                           minArcCostString, ignoreUnsupportedSuffixesString, smoothParamString };
        compactor = (GrammarCompactor) Class.forName("fsm.CategoryMergingGrammarCompactor").getConstructor(argTypes).newInstance(cArgs);
      } catch (Exception e) {
        throw new RuntimeException("Couldn't instantiate CategoryMergingGrammarCompactor", e);
      }
    } else if (op.trainOptions.compactGrammar() == 3) {
      System.out.println("Instantiating fsm.ExactGrammarCompactor");
      compactor = new ExactGrammarCompactor(op, saveGraphs, true);
    } else if (op.trainOptions.compactGrammar() > 0) {
      // other positive compaction settings: no compactor is instantiated here
    }

    if (markovOrder >= 0) {
      op.trainOptions.markovOrder = markovOrder;
      op.trainOptions.hSelSplit = false;
    }

    if (toy) {
      buildAndCompactToyGrammars();
    } else {
      testGrammarCompaction();
    }
  }
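  // A minimal sketch, not part of the original code, factoring out the
  // reflective construction pattern used twice in runTest() above. The
  // all-String-parameter constructor convention mirrors the fsm.* compactors
  // referenced there; any other class name passed in is purely hypothetical.
  private static GrammarCompactor instantiateByName(String className, String... ctorArgs) {
    try {
      Class<?>[] argTypes = new Class<?>[ctorArgs.length];
      Arrays.fill(argTypes, String.class); // every constructor parameter is a String
      return (GrammarCompactor) Class.forName(className)
          .getConstructor(argTypes).newInstance((Object[]) ctorArgs);
    } catch (Exception e) {
      throw new RuntimeException("Couldn't instantiate " + className, e);
    }
  }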
  /*
  private static void testOneAtATimeMerging() {
    // use the parser constructor to extract the grammars from the treebank once
    LexicalizedParser lp = new LexicalizedParser(path, new NumberRangeFileFilter(trainLow, trainHigh, true), tlpParams);
    ParserData pd = lp.parserData();
    Pair originalGrammar = new Pair(pd.ug, pd.bg);

    // extract a bunch of paths
    Timing.startTime();
    System.out.print("Extracting other paths...");
    allTrainPaths = extractPaths(path, trainLow, trainHigh, true);
    allTestPaths = extractPaths(path, testLow, testHigh, true);
    Timing.tick("done");

    List mergePairs = null;
    if (inputFile != null) {
      // read merge pairs from file and do them and parse
      System.out.println("getting pairs from file: " + inputFile);
      mergePairs = getMergePairsFromFile(inputFile);
    }

    // try one merge at a time and parse afterwards
    Numberer originalNumberer = Numberer.getGlobalNumberer("states");
    String header = "index\tmergePair\tmergeCost\tparseF1\n";
    StringUtils.printToFile(outputFile, header, true);
    for (int i = indexRangeLow; i < indexRangeHigh; i++) {
      Timing.startTime();
      Numberer.getNumberers().put("states", originalNumberer);
      if (mergePairs != null)
        System.out.println("passing merge pairs to compactor: " + mergePairs);
      CategoryMergingGrammarCompactor compactor = new CategoryMergingGrammarCompactor(mergePairs, i);
      System.out.println("Compacting grammars with index " + i);
      Pair compactedGrammar = compactor.compactGrammar(originalGrammar, allTrainPaths, allTestPaths);
      Pair mergePair = null;
      double mergeCosts = Double.NEGATIVE_INFINITY;
      List mergeList = compactor.getCompletedMergeList();
      if (mergeList != null && mergeList.size() > 0) {
        mergePair = (Pair) mergeList.get(0);
        mergeCosts = compactor.getActualScores().getCount(mergePair);
      }
      ParserData newPd = new ParserData(pd.lex, (BinaryGrammar) compactedGrammar.second, (UnaryGrammar) compactedGrammar.first, pd.dg, pd.numbs, pd.pt);
      lp = new LexicalizedParser(newPd);
      Timing.tick("done.");
      Treebank testTreebank = tlpParams.testMemoryTreebank();
      testTreebank.loadPath(path, new NumberRangeFileFilter(testLow, testHigh, true));
      System.out.println("Currently " + new Date());
      double f1 = lp.testOnTreebank(testTreebank);
      System.out.println("Currently " + new Date());
      String resultString = i + "\t" + mergePair + "\t" + mergeCosts + "\t" + f1 + "\n";
      StringUtils.printToFile(outputFile, resultString, true);
    }
  }

  private static List getMergePairsFromFile(String filename) {
    List result = new ArrayList();
    try {
      String fileString = StringUtils.slurpFile(new File(filename));
      StringTokenizer st = new StringTokenizer(fileString);
      while (st.hasMoreTokens()) {
        String token1 = st.nextToken();
        if (st.hasMoreTokens()) {
          String token2 = st.nextToken();
          UnorderedPair pair = new UnorderedPair(token1, token2);
          result.add(pair);
        }
      }
    } catch (Exception e) {
      throw new RuntimeException("couldn't access file: " + filename);
    }
    return result;
  }
  */
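  // Sketch of the merge-pairs file format consumed by the commented-out
  // getMergePairsFromFile() above, inferred from its tokenizer loop: tokens
  // are whitespace-separated and taken two at a time as unordered pairs.
  // The category names shown are hypothetical.
  //
  //   NP^S NP^VP
  //   VP^S VP^SINV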
  /*
    // System.out.println(MergeableGraph.areIsomorphic(graphs[0], graphs[1], graphs[0].getStartNode(), graphs[1].getStartNode()));
    // System.out.println(MergeableGraph.areIsomorphic(graphs[1], graphs[2], graphs[1].getStartNode(), graphs[2].getStartNode()));
    // System.out.println(MergeableGraph.areIsomorphic(graphs[2], graphs[0], graphs[2].getStartNode(), graphs[0].getStartNode()));

    // now go through the grammars themselves and see if they are equal
    System.out.println("UR 0 and 1: " + equalsUnary(((UnaryGrammar) grammars[0].first).rules(), ((UnaryGrammar) grammars[1].first).rules()));
    System.out.println("UR 1 and 2: " + equalsUnary(((UnaryGrammar) grammars[1].first).rules(), ((UnaryGrammar) grammars[2].first).rules()));
    System.out.println("UR 2 and 0: " + equalsUnary(((UnaryGrammar) grammars[2].first).rules(), ((UnaryGrammar) grammars[0].first).rules()));
    System.out.println("BR 0 and 1: " + equalsBinary(((BinaryGrammar) grammars[0].second).rules(), ((BinaryGrammar) grammars[1].second).rules()));
    System.out.println("BR 1 and 2: " + equalsBinary(((BinaryGrammar) grammars[1].second).rules(), ((BinaryGrammar) grammars[2].second).rules()));
    System.out.println("BR 2 and 0: " + equalsBinary(((BinaryGrammar) grammars[2].second).rules(), ((BinaryGrammar) grammars[0].second).rules()));
    System.exit(0);

    // now go through the grammars we made and see if they are equal!
    Set[] unaryRules = new Set[3];
    Set[] binaryRules = new Set[3];
    for (int i = 0; i < grammars.length; i++) {
      unaryRules[i] = new HashSet();
      System.out.println(i + " size: " + ((UnaryGrammar) grammars[i].first()).numRules());
      for (Iterator unRuleI = ((UnaryGrammar) grammars[i].first()).iterator(); unRuleI.hasNext(); ) {
        UnaryRule ur = (UnaryRule) unRuleI.next();
        String parent = (String) stateNumberers[i].object(ur.parent);
        String child = (String) stateNumberers[i].object(ur.child);
        unaryRules[i].add(new StringUnaryRule(parent, child, ur.score));
      }
      binaryRules[i] = new HashSet();
      System.out.println(i + " size: " + ((BinaryGrammar) grammars[i].second()).numRules());
      for (Iterator binRuleI = ((BinaryGrammar) grammars[i].second()).iterator(); binRuleI.hasNext(); ) {
        BinaryRule br = (BinaryRule) binRuleI.next();
        String parent = (String) stateNumberers[i].object(br.parent);
        String leftChild = (String) stateNumberers[i].object(br.leftChild);
        String rightChild = (String) stateNumberers[i].object(br.rightChild);
        binaryRules[i].add(new StringBinaryRule(parent, leftChild, rightChild, br.score));
      }
    }
    System.out.println("uR 0 and 1: " + equals(unaryRules[0], unaryRules[1]));
    System.out.println("uR 1 and 2: " + equals(unaryRules[1], unaryRules[2]));
    System.out.println("uR 2 and 0: " + equals(unaryRules[2], unaryRules[0]));
    System.out.println("bR 0 and 1: " + equals(binaryRules[0], binaryRules[1]));
    System.out.println("bR 1 and 2: " + equals(binaryRules[1], binaryRules[2]));
    System.out.println("bR 2 and 0: " + equals(binaryRules[2], binaryRules[0]));
  }
  */

  /*
  public static void testCategoryMergingProblem() {
    LexicalizedParser lp = new LexicalizedParser(path, new NumberRangeFileFilter(trainLow, trainHigh, true), tlpParams);

    // test it without the change
    Treebank testTreebank = tlpParams.testMemoryTreebank();
    testTreebank.loadPath(path, new NumberRangeFileFilter(testLow, testHigh, true));
    System.out.println("Currently " + new Date());
    lp.testOnTreebank(testTreebank);
    System.out.println("Currently " + new Date());

    // pull out the rules and consistently change the name of one of the states
    ParserData pd = lp.parserData();
    BinaryGrammar bg = pd.bg;
    UnaryGrammar ug = pd.ug;
    Numberer stateNumberer = Numberer.getGlobalNumberer("states");
    UnaryGrammar newUG = new UnaryGrammar(stateNumberer.total() + 1);
    for (Iterator urIter = ug.iterator(); urIter.hasNext(); ) {
      UnaryRule rule = (UnaryRule) urIter.next();
      rule.parent = changeIfNecessary(rule.parent, stateNumberer);
      rule.child = changeIfNecessary(rule.child, stateNumberer);
      newUG.addRule(rule);
    }
    BinaryGrammar newBG = new BinaryGrammar(stateNumberer.total() + 1);
    for (Iterator brIter = bg.iterator(); brIter.hasNext(); ) {
      BinaryRule rule = (BinaryRule) brIter.next();
      rule.parent = changeIfNecessary(rule.parent, stateNumberer);
      rule.leftChild = changeIfNecessary(rule.leftChild, stateNumberer);
      rule.rightChild = changeIfNecessary(rule.rightChild, stateNumberer);
      newBG.addRule(rule);
    }
    newUG.purgeRules();
    newBG.splitRules();
    pd.ug = newUG;
    pd.bg = newBG;
    lp = new LexicalizedParser(pd);

    // test it with the change
    testTreebank = tlpParams.testMemoryTreebank();
    testTreebank.loadPath(path, new NumberRangeFileFilter(testLow, testHigh, true));
    System.out.println("Currently " + new Date());
    lp.testOnTreebank(testTreebank);
    System.out.println("Currently " + new Date());
  }
  */
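  // Worked sketch (hypothetical indices) of the translate() helper used by
  // translateAndSort() below: each state number from the old index is mapped
  // to the number of the same state name in the new index, appending the name
  // to the new index if it is absent.
  //
  //   oldIndex: 0 -> "NP", 1 -> "VP"       newIndex (before): 0 -> "VP"
  //   translate(0, oldIndex, newIndex) == 1   // "NP" appended to newIndex
  //   translate(1, oldIndex, newIndex) == 0   // "VP" already present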
  public Pair<UnaryGrammar, BinaryGrammar> translateAndSort(Pair<UnaryGrammar, BinaryGrammar> grammar, Index<String> oldIndex, Index<String> newIndex) {
    System.out.println("oldIndex.size() = " + oldIndex.size() + ", newIndex.size() = " + newIndex.size());

    UnaryGrammar ug = grammar.first;
    List<UnaryRule> unaryRules = new ArrayList<>();
    for (UnaryRule rule : ug.rules()) {
      rule.parent = translate(rule.parent, oldIndex, newIndex);
      rule.child = translate(rule.child, oldIndex, newIndex);
      unaryRules.add(rule);
    }
    Collections.sort(unaryRules);
    UnaryGrammar newUG = new UnaryGrammar(newIndex);
    for (UnaryRule unaryRule : unaryRules) {
      newUG.addRule(unaryRule);
    }
    newUG.purgeRules();

    BinaryGrammar bg = grammar.second;
    List<BinaryRule> binaryRules = new ArrayList<>();
    for (BinaryRule rule : bg.rules()) {
      rule.parent = translate(rule.parent, oldIndex, newIndex);
      rule.leftChild = translate(rule.leftChild, oldIndex, newIndex);
      rule.rightChild = translate(rule.rightChild, oldIndex, newIndex);
      binaryRules.add(rule);
    }
    Collections.sort(binaryRules);
    BinaryGrammar newBG = new BinaryGrammar(newIndex);
    for (BinaryRule binaryRule : binaryRules) {
      newBG.addRule(binaryRule);
    }
    newBG.splitRules();
    return Generics.newPair(newUG, newBG);
  }

  private static int translate(int i, Index<String> oldIndex, Index<String> newIndex) {
    return newIndex.addToIndex(oldIndex.get(i));
  }

  // Used by the commented-out testCategoryMergingProblem() above: consistently
  // renames the NP^PP state to an arbitrary fresh category name.
  public int changeIfNecessary(int i, Index<String> n) {
    String s = n.get(i);
    if (s.equals("NP^PP")) {
      System.out.println("changed");
      return n.addToIndex("NP-987928374");
    }
    return i;
  }

  public boolean equalsBinary(List<BinaryRule> l1, List<BinaryRule> l2) {
    // put each into a map to itself
    Map<BinaryRule, BinaryRule> map1 = Generics.newHashMap();
    for (BinaryRule o : l1) {
      map1.put(o, o);
    }
    Map<BinaryRule, BinaryRule> map2 = Generics.newHashMap();
    for (BinaryRule o : l2) {
      map2.put(o, o);
    }
    boolean isEqual = true;
    for (BinaryRule rule1 : map1.keySet()) {
      BinaryRule rule2 = map2.get(rule1);
      if (rule2 == null) {
        System.out.println("no rule for " + rule1);
        isEqual = false;
      } else {
        map2.remove(rule2);
        if (rule1.score != rule2.score) {
          System.out.println(rule1 + " and " + rule2 + " have diff scores");
          isEqual = false;
        }
      }
    }
    System.out.println("left over: " + map2.keySet());
    return isEqual;
  }

  public boolean equalsUnary(List<UnaryRule> l1, List<UnaryRule> l2) {
    // put each into a map to itself
    Map<UnaryRule, UnaryRule> map1 = Generics.newHashMap();
    for (UnaryRule o : l1) {
      map1.put(o, o);
    }
    Map<UnaryRule, UnaryRule> map2 = Generics.newHashMap();
    for (UnaryRule o : l2) {
      map2.put(o, o);
    }
    boolean isEqual = true;
    for (UnaryRule rule1 : map1.keySet()) {
      UnaryRule rule2 = map2.get(rule1);
      if (rule2 == null) {
        System.out.println("no rule for " + rule1);
        isEqual = false;
      } else {
        map2.remove(rule2);
        if (rule1.score != rule2.score) {
          System.out.println(rule1 + " and " + rule2 + " have diff scores");
          isEqual = false;
        }
      }
    }
    System.out.println("left over: " + map2.keySet());
    return isEqual;
  }
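  // Sketch of a generic alternative to equalsBinary/equalsUnary above, not in
  // the original class: compares two rule lists by equals()/hashCode() plus a
  // caller-supplied score accessor, reporting mismatches the same way.
  // Usage assumes the rule type exposes a numeric score field, e.g.
  //   equalRules(ug1.rules(), ug2.rules(), r -> r.score)
  private static <R> boolean equalRules(List<R> l1, List<R> l2,
                                        java.util.function.ToDoubleFunction<R> score) {
    Map<R, R> map2 = Generics.newHashMap();
    for (R r : l2) {
      map2.put(r, r);
    }
    boolean isEqual = true;
    for (R rule1 : l1) {
      R rule2 = map2.remove(rule1);
      if (rule2 == null) {
        System.out.println("no rule for " + rule1);
        isEqual = false;
      } else if (score.applyAsDouble(rule1) != score.applyAsDouble(rule2)) {
        System.out.println(rule1 + " and " + rule2 + " have diff scores");
        isEqual = false;
      }
    }
    System.out.println("left over: " + map2.keySet());
    return isEqual;
  }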
" + set2.size()); isEqual = false; } Set<T> newSet1 = (Set<T>) ((HashSet<T>) set1).clone(); newSet1.removeAll(set2); if (newSet1.size() > 0) { isEqual = false; System.out.println("set1 left with: " + newSet1); } Set<T> newSet2 = (Set<T>) ((HashSet<T>) set2).clone(); newSet2.removeAll(set1); if (newSet2.size() > 0) { isEqual = false; System.out.println("set2 left with: " + newSet2); } return isEqual; } /* public static void testAutomatonCompaction() { // make our LossyAutomatonCompactor from the parameters passed at command line // now set up the compactor2 constructor args // extract a bunch of paths Timing.startTime(); System.out.print("Extracting paths from treebank..."); allTrainPaths = extractPaths(path, trainLow, trainHigh, false); allTestPaths = extractPaths(path, testLow, testHigh, false); Timing.tick("done"); // for each category, construct an automaton and then compact it for (Iterator catIter = allTrainPaths.keySet().iterator(); catIter.hasNext();) { // construct an automaton from the paths String category = (String) catIter.next(); List trainPaths = (List) allTrainPaths.get(category); List testPaths = (List) allTestPaths.get(category); if (testPaths == null) testPaths = new ArrayList(); // now make the graph with the training paths (the LossyAutomatonCompactor will reestimate the weights anyway) TransducerGraph graph = TransducerGraph.createGraphFromPaths(trainPaths, 3); System.out.println("Created graph for: " + category); System.out.println(); int numArcs1 = graph.getArcs().size(); LossyAutomatonCompactor compactor = new LossyAutomatonCompactor(3, // horizonOrder, 1 means that only exactly compatible merges are considered 0, // min nmber of arcs 10000000.0, // maxMergeCost 0.5, // splitParam false, // ignoreUnsupportedSuffixes -1000, // minArcCost trainPaths, testPaths, LossyAutomatonCompactor.DATA_LIKELIHOOD_COST, // costModel false); // verbose TransducerGraph result = compactor.compactFA(graph); //do we need this? 
      result = new TransducerGraph(result, ntsp); // pull out strings from sets returned by minimizer
      int numArcs2 = result.getArcs().size();
      System.out.println("LossyGrammarCompactor compacted " + category + " from " + numArcs1 + " to " + numArcs2 + " arcs");
    }
  }
  */

  private static <T> int numTokens(List<List<T>> paths) {
    int result = 0;
    for (List<T> path : paths) {
      result += path.size();
    }
    return result;
  }

  public void buildAndCompactToyGrammars() {
    // extract a bunch of paths
    System.out.print("Extracting other paths...");
    allTrainPaths = extractPaths(path, trainLow, trainHigh, true);

    TransducerGraph.NodeProcessor ntsp = new TransducerGraph.SetToStringNodeProcessor(new PennTreebankLanguagePack());
    TransducerGraph.NodeProcessor otsp = new TransducerGraph.ObjectToSetNodeProcessor();
    TransducerGraph.ArcProcessor isp = new TransducerGraph.InputSplittingProcessor();
    TransducerGraph.ArcProcessor ocp = new TransducerGraph.OutputCombiningProcessor();
    TransducerGraph.GraphProcessor normalizer = new TransducerGraph.NormalizingGraphProcessor(false);
    TransducerGraph.GraphProcessor quasiDeterminizer = new QuasiDeterminizer();
    AutomatonMinimizer exactMinimizer = new FastExactAutomatonMinimizer();

    for (String key : allTrainPaths.keySet()) {
      System.out.println("creating graph for " + key);
      List<List<String>> paths = allTrainPaths.get(key);
      ClassicCounter<List<String>> pathCounter = new ClassicCounter<>();
      for (List<String> o : paths) {
        pathCounter.incrementCount(o);
      }
      ClassicCounter<List<String>> newPathCounter = removeLowCountPaths(pathCounter, 2);
      paths.retainAll(newPathCounter.keySet()); // get rid of the low count ones
      TransducerGraph result = TransducerGraph.createGraphFromPaths(newPathCounter, 1000); // exact compaction
      int numArcs = result.getArcs().size();
      int numNodes = result.getNodes().size();
      if (numArcs == 0) {
        continue;
      }
      System.out.println("initial graph has " + numArcs + " arcs and " + numNodes + " nodes.");
      GrammarCompactor.writeFile(result, "unminimized", key);

      // do exact minimization
      result = normalizer.processGraph(result);         // normalize it so that exact minimization works properly
      result = quasiDeterminizer.processGraph(result);  // push probabilities left or down
      result = new TransducerGraph(result, ocp);        // combine outputs into inputs
      result = exactMinimizer.minimizeFA(result);       // minimize the thing
      result = new TransducerGraph(result, ntsp);       // pull out strings from sets returned by minimizer
      result = new TransducerGraph(result, isp);        // split outputs from inputs
      numArcs = result.getArcs().size();
      numNodes = result.getNodes().size();
      System.out.println("after exact minimization graph has " + numArcs + " arcs and " + numNodes + " nodes.");
      GrammarCompactor.writeFile(result, "exactminimized", key);

      // do additional lossy minimization
      /*
      NewLossyAutomatonCompactor compactor2 = new NewLossyAutomatonCompactor(paths, true);
      result = compactor2.compactFA(result);
      result = new TransducerGraph(result, ntsp); // pull out strings from sets returned by minimizer
      numArcs = result.getArcs().size();
      numNodes = result.getNodes().size();
      System.out.println("after lossy minimization graph has " + numArcs + " arcs and " + numNodes + " nodes.");
      GrammarCompactor.writeFile(result, "lossyminimized", key);
      */
    }
  }
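  // Minimal usage sketch, not part of the original class, for the
  // removeLowCountPaths() pruning step below. The paths are hypothetical;
  // counts below the threshold are dropped from the returned counter.
  private static void demoRemoveLowCountPaths() {
    ClassicCounter<List<String>> counts = new ClassicCounter<>();
    counts.incrementCount(Arrays.asList("DT", "NN"));       // count 1: pruned at threshold 2
    counts.incrementCount(Arrays.asList("DT", "JJ", "NN"));
    counts.incrementCount(Arrays.asList("DT", "JJ", "NN")); // count 2: retained
    ClassicCounter<List<String>> kept = removeLowCountPaths(counts, 2.0);
    System.out.println(kept.keySet()); // [[DT, JJ, NN]]
  }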
  private static ClassicCounter<List<String>> removeLowCountPaths(ClassicCounter<List<String>> paths, double thresh) {
    ClassicCounter<List<String>> result = new ClassicCounter<>();
    int numRetained = 0;
    for (List<String> path : paths.keySet()) {
      double count = paths.getCount(path);
      if (count >= thresh) {
        result.setCount(path, count);
        numRetained++;
      }
    }
    System.out.println("retained " + numRetained);
    return result;
  }

  public void testGrammarCompaction() {
    // these are for testing against the markov 3rd order baseline;
    // use the parser constructor to extract the grammars from the treebank.
    // op was configured in runTest(), so reuse it here rather than creating a
    // fresh Options, which would discard the parsed command-line settings
    LexicalizedParser lp = LexicalizedParser.trainFromTreebank(path, new NumberRangeFileFilter(trainLow, trainHigh, true), op);

    // compact grammars
    if (compactor != null) {
      // extract a bunch of paths
      Timing.startTime();
      System.out.print("Extracting other paths...");
      allTrainPaths = extractPaths(path, trainLow, trainHigh, true);
      allTestPaths = extractPaths(path, testLow, testHigh, true);
      Timing.tick("done");

      // compact grammars
      Timing.startTime();
      System.out.print("Compacting grammars...");
      Pair<UnaryGrammar, BinaryGrammar> grammar = Generics.newPair(lp.ug, lp.bg);
      Triple<Index<String>, UnaryGrammar, BinaryGrammar> compactedGrammar = compactor.compactGrammar(grammar, allTrainPaths, allTestPaths, lp.stateIndex);
      lp.stateIndex = compactedGrammar.first();
      lp.ug = compactedGrammar.second();
      lp.bg = compactedGrammar.third();
      Timing.tick("done.");
    }
    if (asciiOutputPath != null) {
      lp.saveParserToTextFile(asciiOutputPath);
    }

    // test it
    Treebank testTreebank = op.tlpParams.testMemoryTreebank();
    testTreebank.loadPath(path, new NumberRangeFileFilter(testLow, testHigh, true));
    System.out.println("Currently " + new Date());
    EvaluateTreebank evaluator = new EvaluateTreebank(lp);
    evaluator.testOnTreebank(testTreebank);
    System.out.println("Currently " + new Date());
  }

}

class StringUnaryRule {

  public String parent;
  public String child;
  public double score;

  @Override
  public boolean equals(Object o) {
    if (this == o) {
      return true;
    }
    if (!(o instanceof StringUnaryRule)) {
      return false;
    }
    final StringUnaryRule stringUnaryRule = (StringUnaryRule) o;
    if (score != stringUnaryRule.score) {
      return false;
    }
    if (child != null ? !child.equals(stringUnaryRule.child) : stringUnaryRule.child != null) {
      return false;
    }
    if (parent != null ? !parent.equals(stringUnaryRule.parent) : stringUnaryRule.parent != null) {
      return false;
    }
    return true;
  }

  @Override
  public int hashCode() {
    int result;
    long temp;
    result = (parent != null ? parent.hashCode() : 0);
    result = 29 * result + (child != null ? child.hashCode() : 0);
    temp = Double.doubleToLongBits(score);
    result = 29 * result + (int) (temp ^ (temp >>> 32));
    return result;
  }

  @Override
  public String toString() {
    return "UR:::::" + parent + ":::::" + child + ":::::" + score;
  }

  public StringUnaryRule(String parent, String child, double score) {
    this.parent = parent;
    this.child = child;
    this.score = score;
  }

}
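// Usage sketch (hypothetical values) for the string-keyed rule classes in this
// file: converting numbered rules to StringUnaryRule (above) or
// StringBinaryRule (below) makes rules from grammars with different state
// numberings comparable by equals()/hashCode() on category names and score.
//
//   StringUnaryRule a = new StringUnaryRule("NP", "NN", -1.5);
//   StringUnaryRule b = new StringUnaryRule("NP", "NN", -1.5);
//   assert a.equals(b) && a.hashCode() == b.hashCode();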
class StringBinaryRule {

  public String parent;
  public String leftChild;
  public String rightChild;
  public double score;

  @Override
  public boolean equals(Object o) {
    if (this == o) {
      return true;
    }
    if (!(o instanceof StringBinaryRule)) {
      return false;
    }
    final StringBinaryRule stringBinaryRule = (StringBinaryRule) o;
    if (score != stringBinaryRule.score) {
      return false;
    }
    if (leftChild != null ? !leftChild.equals(stringBinaryRule.leftChild) : stringBinaryRule.leftChild != null) {
      return false;
    }
    if (parent != null ? !parent.equals(stringBinaryRule.parent) : stringBinaryRule.parent != null) {
      return false;
    }
    if (rightChild != null ? !rightChild.equals(stringBinaryRule.rightChild) : stringBinaryRule.rightChild != null) {
      return false;
    }
    return true;
  }

  @Override
  public int hashCode() {
    int result;
    long temp;
    result = (parent != null ? parent.hashCode() : 0);
    result = 29 * result + (leftChild != null ? leftChild.hashCode() : 0);
    result = 29 * result + (rightChild != null ? rightChild.hashCode() : 0);
    temp = Double.doubleToLongBits(score);
    result = 29 * result + (int) (temp ^ (temp >>> 32));
    return result;
  }

  @Override
  public String toString() {
    return "BR:::::" + parent + ":::::" + leftChild + ":::::" + rightChild + ":::::" + score;
  }

  public StringBinaryRule(String parent, String leftChild, String rightChild, double score) {
    this.parent = parent;
    this.leftChild = leftChild;
    this.rightChild = rightChild;
    this.score = score;
  }

}