/** * */ package edu.berkeley.nlp.PCFGLA; import java.util.HashMap; import edu.berkeley.nlp.PCFGLA.BerkeleyParser.Options; import edu.berkeley.nlp.util.ArrayUtil; import edu.berkeley.nlp.util.Numberer; /** * @author petrov * */ public class GrammarAnalyzer { public static class Options { @Option(name = "-in", required = true, usage = "Grammarfile") public String grFileName; @Option(name = "-t", usage = "Threshold for pruning unlikely rules") public double threshold = -1; } public static void main(String[] args) { OptionParser optParser = new OptionParser(Options.class); Options opts = (Options) optParser.parse(args, true); String inFileName = opts.grFileName; ParserData pData = ParserData.Load(inFileName); if (pData==null) { System.out.println("Failed to load grammar from file"+inFileName+"."); System.exit(1); } Grammar grammar = pData.getGrammar(); Lexicon lexicon = pData.getLexicon(); Numberer.setNumberers(pData.getNumbs()); if (opts.threshold > -1){ System.out.println("Remving rules with probability below "+opts.threshold+"."); grammar.splitRules(); grammar.removeUnlikelyRules(opts.threshold,1.0); lexicon.removeUnlikelyTags(opts.threshold,1.0); } int[] tagTotal = new int[grammar.numSubStates.length]; int[] tagOne = new int[grammar.numSubStates.length]; int[] gr = computeAndPrintCounts(grammar, tagTotal, tagOne); printStats("Grammar", gr); int[] lex = computeAndPrintCounts(lexicon, tagTotal, tagOne); printStats("Lexicon", lex); ArrayUtil.addInPlace(gr,lex); printStats("Together",gr); System.out.println("\nTag specific statistics (x=1):"); Numberer tagNumberer = Numberer.getGlobalNumberer("tags"); for (int i=0; i<tagTotal.length; i++){ System.out.println(tagNumberer.object(i)+": "+tagOne[i]+"/"+tagTotal[i]); } System.exit(0); } public static int[] computeAndPrintCounts(Grammar gr, int[] tagTotal, int[] tagOne){ int nulledOut=0, zero=0, belowOne=0, one=0, aboveOne=0, total=0; int unaryEqualSlices=0, binaryEqualSlices=0, unarySlices=0, binarySlices=0; int unaryEqual=0, binaryEqual=0, unary=0, binary=0; for (int state=0; state<gr.numStates; state++){ int nParentSubStates = gr.numSubStates[state]; for (UnaryRule uRule : gr.getUnaryRulesByParent(state)){ int nChildSubStates = gr.numSubStates[uRule.childState]; total += nChildSubStates * nParentSubStates; unary++; double[][] scores = uRule.getScores2(); // boolean allTheSame=true; // double veryFirst = scores[0][0]; for (int j=0; j<nChildSubStates; j++){ unarySlices++; if (scores[j]==null){ nulledOut += nParentSubStates; continue; } boolean sliceTheSame=true; double first = scores[j][0]; for (int i=0; i<nParentSubStates; i++){ double p = scores[j][i]; if (p==0) zero++; else if (p==1) one++; else if (p>1) aboveOne++; else belowOne++; if (sliceTheSame&&p!=first) sliceTheSame=false; // if (allTheSame&&p!=veryFirst) allTheSame=false; tagTotal[state]++; if (p==1) tagOne[state]++; } if (sliceTheSame) unaryEqualSlices++; } // if (allTheSame) unaryEqual++; } for (BinaryRule bRule : gr.splitRulesWithP(state)){//gr.getBinaryRulesByParent(state)){ double[][][] scores = bRule.getScores2(); binary++; // boolean allTheSame=true; // double veryFirst = scores[0][0][0]; for (int j=0; j<scores.length; j++){ for (int k=0; k<scores[j].length; k++){ total += nParentSubStates; binarySlices++; if (scores[j][k]==null){ nulledOut += nParentSubStates; continue; } boolean sliceTheSame=true; double first = scores[j][k][0]; for (int i=0; i<nParentSubStates; i++){ double p = scores[j][k][i]; if (p==0) zero++; else if (p==1) one++; else if (p>1) aboveOne++; else belowOne++; if (sliceTheSame&&p!=first) sliceTheSame=false; // if (allTheSame&&p!=veryFirst) allTheSame=false; tagTotal[state]++; if (p==1) tagOne[state]++; } if (sliceTheSame) binaryEqualSlices++; } } // if (allTheSame) binaryEqual++; } } System.out.println("Same across parent: "+unaryEqualSlices+"/"+unarySlices+" (unary), "+binaryEqualSlices+"/"+binarySlices+" (binary)"); System.out.println("All the same: "+unaryEqual+"/"+unary+" (unary), "+binaryEqual+"/"+binary+" (binary)"); return new int[]{nulledOut,zero, belowOne, one, aboveOne, total}; } public static int[] computeAndPrintCounts(Lexicon lexicon, int[] tagTotal, int[] tagOne){ int zero=0, belowOne=0, one=0, aboveOne=0, total=0; int equal=0, pairs=0; if (lexicon instanceof SophisticatedLexicon){ SophisticatedLexicon lex = (SophisticatedLexicon)lexicon; for (short tag=0; tag<lex.numSubStates.length; tag++){ HashMap<String, double[]> tagMap = lex.wordToTagCounters[tag]; if (tagMap==null) continue; for (String word : tagMap.keySet()) { double[] lexiconScores = lex.score(word,tag,0,false,false); for (int i = 0; i < lexiconScores.length; i++) { total++; double p = lexiconScores[i]; if (p==0) zero++; else if (p==1) one++; else if (p>1) aboveOne++; else belowOne++; tagTotal[tag]++; if (p==1) tagOne[tag]++; } } } } else { SimpleLexicon lex = (SimpleLexicon)lexicon; for (int tag=0; tag<lex.scores.length; tag++){ if (lex.tagWordIndexer[tag].size()==0) continue; for (int word=0; word<lex.scores[tag][0].length; word++){ boolean allTheSame=true; double first = lex.scores[tag][0][word]; pairs++; for (int substate=0; substate<lex.numSubStates[tag]; substate++){ total++; double p = lex.scores[tag][substate][word]; if (p==0) zero++; else if (p==1) one++; else if (p>1) aboveOne++; else belowOne++; if (allTheSame&&p!=first) allTheSame=false; tagTotal[tag]++; if (p==1) tagOne[tag]++; } if (allTheSame) equal++; } } System.out.println("Same across parent: "+equal+"/"+pairs); } return new int[]{0,zero, belowOne, one, aboveOne, total}; } public static void printStats(String title, int[] vals){ int nulledOut=vals[0], zero=vals[1], belowOne=vals[2], one=vals[3], aboveOne=vals[4], total=vals[5]; System.out.println(title+" statistics:"); System.out.println("Total rules:\t"+total); System.out.println("Total non-zero:\t"+(belowOne+one+aboveOne+"\n")); System.out.println("Nulled out:\t"+nulledOut); System.out.println("x = 0:\t\t"+zero); System.out.println("0 < x < 1:\t"+belowOne); System.out.println("x = 1:\t\t"+one); System.out.println("x > 1:\t\t"+aboveOne+"\n"); } }