package edu.berkeley.nlp.PCFGLA;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import edu.berkeley.nlp.PCFGLA.ConditionalTrainer.Options;
import edu.berkeley.nlp.PCFGLA.Corpus.TreeBankType;
import edu.berkeley.nlp.PCFGLA.smoothing.SmoothAcrossParentBits;
import edu.berkeley.nlp.PCFGLA.smoothing.SmoothAcrossParentSubstate;
import edu.berkeley.nlp.discPCFG.HiearchicalAdaptiveLinearizer;
import edu.berkeley.nlp.discPCFG.Linearizer;
import edu.berkeley.nlp.syntax.StateSet;
import edu.berkeley.nlp.syntax.Tree;
import edu.berkeley.nlp.math.DoubleArrays;
import edu.berkeley.nlp.math.SloppyMath;
import edu.berkeley.nlp.util.*;
class FullState {
public short state;
public short substate;
/** A hack to make getting P(parent|child) easier.*/
public double score;
public FullState(short state, short substate) {
this.state = state;
this.substate = substate;
}
/**
* @param tagNumberer
* @return
*/
public String toString(Numberer tagNumberer) {
String w;
String name = tagNumberer.object(state)+"-"+substate;
w = "<a href="+GrammarStatistics.reflabel("productions",name)+">"+name+"</a> ";
return w;
}
/**
* @param tagNumberer
* @return
*/
public String toString(Numberer tagNumberer, String childFullName) {
String w;
String name = tagNumberer.object(state)+"-"+substate;
w = "<a href="+GrammarStatistics.reflabel("parentrules",childFullName+"*under*"+name)+">"+name+"</a> ";
return w;
}
public boolean equals(FullState s) {
return (state==s.state && substate==s.substate);
}
}
class SearchState {
public ArrayList<FullState> produced = new ArrayList<FullState>();
public FullState danglingState;
public double score;
public int insertPosition = 0;
FullState parent = null;
public boolean extended = false;
public SearchState (FullState danglingState, double score) {
this.danglingState = danglingState;
this.score = score;
}
public SearchState (FullState danglingState, FullState firstProduction, double score) {
this.danglingState = danglingState;
produced.add(firstProduction);
this.score = score;
}
public SearchState extend (FullState newProd, FullState newDangling, double scorePenalty, boolean left) {
SearchState s = new SearchState(newDangling,score + scorePenalty);
s.produced = new ArrayList<FullState>(produced);
s.produced.add(insertPosition,newProd);
s.insertPosition = insertPosition + (left ? 0 : 1);
return s;
}
public String toString(Numberer tagNumberer) {
String w="";
if (parent!=null) {
String name = tagNumberer.object(parent.state)+"-"+parent.substate;
w += "<a href="+GrammarStatistics.reflabel("productions",name)+">"+name+"</a> -> ";
}
for (FullState s : produced) {
String name = tagNumberer.object(s.state)+"-"+s.substate;
w += "<a href="+GrammarStatistics.reflabel("productions",name)+">"+name+"</a> ";
}
return w;
}
/**
* @param rs
* @param ps
* @param rscore
* @param b
* @return
*/
public SearchState extendUp(FullState cs, FullState ps, double rscore, boolean thisChildOnLeft) {
SearchState s = new SearchState(ps,score + rscore);
s.produced = new ArrayList<FullState>(produced);
if (cs!=null) {
if (thisChildOnLeft)
s.produced.add(0,cs);
else
s.produced.add(produced.size(),cs);
}
s.extended = true;
return s;
}
}
public class GrammarStatistics {
private static int topN = 10;
public GrammarStatistics (Grammar grammar, Numberer tagNumberer, int nScores) {
this.grammar = grammar;
this.tagNumberer = tagNumberer;
this.nScores = nScores;
}
public Grammar grammar;
public Numberer tagNumberer;
public int nScores;
/** Find the best nScores productions by doing breadth-first search.
*
* @param p
* @param nScores
* @return
*/
PriorityQueue<SearchState> getTopProductions(FullState p) {
PriorityQueue<SearchState> results = new PriorityQueue<SearchState>(nScores+1);
PriorityQueue<SearchState> unExpanded = new PriorityQueue<SearchState>();
unExpanded.add(new SearchState(p,0),0);
while ( unExpanded.size()!=0 && (results.size()<nScores || unExpanded.peek().score > -results.peek().score) ) {
//expand best-looking SearchState so far
SearchState state = unExpanded.next();
//accept complete productions
if (state.danglingState==null || (state.produced.size()!=0 && !continues(state.danglingState.state))) {
if (state.danglingState!=null)
state = state.extend(state.danglingState,null,0,false);
results.add(state,-state.score);
if (results.size()>nScores)
results.next();
}
//try to complete partial productions
else {
for (UnaryRule rule: grammar.getUnaryRulesByParent(state.danglingState.state)) {
double[][] scores = rule.getScores2();
for (short cSubState = 0; cSubState < grammar.numSubStates[rule.getChildState()]; cSubState++) {
if (scores[cSubState]==null) continue;
double rscore = scores[cSubState][state.danglingState.substate];
FullState s = new FullState(rule.getChildState(),cSubState);
SearchState newState = state.extend(s,null,rscore,false);
unExpanded.add(newState,newState.score);
}
}
for (BinaryRule rule : grammar.splitRulesWithP(state.danglingState.state)){//getBinaryRulesByParent(state.danglingState.state)) {
double[][][] scores = rule.getScores2();
for (short lSubState = 0; lSubState < grammar.numSubStates[rule.getLeftChildState()]; lSubState++) {
FullState ls = new FullState(rule.getLeftChildState(),lSubState);
for (short rSubState = 0; rSubState < grammar.numSubStates[rule.getRightChildState()]; rSubState++) {
if (scores[lSubState][rSubState]==null) continue;
FullState rs = new FullState(rule.getRightChildState(),rSubState);
SearchState newState;
double rscore = scores[lSubState][rSubState][state.danglingState.substate];
if (continues(ls.state)) {
newState = state.extend(rs,ls,rscore,true);
} else {
newState = state.extend(ls,rs,rscore,false);
}
unExpanded.add(newState,newState.score);
}
}
}
}
}
return results;
}
/** Find the best nScores productions by doing breadth-first search.
*
* @param p
* @param nScores
* @return
*/
PriorityQueue<SearchState> getTopParentRuleProductions(FullState c,
double[] probState, double[][] probSubGivenState) {
PriorityQueue<SearchState> results = new PriorityQueue<SearchState>(nScores+1);
PriorityQueue<SearchState> unExpanded = new PriorityQueue<SearchState>();
double score = -(probState[c.state]+probSubGivenState[c.state][c.substate]);
unExpanded.add(new SearchState(c,c,score),-score);
int maxSize = 10000;
while (unExpanded.size() != 0
&& unExpanded.size() < maxSize
&& (results.size() < nScores || unExpanded.peek().score > -results
.peek().score)) {
//expand best-looking SearchState so far
SearchState state = unExpanded.next();
//accept complete productions
if (state.danglingState==null || (state.extended && !continues(state.danglingState.state))) {
if (state.danglingState!=null)
state.parent = state.danglingState;
state.score += probState[state.parent.state]
+ probSubGivenState[state.parent.state][state.parent.substate];
results.add(state,-state.score);
if (results.size()>nScores)
results.next();
}
//try to complete partial productions
else {
for (UnaryRule rule: grammar.getUnaryRulesByChild(state.danglingState.state)) {
double[][] scores = rule.getScores2();
if (scores[state.danglingState.substate]==null) continue;
for (short pSubState = 0; pSubState < grammar.numSubStates[rule.getParentState()]; pSubState++) {
double rscore = scores[state.danglingState.substate][pSubState];
FullState s = new FullState(rule.getParentState(),pSubState);
SearchState newState = state.extendUp(null,s,rscore,false);
unExpanded.add(newState,newState.score);
}
}
for (BinaryRule rule : grammar.splitRulesWithLC(state.danglingState.state)){//getBinaryRulesByLeftChild(state.danglingState.state)) {
double[][][] scores = rule.getScores2();
for (short pSubState = 0; pSubState < grammar.numSubStates[rule.getParentState()]; pSubState++) {
FullState ps = new FullState(rule.getParentState(),pSubState);
for (short rSubState = 0; rSubState < grammar.numSubStates[rule.getRightChildState()]; rSubState++) {
if (scores[state.danglingState.substate][rSubState]==null) continue;
FullState rs = new FullState(rule.getRightChildState(),rSubState);
SearchState newState;
double rscore = scores[state.danglingState.substate][rSubState][pSubState];
newState = state.extendUp(rs,ps,rscore,false);
unExpanded.add(newState,newState.score);
}
}
}
for (BinaryRule rule : grammar.splitRulesWithRC(state.danglingState.state)){//getBinaryRulesByRightChild(state.danglingState.state)) {
double[][][] scores = rule.getScores2();
for (short pSubState = 0; pSubState < grammar.numSubStates[rule.getParentState()]; pSubState++) {
FullState ps = new FullState(rule.getParentState(),pSubState);
for (short lSubState = 0; lSubState < grammar.numSubStates[rule.getLeftChildState()]; lSubState++) {
if (scores[lSubState][state.danglingState.substate]==null) continue;
FullState rs = new FullState(rule.getLeftChildState(),lSubState);
SearchState newState;
double rscore = scores[lSubState][state.danglingState.substate][pSubState];
newState = state.extendUp(rs,ps,rscore,true);
unExpanded.add(newState,newState.score);
}
}
}
}
}
return results;
}
public boolean continues(short state) {
return ((String)tagNumberer.object(state)).charAt(0)=='@';
}
public static String pad(String s, int width, char c) {
StringBuffer sb = new StringBuffer(s);
for (int i=s.length(); i<width; i++)
sb.append(c);
return sb.toString();
}
static NumberFormat f = NumberFormat.getInstance();
public static class Options {
@Option(name = "-in", usage = "Input File for Grammar")
public String in;
@Option(name = "-out", usage = "Output File")
public String out;
@Option(name = "-path", usage = "Path to Corpus")
public String path = null;
@Option(name = "-treebank", usage = "Language: WSJ, CHNINESE, GERMAN, CONLL, SINGLEFILE (Default: ENGLISH)")
public TreeBankType treebank = TreeBankType.WSJ;
@Option(name = "-unkT", usage = "Unknown word threshold")
public int unkT = 1;
@Option(name = "-maxL", usage = "Maximum sentence length")
public int maxL = 40;
}
@SuppressWarnings("unchecked")
public static void main(String[] args) {
OptionParser optParser = new OptionParser(Options.class);
Options opts = (Options) optParser.parse(args, false);
// provide feedback on command-line arguments
System.out.println("Calling GrammarStatistics with " + optParser.getPassedInOptions());
f.setMaximumFractionDigits(5);
System.out.println("<html><body>");
System.out.println("<h1>Links</h1><ul>");
System.out.println("<li><a href=\"#lexicon\">Lexicon</a></li>");
System.out.println("<li><a href=\"#grammar\">Grammar</a></li>");
System.out.println("<li><a href=\"#trunks\">Trunks</a></li>");
System.out.println("<li><a href=\"#parents\">Parents</a></li>");
System.out.println("<li><a href=\"#parentrules\">Parent Rules</a></li>");
System.out.println("</ul>");
System.out.println("<!--");
String inFileName = opts.in;
String outName = opts.out;
System.out.println("Loading grammar from " + inFileName + ".");
String wsjLoc = opts.path;
boolean columnOutput = true;
ParserData pData = ParserData.Load(inFileName);
if (pData == null) {
System.out.println("Failed to load grammar from file" + inFileName + ".");
System.exit(1);
}
Grammar grammar = pData.getGrammar();
Lexicon lexicon = pData.getLexicon();
Numberer.setNumberers(pData.getNumbs());
Numberer tagNumberer = Numberer.getGlobalNumberer("tags");
grammar.splitRules();
// if (thresh>0){
// grammar.removeUnlikelyRules(thresh,1.0);
// lexicon.removeUnlikelyTags(thresh,1.0);
// }
pData.Save(outName+".gr");
System.out.println("Writing grammar to file grammar.data...");
Writer output = null;
try {
output = new BufferedWriter(new FileWriter(outName+".grammar"));
//output.write(grammar.toString());
grammar.writeData(output);
if (output != null) output.close();
output = new BufferedWriter(new FileWriter(outName+".lexicon"));
output.write(lexicon.toString());
if (output != null) output.close();
} catch (IOException ex) { ex.printStackTrace();}
//put grammar and lexicon in logarithm mode so that we can
//use our old code below
pData = ParserData.Load(inFileName);
if (pData == null) {
System.out.println("Failed to load grammar from file" + inFileName + ".");
System.exit(1);
}
grammar = pData.getGrammar();
grammar.splitRules();
lexicon = pData.getLexicon();
// grammar.logarithmMode();
// lexicon.logarithmMode();
// computeAndPrintCounts(grammar);
//reload the grammar and lexicon because the ones we have now are in
//logarithm mode, and we can't do inside/outside scores like that
ParserData pDataNoLog = ParserData.Load(inFileName);
if (pDataNoLog == null) {
System.out.println("Failed to load grammar from file" + inFileName + ".");
System.exit(1);
}
Grammar nonLogGrammar = pDataNoLog.getGrammar();
Lexicon nonLogLexicon = pDataNoLog.getLexicon();
SpanPredictor spanPredictor = pDataNoLog.getSpanPredictor();
ArrayParser parser = new ArrayParser(nonLogGrammar,nonLogLexicon);
// computeAndPrintCounts(grammar);
System.out.println("-->");
Corpus corpus = new Corpus(wsjLoc,opts.treebank,1.0,false);
List<Tree<String>> trainTrees = Corpus.binarizeAndFilterTrees(corpus
.getTrainTrees(), pData.getV_markov(), pData.getH_markov(),
opts.maxL, pData.getBinarization(), false, false);
trainTrees = Corpus.filterTreesForConditional(trainTrees, false,false,false);
StateSetTreeList trainStateSetTrees = new StateSetTreeList(trainTrees, nonLogGrammar.numSubStates, false, tagNumberer);
int padding = 3;
topN = 30;
printLexiconStatistics(lexicon, tagNumberer,grammar.isGrammarTag,grammar, trainStateSetTrees, opts);
GrammarStatistics gs = new GrammarStatistics(grammar,tagNumberer, topN);
// determine which tags need to be examined.
// Continuation tags and lexical tags are excluded
Set<Short> noContinueTags = new HashSet<Short>();
Set<Short> continueTags = new HashSet<Short>();
for (short i=0; i<tagNumberer.total(); i++) {
if (!grammar.isGrammarTag[i]) continue;
if (!gs.continues(i))
noContinueTags.add(i);
else
continueTags.add(i);
}
printGrammarStatistics(columnOutput, pData, tagNumberer, topN, gs, noContinueTags);
printTrunkStatistics(columnOutput, tagNumberer, padding, topN, gs, continueTags);
System.out.println("<!--");
System.out.println("-->");
Set<Short> allRealTags = new HashSet<Short>(noContinueTags);
for (short i=0; i<grammar.numSubStates.length; i++){
if (!grammar.isGrammarTag[i]) allRealTags.add(i);
}
double[] probState = new double[grammar.numStates];
double[][] probSubGivenState = new double[grammar.numStates][];
for (int state=0; state<grammar.numStates; state++) {
probSubGivenState[state] = new double[grammar.numSubStates[state]];
}
for (Tree<StateSet> tree : trainStateSetTrees) {
// System.out.println("adding probs for tree "+nTree+" / "+trainStateSetTrees.size());
parser.doInsideOutsideScores(tree,false,true);
tallyProbState(tree,probState,allRealTags);
tallyProbSubState(tree,probSubGivenState,allRealTags);
}
for (int state=0; state<grammar.numStates; state++) {
double sum = 0;
for (int substate=0; substate<grammar.numSubStates[state]; substate++) {
sum += probSubGivenState[state][substate];
}
for (int substate=0; substate<grammar.numSubStates[state]; substate++) {
probSubGivenState[state][substate] = Math.log(probSubGivenState[state][substate] / sum);
}
}
double sumState = 0;
for (int state=0; state<grammar.numStates; state++) {
sumState += probState[state];
}
for (int state=0; state<grammar.numStates; state++) {
probState[state] = Math.log(probState[state] / sumState);
}
printParentRuleStatistics(columnOutput,pData,tagNumberer,topN,gs,allRealTags,probState,probSubGivenState);
printParentStatistics(columnOutput, grammar, tagNumberer, nonLogGrammar, nonLogLexicon, topN, gs, trainTrees, parser);
System.out.println("</body></html>");
}
private static void tallyProbSubState(Tree<StateSet> tree,
double[][] probSubGivenState, Set<Short> noContinueTags) {
tallyProbSubStateHelper(tree,tree.getLabel().getIScore(0),
probSubGivenState,noContinueTags);
}
/**
* @param tree
* @param probSubGivenState
*/
private static void tallyProbSubStateHelper(Tree<StateSet> tree,
double treeProb, double[][] probSubGivenState,
Set<Short> tags) {
if (tree.isLeaf())
return;
StateSet label = tree.getLabel();
short state = label.getState();
if (tags.contains(state)) {
double[] iScores = label.getIScores();
double[] oScores = label.getOScores();
double[] scores = new double[iScores.length];
double sum = 0;
for (int substate=0; substate<iScores.length; substate++) {
scores[substate] = (iScores[substate] / treeProb) * oScores[substate];
sum += scores[substate];
}
for (int substate=0; substate<iScores.length; substate++) {
scores[substate] /= sum;
probSubGivenState[state][substate] += scores[substate];
}
}
for (Tree<StateSet> child : tree.getChildren())
tallyProbSubStateHelper(child,treeProb,probSubGivenState,tags);
}
/**
* Count occurrences of each state. Ignore states that start with "@".
*
* @param tree
* @param probState
*/
private static void tallyProbState(Tree<StateSet> tree, double[] probState, Set<Short> tags) {
if (tree.isLeaf())
return;
short state = tree.getLabel().getState();
if (tags.contains(state))
probState[state] += 1;
for (Tree<StateSet> child : tree.getChildren())
tallyProbState(child,probState,tags);
}
/**
* @param columnOutput
* @param grammar
* @param tagNumberer
* @param nonLogGrammar
* @param nonLogLexicon
* @param topN
* @param gs
* @param trainTrees
*/
private static FullState[][] printParentStatistics(boolean columnOutput, Grammar grammar, Numberer tagNumberer, Grammar nonLogGrammar, Lexicon nonLogLexicon, int topN, GrammarStatistics gs, List<Tree<String>> trainTrees, ArrayParser parser) {
System.out.println("<a name=\"parents\"><h1>Parents</h1></a>");
System.out.println("<!--");
int nstates = grammar.numStates;
double[][][][] parentProbs = new double[nstates][nstates][][];
double[][] normFactors = new double[nstates][];
FullState[][] parents = new FullState[grammar.numStates][];
for (int state=0; state<nstates; state++) {
normFactors[state] = new double[grammar.numSubStates[state]];
}
StateSetTreeList trainStateSetTrees = new StateSetTreeList(trainTrees,
grammar.numSubStates, false, tagNumberer);
/*CorpusStatistics corpusStats = new CorpusStatistics(tagNumberer, trainStateSetTrees);
corpusStats.countSymbols();
int counts[] = corpusStats.getSymbolCounts();
corpusStats.printStateCountArray(tagNumberer,counts);*/
int nTree = 0;
System.out.print("Adding probabilities");
for (Tree<StateSet> tree : trainStateSetTrees) {
// System.out.println("adding probs for tree "+nTree+" / "+trainStateSetTrees.size());
parser.doInsideOutsideScores(tree,false,true);
logarithmModeTree(tree);
gs.addProbs(tree, grammar, parentProbs, normFactors, tree.getLabel()
.getIScore(0));
if (nTree++%1000==0) System.out.print(".");
}
System.out.print("done.\n");
System.out.println("-->");
for (short childState=0; childState<nstates; childState++) {
String[][] outputMatrix = new String[topN+1][grammar.numSubStates[childState]];
String tagName = (String) tagNumberer.object(childState);
for (short cS=0; cS<grammar.numSubStates[childState]; cS++) {
String childFullName = outputMatrix[0][cS] = tagName + "-" + cS;
PriorityQueue<FullState> results = new PriorityQueue<FullState>(topN+1);
for (short parentState=0; parentState<nstates; parentState++) {
double[][] probs = parentProbs[parentState][childState];
if (probs==null)
continue;
double normFactor = normFactors[childState][cS];
for (short pS=0; pS<grammar.numSubStates[parentState]; pS++) {
//find max rules
double score = probs[pS][cS] / normFactor;
if (!results.isEmpty() && score < -results.getPriority())
continue;
FullState state = new FullState(parentState,pS);
state.score = score;
results.add(state,-state.score);
if (results.size()>topN)
results.next();
}
}
ArrayList<FullState> resultsA = new ArrayList<FullState>(topN);
while (results.size()!=0) {
resultsA.add(0,results.next());
}
parents[childState] = new FullState[resultsA.size()];
for (short j = 0; j < topN; j++){
String o="";
double p=-1;
if (resultsA.size()>j) {
parents[childState][j] = resultsA.get(j);
p = resultsA.get(j).score;
String w = resultsA.get(j).toString(tagNumberer,childFullName);
o = f.format(p)+" "+w;
}
outputMatrix[j+1][cS] = o;
}
}
printRules("Parent", "parent", columnOutput, outputMatrix);
}
return parents;
}
/**
* @param columnOutput
* @param tagNumberer
* @param padding
* @param topN
* @param gs
* @param continueTags
*/
private static void printTrunkStatistics(boolean columnOutput, Numberer tagNumberer, int padding, int topN, GrammarStatistics gs, Set<Short> continueTags) {
System.out.println("<a name=\"trunks\"><h1>Trunks</h1></a>");
//output trunk rule probabilities
for (short tag : continueTags) {
String tagS = ((String)tagNumberer.object(tag)).substring(1);
short parentTag = (short)tagNumberer.number(tagS);
gs.printTopRules(parentTag, topN, columnOutput, padding);
gs.printTopRules(tag, topN, columnOutput, padding);
System.out.println("");
}
}
/**
* @param columnOutput
* @param pData
* @param tagNumberer
* @param topN
* @param gs
* @param noContinueTags
*/
private static void printGrammarStatistics(boolean columnOutput, ParserData pData, Numberer tagNumberer, int topN, GrammarStatistics gs, Set<Short> noContinueTags) {
System.out.println("<a name=\"grammar\"><h1>Grammar</h1></a>");
System.out.println("<div id=\"grammar\">");
// print rule probabilities
for (short curTag : noContinueTags){
int nSubStates = pData.numSubStatesArray[curTag];
ArrayList<SearchState>[] results = new ArrayList[nSubStates];
for (short i = 0; i < nSubStates; i++) {
//do heavy computation
PriorityQueue<SearchState> pq = gs.getTopProductions(new FullState(curTag,i));
//convert pq to array
results[i] = new ArrayList<SearchState>(topN);
while (pq.size()!=0) {
pq.peek().score = Math.exp(pq.peek().score);
results[i].add(0,pq.next());
}
}
String[][] outputMatrix = new String[topN+1][nSubStates];
String tagName = (String) tagNumberer.object(curTag);
for (int i = 0; i < nSubStates; i++) {
outputMatrix[0][i] = tagName + "-" + i;
}
for (int j = 0; j < topN; j++){
for (int i = 0; i < nSubStates; i++) {
String o="";
double p=-1;
if (results[i].size()>j) {
p = results[i].get(j).score;
String w = results[i].get(j).toString(tagNumberer);
o = f.format(p)+" "+w;
}
outputMatrix[j+1][i] = o;
}
}
printRules("Grammar","productions", columnOutput, outputMatrix);
}
System.out.println("</div>");
}
/**
* @param columnOutput
* @param pData
* @param tagNumberer
* @param topN
* @param gs
* @param noContinueTags
*/
private static void printParentRuleStatistics(boolean columnOutput, ParserData pData, Numberer tagNumberer, int topN, GrammarStatistics gs, Set<Short> noContinueTags,
double[] probState, double[][] probSubGivenState) {
System.out.println("<a name=\"parentrules\"><h1>Parent Rules</h1></a>");
// print rule probabilities
for (short curTag : noContinueTags){
int nSubStates = pData.numSubStatesArray[curTag];
ArrayList<SearchState>[] results = new ArrayList[nSubStates];
for (short i = 0; i < nSubStates; i++) {
//do heavy computation
PriorityQueue<SearchState> pq = gs.getTopParentRuleProductions(new FullState(curTag,i),probState,probSubGivenState);
//convert pq to array
results[i] = new ArrayList<SearchState>(topN);
while (pq.size()!=0) {
pq.peek().score = Math.exp(pq.peek().score);
results[i].add(0,pq.next());
}
}
String[][] outputMatrix = new String[topN+1][nSubStates];
String tagName = (String) tagNumberer.object(curTag);
for (int i = 0; i < nSubStates; i++) {
outputMatrix[0][i] = tagName + "-" + i;
}
for (int j = 0; j < topN; j++){
for (int i = 0; i < nSubStates; i++) {
String o="";
double p=-1;
if (results[i].size()>j) {
p = results[i].get(j).score;
String w = results[i].get(j).toString(tagNumberer);
o = f.format(p)+" "+w;
}
outputMatrix[j+1][i] = o;
}
}
printRules("Parent Rules","parentrules", columnOutput, outputMatrix);
}
}
/**
* @param tree
*/
private static void logarithmModeTree(Tree<StateSet> tree) {
if (tree.isLeaf())
return;
double[] iScores = tree.getLabel().getIScores();
int iScale = tree.getLabel().getIScale();
double[] oScores = tree.getLabel().getOScores();
int oScale = tree.getLabel().getOScale();
for (int i=0; i<iScores.length; i++) {
iScores[i] = Math.log(iScores[i]) + 100*iScale;
oScores[i] = Math.log(oScores[i]) + 100*oScale;
}
tree.getLabel().setIScores(iScores);
tree.getLabel().setOScores(oScores);
for (Tree child : tree.getChildren()) {
logarithmModeTree(child);
}
}
/**
* @param tree
* @param g
* @param parentProbs indexed by parent, child, psub, csub
*/
private void addProbs(Tree<StateSet> tree, Grammar g,
double[][][][] parentProbs, double[][] normFactors,
double treeScore) {
int nSubStates = tree.getLabel().numSubStates();
double[][] viterbiProbs = new double[nSubStates][nSubStates];
for (int i=0; i<viterbiProbs.length; i++) {
for (int j=0; j<viterbiProbs[i].length; j++) {
if (i!=j) {
viterbiProbs[i][j] = Double.NEGATIVE_INFINITY;
} else {
viterbiProbs[i][j] = tree.getLabel().getOScore(i)
- treeScore;
}
}
}
addProbsHelper(tree.getLabel().getState(),tree,g,parentProbs,normFactors,viterbiProbs,treeScore);
}
/**
* @param tree
* @param g
* @param parentProbs
* @param viterbiProbs
*/
private void addProbsHelper(short gpState, Tree<StateSet> tree, Grammar g,
double[][][][] parentProbs, double[][] normFactor, double[][] viterbiProbs, double treeScore) {
if (tree.isPreTerminal() || tree.isLeaf())
return;
short pState = tree.getLabel().getState();
int nParentStates = tree.getLabel().numSubStates();
List<Tree<StateSet>> children = tree.getChildren();
switch(children.size()) {
case 1:
Tree<StateSet> child = children.get(0);
short cState = child.getLabel().getState();
double[][] scores = g.getUnaryScore(pState,cState);
int nChildStates = child.getLabel().numSubStates();
double[][] newViterbiProbs = new double[viterbiProbs.length][nChildStates];
for (int gpS=0; gpS<viterbiProbs.length; gpS++) {
for (int cS=0; cS<nChildStates; cS++) {
if (scores[cS]==null)
continue;
double[] scoresToSum = new double[nParentStates];
for (int pS=0; pS<nParentStates; pS++) {
scoresToSum[pS] = viterbiProbs[gpS][pS] + scores[cS][pS];
}
newViterbiProbs[gpS][cS] = SloppyMath.logAdd(scoresToSum);
}
}
if (continues(cState)) {
addProbsHelper(gpState,child,g,parentProbs,normFactor,newViterbiProbs, treeScore);
} else {
addProbsFinal(child,gpState,cState,parentProbs,normFactor,newViterbiProbs);
addProbs(child,g,parentProbs,normFactor,treeScore);
}
break;
case 2:
Tree<StateSet> lChild = children.get(0);
Tree<StateSet> rChild = children.get(1);
short lcState = lChild.getLabel().getState();
short rcState = rChild.getLabel().getState();
double[][][] scoresB = g.getBinaryScore(pState,lcState,rcState);
int nLChildStates = lChild.getLabel().numSubStates();
int nRChildStates = rChild.getLabel().numSubStates();
double[][] newLViterbiProbs = new double[viterbiProbs.length][nLChildStates];
double[][] newRViterbiProbs = new double[viterbiProbs.length][nRChildStates];
for (int gpS=0; gpS<viterbiProbs.length; gpS++) {
double[][] lScoresToSum = new double[nLChildStates][nParentStates*nRChildStates];
double[][] rScoresToSum = new double[nRChildStates][nParentStates*nLChildStates];
for (int lcS=0; lcS<nLChildStates; lcS++) {
for (int rcS=0; rcS<nRChildStates; rcS++) {
if (scoresB[lcS][rcS]==null)
continue;
for (int pS=0; pS<nParentStates; pS++) {
double vp = viterbiProbs[gpS][pS];
double sc = scoresB[lcS][rcS][pS];
lScoresToSum[lcS][pS * nRChildStates + rcS] = vp
+ sc + rChild.getLabel().getIScore(rcS);
rScoresToSum[rcS][pS * nLChildStates + lcS] = vp
+ sc + lChild.getLabel().getIScore(lcS);
}
}
}
for (int lcS=0; lcS<nLChildStates; lcS++) {
newLViterbiProbs[gpS][lcS] = SloppyMath.logAdd(lScoresToSum[lcS]);
}
for (int rcS=0; rcS<nRChildStates; rcS++) {
newRViterbiProbs[gpS][rcS] = SloppyMath.logAdd(rScoresToSum[rcS]);
}
}
if (continues(lcState)) {
addProbsHelper(gpState,lChild,g,parentProbs,normFactor,newLViterbiProbs, treeScore);
} else {
addProbsFinal(lChild,gpState,lcState,parentProbs,normFactor,newLViterbiProbs);
addProbs(lChild,g,parentProbs,normFactor,treeScore);
}
if (continues(rcState)) {
addProbsHelper(gpState,rChild,g,parentProbs,normFactor,newRViterbiProbs,treeScore);
} else {
addProbsFinal(rChild,gpState,rcState,parentProbs,normFactor,newRViterbiProbs);
addProbs(rChild,g,parentProbs,normFactor,treeScore);
}
break;
}
}
/**
* @param gpState
* @param state
* @param parentProbs
* @param newViterbiProbs
*/
private void addProbsFinal(Tree<StateSet> child, short gpState, short cState,
double[][][][] parentProbs, double[][] normFactor, double[][] viterbiProbs) {
for (int gpS=0; gpS<viterbiProbs.length; gpS++) {
for (int cS=0; cS<viterbiProbs[gpS].length; cS++) {
viterbiProbs[gpS][cS] = Math.exp(viterbiProbs[gpS][cS]
+ child.getLabel().getIScore(cS));
}
}
if (parentProbs[gpState][cState]==null) {
parentProbs[gpState][cState] = new double[viterbiProbs.length][viterbiProbs[0].length];
}
double[][] parentProbsCC = parentProbs[gpState][cState];
for (int gpS=0; gpS<viterbiProbs.length; gpS++) {
for (int cS=0; cS<viterbiProbs[gpS].length; cS++) {
parentProbsCC[gpS][cS] += viterbiProbs[gpS][cS];
normFactor[cState][cS] += viterbiProbs[gpS][cS];
}
}
}
static class RuleStruct {
public Rule r;
public double score;
public int pS;
public int lS;
public int rS;
boolean binary;
public RuleStruct(Rule r, double score, int pS, int lS, int rS) {
this.r = r;
this.score = score;
this.pS = pS;
this.lS = lS;
this.rS = rS;
this.binary = true;
}
public RuleStruct(Rule r, double score, int pS, int lS) {
this.r = r;
this.score = score;
this.pS = pS;
this.lS = lS;
this.rS = -1;
this.binary = false;
}
}
/**
* Print the top topN rules starting from symbol tag.
*
* @param tag
*/
private void printTopRules(short tag, int topN, boolean columnOutput, int padding) {
String[][] outputMatrix = new String[topN+1][grammar.numSubStates[tag]];
for (int i=0; i<outputMatrix.length; i++) {
for (int j=0; j<outputMatrix[i].length; j++) {
outputMatrix[i][j] = "";
}
}
for (int subState = 0; subState < grammar.numSubStates[tag]; subState++) {
outputMatrix[0][subState] = (String) tagNumberer.object(tag) + "-" + subState;
//hold top rules in reverse score order
PriorityQueue<RuleStruct> topRules = new PriorityQueue<RuleStruct>();
for (BinaryRule r : grammar.splitRulesWithP(tag)){//getBinaryRulesByParent(tag)) {
for (int lSubState = 0; lSubState < grammar.numSubStates[r.getLeftChildState()]; lSubState++) {
for (int rSubState = 0; rSubState < grammar.numSubStates[r.getRightChildState()]; rSubState++) {
double score = r.getScore(subState,lSubState,rSubState);
topRules.add(new RuleStruct(r,score,subState,lSubState,rSubState),-score);
if (topRules.size() > topN)
//remove worst rule
topRules.next();
}
}
}
for (UnaryRule r : grammar.getUnaryRulesByParent(tag)) {
for (int cSubState = 0; cSubState < grammar.numSubStates[r.getChildState()]; cSubState++) {
double score = r.getScore(subState,cSubState);
topRules.add(new RuleStruct(r,score,subState,cSubState),-score);
if (topRules.size() > topN)
//remove worst rule
topRules.next();
}
}
ArrayList<RuleStruct> r = new ArrayList<RuleStruct>();
while (topRules.hasNext()) {
RuleStruct s = topRules.next();
r.add(0,s);
}
for (int i=0; i<r.size(); i++){
outputMatrix[i+1][subState] = ruleToString(r.get(i));
}
}
String tagName = (String)tagNumberer.object(tag);
printRules("Trunk","topShortRules",columnOutput,outputMatrix);
}
public String ruleToString(RuleStruct r) {
StringBuffer sB = new StringBuffer();
sB.append(f.format(Math.exp(r.score)) + " ");
if (r.binary) {
BinaryRule b = (BinaryRule)r.r;
String leftName = tagNumberer.object(b.leftChildState)+"-"+r.lS;
String rightName = tagNumberer.object(b.rightChildState)+"-"+r.rS;
sB.append("<a href="+reflabel("productions",leftName)+">"+leftName+"</a> ");
sB.append("<a href="+reflabel("productions",rightName)+">"+rightName+"</a> ");
} else {
UnaryRule u = (UnaryRule)r.r;
String childName = tagNumberer.object(u.childState)+"-"+r.lS;
sB.append("<a href="+reflabel("productions",childName)+">"+childName+"</a> ");
}
return sB.toString();
}
/**
* @param columnOutput
* @param padding
* @param outputMatrix
*/
private static void printRules(String typeName, String ruleTypeName,
boolean columnOutput, String[][] outputMatrix) {
System.out.println("<h3>"+typeName+"</h3><table border=\"1\">");
if (columnOutput) {
for (int i = 0; i < outputMatrix.length; i++){
System.out.println("<tr>");
for (int j = 0; j < outputMatrix[0].length; j++) {
if (i==0) {
System.out.println("<th><a name="+label(ruleTypeName,outputMatrix[i][j])+"> <a href="+
parentRefLabel(outputMatrix[i][j])+">");
System.out.print(outputMatrix[i][j]);
System.out.println("</a></a> (<a href="+label("parent",outputMatrix[i][j])+">p</a>)</th>");
} else
System.out.print("<td>"+sanitize(outputMatrix[i][j])+"</td>");
}
System.out.println("</tr>");
}
} else {
for (int j = 0; j < outputMatrix[0].length; j++) {
System.out.println("<tr>");
for (int i = 0; i < outputMatrix.length; i++){
if (j==0) {
System.out.println("<th><a name="+label(ruleTypeName,outputMatrix[i][j])+"> <a href="+
parentRefLabel(outputMatrix[i][j])+">");
System.out.print(outputMatrix[i][j]);
System.out.println("</a></a></th>");
} else
System.out.print("<td>"+sanitize(outputMatrix[i][j])+"</td>");
}
System.out.println("</tr>");
}
}
System.out.println("</table><br/>");
}
public static int maxWidthInRow(String[][] m,int row) {
int l=0;
for (int c=0; c<m[row].length; c++) {
l = Math.max(l,m[row][c].length());
}
return l;
}
public static int maxWidthInCol(String[][] m,int col) {
int l=0;
for (int r=0; r<m.length; r++) {
l = Math.max(l,m[r][col].length());
}
return l;
}
public static void computeAndPrintCounts(Grammar gr){
int nUnaries=0, nBinaries=0;
int totalU=0, totalB=0;
int notInfU=0, notInfB=0;
int nulledOutU=0, nulledOutB=0, notNulledOutU=0, notNulledOutB=0;
for (int state=0; state<gr.numStates; state++){
int nParentSubStates = gr.numSubStates[state];
for (UnaryRule uRule : gr.getUnaryRulesByParent(state)){
nUnaries++;
int nChildSubStates = gr.numSubStates[uRule.childState];
double[][] scores = uRule.getScores2();
for (int j=0; j<scores.length; j++){
totalU+=nChildSubStates;
notNulledOutU++;
if (scores[j]==null){
nulledOutU++;
continue;
}
for (int i=0; i<nParentSubStates; i++){
if (!Double.isInfinite(scores[j][i])) notInfU++;
}
}
}
for (BinaryRule bRule : gr.splitRulesWithP(state)){//gr.getBinaryRulesByParent(state)){
nBinaries++;
double[][][] scores = bRule.getScores2();
for (int j=0; j<scores.length; j++){
for (int k=0; k<scores[j].length; k++){
totalB+=nParentSubStates;
notNulledOutB++;
if (scores[j][k]==null){
nulledOutB++;
continue;
}
for (int i=0; i<scores[j][k].length; i++){
if (!Double.isInfinite(scores[j][k][i])) notInfB++;
}
}
}
}
}
int totalUS=0, totalBS=0;
int notInfUS=0, notInfBS=0;
for (int state=0; state<gr.numStates; state++){
for (UnaryRule uRule : gr.getUnaryRulesByParent(state)){
double[][] scores = uRule.getScores2();
int nChildSubstates = gr.numSubStates[uRule.childState];
for (int j=0; j<scores.length; j++){
boolean okayInSomeSubstate = false;
if (scores[j]!=null){
for (int i=0; i<scores[j].length; i++){
if (!Double.isInfinite(scores[j][i])) okayInSomeSubstate=true;
}
}
totalUS+=nChildSubstates;
if (okayInSomeSubstate)
notInfUS+=nChildSubstates;
}
}
for (BinaryRule bRule : gr.splitRulesWithP(state)){//getBinaryRulesByParent(state)){
double[][][] scores = bRule.getScores2();
int nParentSubstates = gr.numSubStates[bRule.parentState];
for (int j=0; j<scores.length; j++){
for (int k=0; k<scores[0].length; k++){
boolean okayInSomeSubstate = false;
if (scores[j][k]!=null) {
for (int i=0; i<scores[j][k].length; i++){
if (!Double.isInfinite(scores[j][k][i])) okayInSomeSubstate = true;
}
}
totalBS+=nParentSubstates;
if (okayInSomeSubstate)
notInfBS+=nParentSubstates;
}
}
}
}
System.out.println("The baseline grammar has "+nUnaries+" unary and "+nBinaries+" binary rules.");
System.out.println("When using substates there could be "+totalU+" unaries, but in fact there are only "+notInfU+".");
System.out.println("When using substates there could be "+totalB+" binaries, but in fact there are only "+notInfB+".");
System.out.println("Out of "+notNulledOutU+" slices "+nulledOutU+" are nulled out.");
System.out.println("Out of "+notNulledOutB+" slices "+nulledOutB+" are nulled out.");
System.out.println("Summed across substates, there could be "+totalUS+" unaries, but there are only "+notInfUS+".");
System.out.println("Summed across substates, there could be "+totalBS+" binaries, but there are only "+notInfBS+".");
}
public static void printLexiconUnknownStatistics(Lexicon lexicon, Numberer tagNumberer) {
// System.out.print(
// "\n" +
// "LEXICON UNKNOWN TAGS\n" +
// " P(tag,substate | unknown signature)\n" +
// "\n" +
// " Unknown signature meanings:\n" +
// " -INITC only first letter is capitalized\n" +
// " -KNOWNLC word is known when in lowercase\n" +
// " -CAPS letter other than 1st is capitalized\n" +
// " -LC word has a lower-case letter\n" +
// " -NUM word contains a digit\n" +
// " -DASH word contains a dash\n" +
// " -s word is >=3 letters long, ends with s, and not 'is' or 'us'\n" +
// " The rest capture endings:\n" +
// " -ed\n" +
// " -ing\n" +
// " -ion\n" +
// " -er\n" +
// " -est\n" +
// " -ly\n" +
// " -ity\n" +
// " -y\n" +
// " -al\n");
// Map<String,double[][]> unk = lexicon.getUnseenScores();
// for (String sig : unk.keySet()) {
// System.out.println();
// System.out.println("signature "+sig);
// double[][] scores = unk.get(sig);
// int maxWidth = 0;
// int count = 0;
// for (int tag=0; tag<scores.length; tag++) {
// if (scores[tag]==null)
// continue;
// count++;
// maxWidth = Math.max(maxWidth,scores[tag].length);
// }
// String[][] out = new String[count][maxWidth];
// int tagIdx = 0;
// for (int tag=0; tag<scores.length; tag++) {
// if (scores[tag]==null)
// continue;
// for (int substate=0; substate<maxWidth; substate++) {
// if (substate >= scores[tag].length)
// out[tagIdx][substate] = "";
// else
// out[tagIdx][substate] = f.format(scores[tag][substate]);
// }
// tagIdx++;
// }
// printRules("nothing","not ready",false,out);
// }
}
public static void printLexiconStatistics(Lexicon lexicon, Numberer tagNumberer, boolean[] grammarTags, Grammar grammar, StateSetTreeList trainStateSetTrees, Options opts){
//printLexiconUnknownStatistics(lexicon, tagNumberer);
System.out.println("<a name=\"lexicon\"><h1>Lexicon</h1></a>");
System.out.println("<div id=\"lexicon\">");
double[][][] counts = null;
double[][] posteriors = new double[grammar.numStates][(int)ArrayUtil.max(grammar.numSubStates)];
if (lexicon instanceof SimpleLexicon){
counts = new double[grammar.numStates][((SimpleLexicon)lexicon).nWords][grammar.numSubStates[1]];
ParserData pDataNoLog = ParserData.Load(opts.in);
if (pDataNoLog == null) {
System.exit(1);
}
Grammar nonLogGrammar = pDataNoLog.getGrammar();
nonLogGrammar.splitRules();
SimpleLexicon nonLogLexicon = (SimpleLexicon)pDataNoLog.getLexicon();
nonLogLexicon.explicitlyComputeScores(nonLogGrammar.finalLevel);
SpanPredictor spanPredictor = pDataNoLog.getSpanPredictor();
// SophisticatedLexicon newLex = new SophisticatedLexicon(grammar.numSubStates, SophisticatedLexicon.DEFAULT_SMOOTHING_CUTOFF, new double[]{0.5, 0.1}, new SmoothAcrossParentSubstate(0.1), 1.0e-30);
if (opts.unkT<0) {
System.out.println("Replacing rare words");
Corpus.replaceRareWords(trainStateSetTrees,new SimpleLexicon(grammar.numSubStates,-1), Math.abs(opts.unkT));
}
nonLogLexicon.labelTrees(trainStateSetTrees);
ConstrainedHierarchicalTwoChartParser parser = new ConstrainedHierarchicalTwoChartParser(nonLogGrammar, nonLogLexicon, spanPredictor, grammar.finalLevel);
// HiearchicalAdaptiveLinearizer linearizer = new HiearchicalAdaptiveLinearizer(nonLogGrammar, nonLogLexicon, spanPredictor, grammar.finalLevel);
// double[] counts = new double[linearizer.dimension()];
// int nTrees = trainStateSetTrees.size();
// boolean secondHalf;
// int n=0;
for (Tree<StateSet> stateSetTree : trainStateSetTrees) {
// secondHalf = (n++>nTrees/2.0);
boolean noSmoothing = true, debugOutput = false;
parser.doInsideOutsideScores(stateSetTree,false,false);
grammar.tallyMergeWeights(stateSetTree, posteriors);
double tree_score = stateSetTree.getLabel().getIScore(0);
int tree_scale = stateSetTree.getLabel().getIScale();
List<StateSet> yield = stateSetTree.getYield();
int i =0;
for (StateSet stateSet : stateSetTree.getPreTerminalYield()){
double scalingFactor = ScalingTools.calcScaleFactor(stateSet.getOScale()+stateSet.getIScale()-tree_scale);
StateSet child = yield.get(i++);
for (short substate=0; substate<stateSet.numSubStates(); substate++) {
//weight by the probability of seeing the tag and word together, given the sentence
double pIS = stateSet.getIScore(substate); // Parent outside score
if (pIS==0) { continue; }
double pOS = stateSet.getOScore(substate); // Parent outside score
if (pOS==0) { continue; }
double weight = 1;
weight = (pIS / tree_score) * scalingFactor * pOS;
counts[stateSet.getState()][child.wordIndex][substate] += weight;
// if (isValidExpectation(weight)){
// tmpCountsArray[substate] = weight;
// } else
// System.out.println("Overflow when counting gold tags? "+weight);
}
}
// parser.incrementExpectedGoldCounts(linearizer, counts, stateSetTree);
// newLex.trainTree(stateSetTree, -1, nonLogLexicon, secondHalf,noSmoothing);
}
// newLex.optimize(); // M Step
// ParserData pData = new ParserData(newLex, nonLogGrammar, null, Numberer.getNumberers(), nonLogGrammar.numSubStates, 0, 1, Binarization.RIGHT);
// String outTmpName = opts.out + ".gr";
// System.out.println("Saving grammar to "+outTmpName+".");
// if (pData.Save(outTmpName)) System.out.println("Saving successful.");
// else System.out.println("Saving failed!");
// for (int i=0; i<counts.length; i++) counts[i] = Math.log(counts[i]);
// linearizer.delinearizeLexiconWeights(counts);
// lexicon = linearizer.getLexicon();
} else {
// ParserData pDataNoLog = ParserData.Load(opts.in);
// if (pDataNoLog == null) {
// System.exit(1);
// }
// Grammar nonLogGrammar = pDataNoLog.getGrammar();
// nonLogGrammar.splitRules();
// Lexicon nonLogLexicon = pDataNoLog.getLexicon();
// ArrayParser parser = new ArrayParser(nonLogGrammar,nonLogLexicon);
// for (Tree<StateSet> stateSetTree : trainStateSetTrees) {
// parser.doInsideOutsideScores(stateSetTree,true,false);
// grammar.tallyMergeWeights(stateSetTree, posteriors);
// }
}
// System.out.println("Entropies");
// for (short curTag=0; curTag<grammarTags.length; curTag++){
// double total = 0;
// for (int substate=0; substate<grammar.numSubStates[substate]; substate++){
// total += posteriors[curTag][substate];
// }
// double entropy = 0;
// for (int substate=0; substate<grammar.numSubStates[substate]; substate++){
// double p = posteriors[curTag][substate] = posteriors[curTag][substate]/total;
// if (p==0) continue;
// entropy += (p * Math.log(p));
// }
// entropy *= -1.0;
// System.out.println(tagNumberer.object(curTag)+"\t"+entropy);
// }
// HashMap<String, double[]>[] wordToTagCounters = lexicon.wordToTagCounters;
for (short curTag=0; curTag<grammarTags.length; curTag++){
if (grammarTags[curTag]) continue;
int nSubStates = grammar.numSubStates[curTag];
PriorityQueue<String>[] pQs = new PriorityQueue[nSubStates];
for (int i = 0; i < nSubStates; i++) {
pQs[i] = new PriorityQueue<String>();
}
double[] sum = new double[grammar.numSubStates[curTag]];
if (lexicon instanceof SophisticatedLexicon){
sum = posteriors[curTag];
SophisticatedLexicon lex = (SophisticatedLexicon)lexicon;
HashMap<String, double[]> tagMap = lex.wordToTagCounters[curTag];
for (String word : tagMap.keySet()) {
double[] lexiconScores = lexicon.score(word,curTag,0,false,false);
// double[] counts = tagMap.get(word);
for (int i = 0; i < nSubStates; i++) {
pQs[i].add(word, lexiconScores[i]);//counts[i]);
}
}
}
else {
sum = new double[grammar.numSubStates[curTag]];
SimpleLexicon lex = (SimpleLexicon)lexicon;
for (int w=0; w<lex.nWords; w++) {
// int k = lex.tagWordIndexer[curTag].get(w);
// if (k>=lex.wordCounter.length||lex.wordCounter[k]<=51) continue;
String word = (String)lex.wordIndexer.get(w);
// System.out.println(word + " " +lex.wordCounter[k]+" ");
// double[] lexiconScores = lexicon.score(word,curTag,0,true,word.startsWith("UNK"));
double[] lexiconScores = counts[curTag][w];
boolean allZero=true;
for (int i=0; i<lexiconScores.length; i++){
allZero = allZero&&(lexiconScores[i]==0);
sum[i] += lexiconScores[i];
}
if (allZero) continue;
// for (short tag=0; tag<grammarTags.length; tag++){
// if (grammarTags[curTag]) continue;
// double[] lexiconScores2 = lexicon.score(word,tag,0,false,word.startsWith("UNK"));
// sum += DoubleArrays.sum(lexiconScores2);
// }
for (int i = 0; i < nSubStates; i++) {
pQs[i].add(word, lexiconScores[i]);//counts[i]);
}
}
}
double s=0;
for (int i=0; i<sum.length; i++){ s+= sum[i]; }
String tagName = (String) tagNumberer.object(curTag);
System.out.println("<h3>Lexicon</h3>");
System.out.println("<table border=\"1\">");
System.out.println("<tr>");
for (int i = 0; i < nSubStates; i++) {
System.out.println("<th>");
System.out.println("<a name=" + lexiconLabel(tagName + "-" + i)
+ "> <a href=" + parentRefLabel(tagName + "-" + i) + ">");
System.out.print(sanitize(tagName) + "-" + i);
System.out.println("</a></a> (<a href="+label("parent",tagName)+">p</a>)");
System.out.println("<br>"+sum[i]/s);
System.out.println("</th>");
}
System.out.println("</tr>");
for (int j = 0; j < topN; j++){
System.out.println("<tr>");
/* System.out.println("The top " + topN + " words for the tag "
+ (String) tagNumberer.object(curTag) + "-" + i + " are:");
System.out.println(pQs[i].toString(topN));
}
*/ for (int i = 0; i < nSubStates; i++) {
if (i==0){ System.out.print("\n"); }
String w="";
double p=-1;
if (pQs[i].hasNext()) {
p = pQs[i].getPriority();
w = pQs[i].next();
String tmp = sanitize(w)+" "+f.format(p);
if (tmp.length()<8) tmp = tmp.concat("\t");
System.out.print("<td>"+tmp+"</td>");
}
}
System.out.println("</tr>");
}
System.out.println("</table><br/>");
}
System.out.println("</div>");
}
/**
* @param tagName
* @return
*/
static String lexiconLabel(String tagName) {
return "\"productions-"+tagName+"\"";
}
/**
* @param ruleTypeName
* @param tagName
* @return
*/
static String label(String ruleTypeName, String tagName) {
return "\""+ruleTypeName+"-"+tagName+"\"";
}
static String reflabel(String ruleTypeName, String tagName) {
return "\"#"+ruleTypeName+"-"+tagName+"\"";
}
static String parentLabel(String tagName) {
return label("parentrules",tagName);
}
static String parentRefLabel(String tagName) {
return reflabel("parentrules",tagName);
}
static String sanitize(String s) {
return s.replaceAll("&","&");
}
}