package edu.berkeley.nlp.PCFGLA;

import edu.berkeley.nlp.PCFGLA.smoothing.Smoother;
import edu.berkeley.nlp.syntax.StateSet;
import edu.berkeley.nlp.syntax.Tree;
import edu.berkeley.nlp.util.ArrayUtil;
import edu.berkeley.nlp.util.Counter;
import edu.berkeley.nlp.util.Indexer;
import edu.berkeley.nlp.util.Numberer;
import edu.berkeley.nlp.util.ScalingTools;

import java.io.Serializable;
import java.util.*;

/**
 * Simple default implementation of a lexicon,
 * which scores word, tag pairs with P(word|tag).
 *
 * Instead of dealing with words as strings we map them to integers
 * with a wordIndexer. To further simplify things each tag has
 * its own mapping from (global) wordIndex to a tag-specific index so that we
 * don't have to store unobserved (tag, word) events.
 *
 * Assumes that rare words have been replaced with some unknown word token
 * (signature) before training.
 */
public class SimpleLexicon implements java.io.Serializable, Lexicon {
	/** Per-tag mapping from global word index to a dense tag-specific word index. */
	public IntegerIndexer[] tagWordIndexer;
	public double[][][] expectedCounts; // indexed by tag, substate, word
	public double[][][] scores; // indexed by tag, substate, word (same layout as expectedCounts)
	public int[] wordCounter; // how many times each word occurred in training
	// public boolean[] wordIsAmbiguous;

	/** A trick to allow loading of saved Lexicons even if the version has changed. */
	private static final long serialVersionUID = 2L;

	/** The number of substates for each state */
	public short[] numSubStates;
	int numStates;
	int nWords;
	double threshold;
	boolean isLogarithmMode;
	boolean useVarDP = false;

	public Indexer<String> wordIndexer;
	Smoother smoother;

	// additions from the stanford parser which are needed for a better
	// unknown word model...

	/**
	 * We cache the last signature looked up, because it asks for the same one
	 * many times when an unknown word is encountered! (Note that under the
	 * current scheme, one unknown word, if seen sentence-initially and
	 * non-initially, will be parsed with two different signatures....)
	 */
	protected transient String lastSignature = "";
	protected transient int lastSentencePosition = -1;
	protected transient String lastWordToSignaturize = "";

	private int unknownLevel = 5; // different modes for unknown words, 5 is English-specific

	/**
	 * Normalizes expectedCounts into scores: for each (tag, substate),
	 * scores[tag][substate][word] becomes the relative frequency of the word
	 * among that substate's total expected count. A substate with zero total
	 * mass gets all-zero scores (rather than NaN). Afterwards, if a smoother
	 * is set, each word's substate scores are smoothed across substates.
	 */
	public void optimize() {
		for (int tag = 0; tag < expectedCounts.length; tag++) {
			for (int substate = 0; substate < numSubStates[tag]; substate++) {
				double mass = ArrayUtil.sum(expectedCounts[tag][substate]);
				// guard against division by zero for unseen substates
				double normalizer = (mass == 0) ? 0 : 1.0 / mass;
				for (int word = 0; word < expectedCounts[tag][substate].length; word++) {
					scores[tag][substate][word] = expectedCounts[tag][substate][word] * normalizer;
				}
			}
		}
		// smooth the scores
		if (smoother != null) {
			for (short tag = 0; tag < expectedCounts.length; tag++) {
				for (int word = 0; word < expectedCounts[tag][0].length; word++) {
					double[] res = new double[numSubStates[tag]];
					for (int substate = 0; substate < numSubStates[tag]; substate++) {
						res[substate] = scores[tag][substate][word];
					}
					smoother.smooth(tag, res);
					for (int substate = 0; substate < numSubStates[tag]; substate++) {
						scores[tag][substate][word] = res[substate];
					}
				}
			}
		}
	}

	/** Create a blank Lexicon object. Fill it by
	 * calling tallyStateSetTree for each training tree, then
	 * calling optimize().
 *
 * @param numSubStates number of latent substates per coarse tag
 */
@SuppressWarnings("unchecked")
public SimpleLexicon(short[] numSubStates, int smoothingCutoff,
		double[] smoothParam, Smoother smoother, double threshold,
		StateSetTreeList trainTrees) {
	this(numSubStates, threshold);
	init(trainTrees);
}

public SimpleLexicon(short[] numSubStates, double threshold) {
	this.numSubStates = numSubStates;
	this.threshold = threshold;
	this.wordIndexer = new Indexer<String>();
	this.numStates = numSubStates.length;
	this.isLogarithmMode = false;
	// NOTE(review): the second disjunct is redundant — a BROWN treebank is
	// already != WSJ, so this reduces to (myTreebank != WSJ). Possibly '&&'
	// was intended; confirm against the training setup before changing.
	if (Corpus.myTreebank != Corpus.TreeBankType.WSJ
			|| Corpus.myTreebank == Corpus.TreeBankType.BROWN)
		unknownLevel = 4;
}

/**
 * Scores a word/tag pair by wrapping the word in a fresh StateSet and
 * delegating to the StateSet-based score.
 *
 * @param word the word (or signature) to score
 * @param tag the coarse tag
 * @param pos the word's position in the sentence
 * @param noSmoothing passed through
 * @param isSignature passed through
 * @return per-substate scores for (word, tag)
 */
public double[] score(String word, short tag, int pos, boolean noSmoothing, boolean isSignature) {
	StateSet stateSet = new StateSet(tag, (short) 1, word, (short) pos, (short) (pos + 1));
	// -2 marks "index not yet looked up" (see the caching convention below)
	stateSet.wordIndex = -2;
	stateSet.sigIndex = -2;
	return score(stateSet, tag, noSmoothing, isSignature);
}

/**
 * Per-substate scores for the word held in the given StateSet under the
 * given tag. Index caching convention on the StateSet: -2 = not yet
 * computed, -1 = looked up but not found. If the word itself is unknown,
 * its unknown-word signature is tried; if even the signature is unknown,
 * a small constant (0.001) is returned for every substate.
 */
public double[] score(StateSet stateSet, short tag, boolean noSmoothing, boolean isSignature) {
	double[] res = new double[numSubStates[tag]];
	int globalWordIndex = stateSet.wordIndex;
	if (globalWordIndex == -2)
		globalWordIndex = stateSet.wordIndex = wordIndexer.indexOf(stateSet.getWord());
	if (globalWordIndex == -1) // word unknown: fall back to its signature
		globalWordIndex = stateSet.sigIndex;
	if (globalWordIndex == -2)
		globalWordIndex = stateSet.sigIndex = wordIndexer.indexOf(getSignature(stateSet.getWord(), stateSet.from));
	if (globalWordIndex == -1) {
		System.out.println("unknown signature for word " + stateSet.getWord());
		Arrays.fill(res, 0.001);
		return res;
	}
	int tagSpecificWordIndex = tagWordIndexer[tag].indexOf(globalWordIndex);
	if (tagSpecificWordIndex == -1) {
		// this (tag, word) pair was never observed in training
		if (isLogarithmMode)
			Arrays.fill(res, Double.NEGATIVE_INFINITY); // -80??Double.NEGATIVE_INFINITY);
		// else Arrays.fill(res, 1e-80);
		return res;
	}
	for (int i = 0; i < numSubStates[tag]; i++) {
		res[i] = scores[tag][i][tagSpecificWordIndex];
	}
	if (smoother != null)
		smoother.smooth(tag, res);
	return res;
}

/**
 * Trains this lexicon on the Collection of trees.
 *
 * @param trainTree tree whose inside/outside scores are already computed
 *        (required when randomness == -1)
 * @param randomness -1: EM — weight each (tag, substate, word) event by its
 *        posterior probability; 0: weight 1 (baseline); &gt;0: 1 plus noise
 * @param oldLexicon lexicon used to score words under the pre-update model
 *        (only consulted when randomness == -1)
 * @param secondHalf unused here
 * @param noSmoothing passed through to oldLexicon.score
 * @param unusedUnkThreshold unused, kept for interface compatibility
 */
public void trainTree(Tree<StateSet> trainTree, double randomness, Lexicon oldLexicon, boolean secondHalf, boolean noSmoothing, int unusedUnkThreshold) {
	// scan data
	// for all substates that the word's preterminal tag has
	double sentenceScore = 0;
	if (randomness == -1) {
		sentenceScore = trainTree.getLabel().getIScore(0);
		if (sentenceScore == 0) {
			System.out.println("Something is wrong with this tree. I will skip it.");
			return;
		}
	}
	int sentenceScale = trainTree.getLabel().getIScale();
	List<StateSet> words = trainTree.getYield();
	List<StateSet> tags = trainTree.getPreTerminalYield();
	// for all words in sentence
	for (int position = 0; position < words.size(); position++) {
		int nSubStates = tags.get(position).numSubStates();
		short tag = tags.get(position).getState();
		String word = words.get(position).getWord();
		int globalWordIndex = wordIndexer.indexOf(word);
		int tagSpecificWordIndex = tagWordIndexer[tag].indexOf(globalWordIndex);
		double[] oldLexiconScores = null;
		if (randomness == -1)
			oldLexiconScores = oldLexicon.score(word, tag, position, noSmoothing, false);
		StateSet currentState = tags.get(position);
		double scale = ScalingTools.calcScaleFactor(currentState.getOScale() - sentenceScale) / sentenceScore;
		for (short substate = 0; substate < nSubStates; substate++) {
			double weight = 1;
			if (randomness == -1) {
				// weight by the probability of seeing the tag and word together, given the sentence
				if (!Double.isInfinite(scale))
					weight = currentState.getOScore(substate) * oldLexiconScores[substate] * scale;
				else
					// scale factor over/underflowed: redo the computation in log space
					weight = Math.exp(Math.log(ScalingTools.SCALE)
							* (currentState.getOScale() - sentenceScale)
							- Math.log(sentenceScore)
							+ Math.log(currentState.getOScore(substate))
							+ Math.log(oldLexiconScores[substate]));
			} else if (randomness == 0) {
				// for the baseline
				weight = 1;
			} else {
				// add a bit of randomness
				weight = GrammarTrainer.RANDOM.nextDouble() * randomness / 100.0 + 1.0;
			}
			if (weight == 0)
				continue;
			// tally in the tag with the given weight
			expectedCounts[tag][substate][tagSpecificWordIndex] += weight;
		}
	}
}

public void setUseVarDP(boolean useVarDP) {
	this.useVarDP = useVarDP;
}

/*
 * assume that rare words have been replaced by their signature
 */
public void init(StateSetTreeList trainTrees) {
	// pass 1: register every word form in the global indexer
	for (Tree<StateSet> tree : trainTrees) {
		List<StateSet> words = tree.getYield();
		for (StateSet word : words) {
			String sig = word.getWord();
			wordIndexer.add(sig);
		}
	}
	tagWordIndexer = new IntegerIndexer[numStates];
	for (int tag = 0; tag < numStates; tag++) {
		tagWordIndexer[tag] = new IntegerIndexer(wordIndexer.size());
	}
	wordCounter = new int[wordIndexer.size()];
	// pass 2: count word occurrences and record which words occur with which tags
	for (Tree<StateSet> tree : trainTrees) {
		List<StateSet> tags = tree.getPreTerminalYield();
		List<StateSet> words = tree.getYield();
		int ind = 0;
		for (StateSet word : words) {
			String sig = word.getWord();
			wordCounter[wordIndexer.indexOf(sig)]++;
			tagWordIndexer[tags.get(ind).getState()].add(wordIndexer.indexOf(sig));
			ind++;
		}
	}
	// allocate count/score tables sized to each tag's observed vocabulary
	expectedCounts = new double[numStates][][];
	scores = new double[numStates][][];
	for (int tag = 0; tag < numStates; tag++) {
		expectedCounts[tag] = new double[numSubStates[tag]][tagWordIndexer[tag].size()];
		scores[tag] = new double[numSubStates[tag]][tagWordIndexer[tag].size()];
	}
	nWords = wordIndexer.size();
	labelTrees(trainTrees);
}

/**
 * Mixed-depth copy: scores, wordCounter and tagWordIndexer are copied,
 * expectedCounts are freshly allocated (zeroed), and the wordIndexer and
 * smoother are shared with this lexicon.
 */
public SimpleLexicon copyLexicon() {
	SimpleLexicon copy = new SimpleLexicon(numSubStates, threshold);
	copy.expectedCounts = new double[numStates][][];
	copy.scores = ArrayUtil.clone(scores); // new double[numStates][][];
	copy.tagWordIndexer = new IntegerIndexer[numStates];
	copy.wordIndexer = this.wordIndexer;
	for (int tag = 0; tag < numStates; tag++) {
		copy.tagWordIndexer[tag] = tagWordIndexer[tag].copy();
		copy.expectedCounts[tag] = new double[numSubStates[tag]][tagWordIndexer[tag].size()];
		// copy.scores[tag] = new double[numSubStates[tag]][tagWordIndexer[tag].size()];
	}
	copy.nWords = this.nWords;
	copy.smoother = this.smoother;
	copy.wordCounter = this.wordCounter.clone();
	// copy.wordIsAmbiguous = this.wordIsAmbiguous.clone();
	// copy.unkIndex = unkIndex;
	/* if (linearIndex!=null) copy.linearIndex = ArrayUtil.clone(linearIndex);
	   if (toBeIgnored!=null) copy.toBeIgnored = toBeIgnored.clone(); */
	return copy;
}

public boolean isLogarithmMode() {
	return isLogarithmMode;
}

/** Converts all scores to log space (no-op if already in log mode). */
public void logarithmMode() {
	if (isLogarithmMode)
		return;
	// NOTE(review): the loop variable names 'word' and 'substate' are swapped
	// relative to the [tag][substate][word] layout, but every entry is still
	// visited exactly once, so the result is correct.
	for (int tag = 0; tag < scores.length; tag++) {
		for (int word = 0; word < scores[tag].length; word++) {
			for (int substate = 0; substate < scores[tag][word].length; substate++) {
				scores[tag][word][substate] = Math.log(scores[tag][word][substate]);
			}
		}
	}
	isLogarithmMode = true;
}

/**
 * Split all substates in two, producing a new lexicon. The new Lexicon gives
 * the same scores to words under both split versions of the tag. (Leon says:
 * It may not be okay to use the same scores, but I think that symmetry is
 * sufficiently broken in Grammar.splitAllStates to ignore the randomness
 * here.)
 *
 * @param counts per-state occurrence counts (currently unused — see the
 *        commented-out cap below)
 * @param moreSubstatesThanCounts whether a state may get more substates than
 *        its count (currently unused)
 * @param mode 0 or 1: previous value (noise is commented out here);
 *        2: just noise (for log-linear grammars with cascading regularization)
 * @return a new lexicon with every non-ROOT tag's substates doubled
 */
@SuppressWarnings("unchecked")
public SimpleLexicon splitAllStates(int[] counts, boolean moreSubstatesThanCounts, int mode) {
	SimpleLexicon splitLex = this.copyLexicon();
	short[] newNumSubStates = new short[numSubStates.length];
	newNumSubStates[0] = 1; // never split ROOT
	Random random = GrammarTrainer.RANDOM;
	for (short i = 1; i < numSubStates.length; i++) {
		// don't split a state into more substates than times it was actually seen
		// if (!moreSubstatesThanCounts && numSubStates[i]>=counts[i]) {
		//   newNumSubStates[i]=numSubStates[i];
		// } else {
		newNumSubStates[i] = (short) (numSubStates[i] * 2);
		// }
	}
	splitLex.numSubStates = newNumSubStates;
	double[][][] newScores = new double[scores.length][][];
	double[][][] newExpCounts = new double[scores.length][][];
	for (int tag = 0; tag < expectedCounts.length; tag++) {
		int nTagWords = tagWordIndexer[tag].size();
		// if (nWords==0) continue;
		newScores[tag] = new double[newNumSubStates[tag]][nTagWords];
		newExpCounts[tag] = new double[newNumSubStates[tag]][nTagWords];
		for (int substate = 0; substate < numSubStates[tag]; substate++) {
			for (int word = 0; word < expectedCounts[tag][substate].length; word++) {
				// both children of a split substate start with the parent's score
				newScores[tag][2 * substate][word] = newScores[tag][2 * substate + 1][word] = scores[tag][substate][word];
				if (mode == 2)
					newScores[tag][2 * substate][word] = newScores[tag][2 * substate + 1][word] = 1.0 + random.nextDouble() / 100.0;
			}
		}
	}
	splitLex.scores = newScores;
	splitLex.expectedCounts = newExpCounts;
	return splitLex;
}

/**
 * This routine returns a String that is the "signature" of the class of a
 * word. For example, it might represent whether it is a number or ends in -s.
 * The strings returned by convention match the pattern UNK-.* , which
 * is just assumed to not match any real word.
 * Behavior depends on the unknownLevel (-uwm flag) passed in to the class.
 * The recognized numbers are 1-5: 5 is fairly English-specific; 4, 3, and 2
 * look for various word features (digits, dashes, etc.) which are only
 * vaguely English-specific; 1 uses the last two characters combined with
 * a simple classification by capitalization.
 *
 * @param word The word to make a signature for
 * @param loc Its position in the sentence (mainly so sentence-initial
 *        capitalized words can be treated differently)
 * @return A String that is its signature (equivalence class)
 */
public String getNewSignature(String word, int loc) {
	// int unknownLevel = Options.get().useUnknownWordSignatures;
	StringBuffer sb = new StringBuffer("UNK");
	switch (unknownLevel) {

	case 5: {
		// Reformed Mar 2004 (cdm); hopefully much better now.
		// { -CAPS, -INITC ap, -LC lowercase, 0 } +
		// { -KNOWNLC, 0 } + [only for INITC]
		// { -NUM, 0 } +
		// { -DASH, 0 } +
		// { -last lowered char(s) if known discriminating suffix, 0}
		int wlen = word.length();
		int numCaps = 0;
		boolean hasDigit = false;
		boolean hasDash = false;
		boolean hasLower = false;
		// single pass: collect digit/dash/case features
		for (int i = 0; i < wlen; i++) {
			char ch = word.charAt(i);
			if (Character.isDigit(ch)) {
				hasDigit = true;
			} else if (ch == '-') {
				hasDash = true;
			} else if (Character.isLetter(ch)) {
				if (Character.isLowerCase(ch)) {
					hasLower = true;
				} else if (Character.isTitleCase(ch)) {
					// title-case counts as both lower and a cap
					hasLower = true;
					numCaps++;
				} else {
					numCaps++;
				}
			}
		}
		char ch0 = word.charAt(0);
		String lowered = word.toLowerCase();
		if (Character.isUpperCase(ch0) || Character.isTitleCase(ch0)) {
			if (loc == 0 && numCaps == 1) {
				// sentence-initial single capital: likely just capitalization
				sb.append("-INITC");
				if (isKnown(lowered)) {
					sb.append("-KNOWNLC");
				}
			} else {
				sb.append("-CAPS");
			}
		} else if (!Character.isLetter(ch0) && numCaps > 0) {
			sb.append("-CAPS");
		} else if (hasLower) { // (Character.isLowerCase(ch0)) {
			sb.append("-LC");
		}
		if (hasDigit) {
			sb.append("-NUM");
		}
		if (hasDash) {
			sb.append("-DASH");
		}
		if (lowered.endsWith("s") && wlen >= 3) {
			// here length 3, so you don't miss out on ones like 80s
			char ch2 = lowered.charAt(wlen - 2);
			// not -ess suffixes or greek/latin -us, -is
			if (ch2 != 's' && ch2 != 'i' && ch2 != 'u') {
				sb.append("-s");
			}
		} else if (word.length() >= 5 && !hasDash && !(hasDigit && numCaps > 0)) {
			// don't do for very short words;
			// Implement common discriminating suffixes
			/* if (Corpus.myLanguage==Corpus.GERMAN){
			     sb.append(lowered.substring(lowered.length()-1));
			   } else { */
			if (lowered.endsWith("ed")) {
				sb.append("-ed");
			} else if (lowered.endsWith("ing")) {
				sb.append("-ing");
			} else if (lowered.endsWith("ion")) {
				sb.append("-ion");
			} else if (lowered.endsWith("er")) {
				sb.append("-er");
			} else if (lowered.endsWith("est")) {
				sb.append("-est");
			} else if (lowered.endsWith("ly")) {
				sb.append("-ly");
			} else if (lowered.endsWith("ity")) {
				sb.append("-ity");
			} else if (lowered.endsWith("y")) {
				sb.append("-y");
			} else if (lowered.endsWith("al")) {
				sb.append("-al");
				// } else if (lowered.endsWith("ble")) {
				//   sb.append("-ble");
				// } else if (lowered.endsWith("e")) {
				//   sb.append("-e");
			}
		}
		break;
	}

	case 4: {
		boolean hasDigit = false;
		boolean hasNonDigit = false;
		boolean hasLetter = false;
		boolean hasLower = false;
		boolean hasDash = false;
		boolean hasPeriod = false;
		boolean hasComma = false;
		for (int i = 0; i < word.length(); i++) {
			char ch = word.charAt(i);
			if (Character.isDigit(ch)) {
				hasDigit = true;
			} else {
				hasNonDigit = true;
				if (Character.isLetter(ch)) {
					hasLetter = true;
					if (Character.isLowerCase(ch) || Character.isTitleCase(ch)) {
						hasLower = true;
					}
				} else {
					if (ch == '-') {
						hasDash = true;
					} else if (ch == '.') {
						hasPeriod = true;
					} else if (ch == ',') {
						hasComma = true;
					}
				}
			}
		}
		// 6 way on letters
		if (Character.isUpperCase(word.charAt(0)) || Character.isTitleCase(word.charAt(0))) {
			if (!hasLower) {
				sb.append("-AC");
			} else if (loc == 0) {
				sb.append("-SC");
			} else {
				sb.append("-C");
			}
		} else if (hasLower) {
			sb.append("-L");
		} else if (hasLetter) {
			sb.append("-U");
		} else {
			// no letter
			sb.append("-S");
		}
		// 3 way on number
		if (hasDigit && !hasNonDigit) {
			sb.append("-N");
		} else if (hasDigit) {
			sb.append("-n");
		}
		// binary on period, dash, comma
		if (hasDash) {
			sb.append("-H");
		}
		if (hasPeriod) {
			sb.append("-P");
		}
		if (hasComma) {
			sb.append("-C");
		}
		if (word.length() > 3) {
			// don't do for very short words: "yes" isn't an "-es" word
			// try doing to lower for further densening and skipping digits
			char ch = word.charAt(word.length() - 1);
			if (Character.isLetter(ch)) {
				sb.append("-");
				sb.append(Character.toLowerCase(ch));
			}
		}
		break;
	}

	case 3: {
		// This basically works right, except note that 'S' is applied to all
		// capitalized letters in first word of sentence, not just first....
		sb.append("-");
		char lastClass = '-'; // i.e., nothing
		char newClass;
		int num = 0;
		// encode the word as a run-length-limited sequence of character classes
		for (int i = 0; i < word.length(); i++) {
			char ch = word.charAt(i);
			if (Character.isUpperCase(ch) || Character.isTitleCase(ch)) {
				if (loc == 0) {
					newClass = 'S';
				} else {
					newClass = 'L';
				}
			} else if (Character.isLetter(ch)) {
				newClass = 'l';
			} else if (Character.isDigit(ch)) {
				newClass = 'd';
			} else if (ch == '-') {
				newClass = 'h';
			} else if (ch == '.') {
				newClass = 'p';
			} else {
				newClass = 's';
			}
			if (newClass != lastClass) {
				lastClass = newClass;
				sb.append(lastClass);
				num = 1;
			} else {
				// at most one '+' per run, so runs of length >= 2 look alike
				if (num < 2) {
					sb.append('+');
				}
				num++;
			}
		}
		if (word.length() > 3) {
			// don't do for very short words: "yes" isn't an "-es" word
			// try doing to lower for further densening and skipping digits
			char ch = Character.toLowerCase(word.charAt(word.length() - 1));
			sb.append('-');
			sb.append(ch);
		}
		break;
	}

	case 2: {
		// {-ALLC, -INIT, -UC, -LC, zero} +
		// {-DASH, zero} +
		// {-NUM, -DIG, zero} +
		// {lowerLastChar, zeroIfShort}
		boolean hasDigit = false;
		boolean hasNonDigit = false;
		boolean hasLower = false;
		for (int i = 0; i < word.length(); i++) {
			char ch = word.charAt(i);
			if (Character.isDigit(ch)) {
				hasDigit = true;
			} else {
				hasNonDigit = true;
				if (Character.isLetter(ch)) {
					if (Character.isLowerCase(ch) || Character.isTitleCase(ch)) {
						hasLower = true;
					}
				}
			}
		}
		if (Character.isUpperCase(word.charAt(0)) || Character.isTitleCase(word.charAt(0))) {
			if (!hasLower) {
				sb.append("-ALLC");
			} else if (loc == 0) {
				sb.append("-INIT");
			} else {
				sb.append("-UC");
			}
		} else if (hasLower) { // if (Character.isLowerCase(word.charAt(0))) {
			sb.append("-LC");
		}
		// no suffix = no (lowercase) letters
		if (word.indexOf('-') >= 0) {
			sb.append("-DASH");
		}
		if (hasDigit) {
			if (!hasNonDigit) {
				sb.append("-NUM");
			} else {
				sb.append("-DIG");
			}
		} else if (word.length() > 3) {
			// don't do for very short words: "yes" isn't an "-es" word
			// try doing to lower for further densening and skipping digits
			char ch = word.charAt(word.length() - 1);
			sb.append(Character.toLowerCase(ch));
		}
		// no suffix = short non-number, non-alphabetic
		break;
	}

	default:
		// level 1 (and anything unrecognized): last two characters plus a
		// simple capitalization class
		sb.append("-");
		sb.append(word.substring(Math.max(word.length() - 2, 0), word.length()));
		sb.append("-");
		if (Character.isLowerCase(word.charAt(0))) {
			sb.append("LOWER");
		} else {
			if (Character.isUpperCase(word.charAt(0))) {
				if (loc == 0) {
					sb.append("INIT");
				} else {
					sb.append("UPPER");
				}
			} else {
				sb.append("OTHER");
			}
		}
	} // end switch (unknownLevel)
	// System.err.println("Summarized " + word + " to " + sb.toString());
	return sb.toString();
} // end getSignature()

/** Dumps every (tag, word) row with its per-substate scores, one line per word. */
public String toString() {
	StringBuffer sb = new StringBuffer();
	Numberer tagNumberer = Numberer.getGlobalNumberer("tags");
	for (int tag = 0; tag < expectedCounts.length; tag++) {
		String tagS = (String) tagNumberer.object(tag);
		if (tagWordIndexer[tag].size() == 0)
			continue;
		for (int word = 0; word < scores[tag][0].length; word++) {
			sb.append(tagS + " " + wordIndexer.get(tagWordIndexer[tag].get(word)) + " ");
			for (int sub = 0; sub < numSubStates[tag]; sub++) {
				sb.append(" " + scores[tag][sub][word]);
			}
			sb.append("\n");
		}
	}
	return sb.toString();
}

/**
 * @param word the word to look up (callers pass the lowercased form)
 * @return whether the word was seen in training (is in the word indexer)
 */
private boolean isKnown(String word) {
	return wordIndexer.indexOf(word) != -1;
}

/**
 * Returns the signature of the given word — the String representation of its
 * unknown-word features. Caches the last (word, position) signature computed,
 * since the same lookup is typically repeated many times.
 */
public String getSignature(String word, int sentencePosition) {
	if (word.equals(lastWordToSignaturize) && sentencePosition == lastSentencePosition) {
		// System.err.println("Signature: cache mapped " + wordIndex + " to " + lastSignatureIndex);
		return lastSignature;
	} else {
		String uwSig = getNewSignature(word, sentencePosition);
		lastSignature = uwSig;
		lastSentencePosition = sentencePosition;
		lastWordToSignaturize = word;
		return uwSig;
	}
}

/**
 * Merges flagged substate pairs, combining their scores by a weighted
 * average; expectedCounts are re-allocated (zeroed) for the new sizes.
 *
 * @param mergeThesePairs [tag][substate][substate] merge flags
 * @param mergeWeights per-tag, per-substate weights for the weighted average
 */
public void mergeStates(boolean[][][] mergeThesePairs, double[][] mergeWeights) {
	short[] newNumSubStates = new short[numSubStates.length];
	short[][] mapping = new short[numSubStates.length][];
	// invariant: if partners[state][substate][0] == substate, it's the 1st one
	short[][][] partners = new short[numSubStates.length][][];
	Grammar.calculateMergeArrays(mergeThesePairs, newNumSubStates, mapping, partners, numSubStates);
	double[][][] newScores = new double[scores.length][][];
	for (int tag = 0; tag < expectedCounts.length; tag++) {
		int nTagWords = tagWordIndexer[tag].size();
		newScores[tag] = new double[newNumSubStates[tag]][nTagWords];
		if (numSubStates[tag] == 1)
			continue;
		for (int word = 0; word < expectedCounts[tag][0].length; word++) {
			for (int i = 0; i < numSubStates[tag]; i = i + 2) {
				int nSplit = partners[tag][i].length;
				if (nSplit == 2) {
					// merge the pair: weighted average of the two substates' scores
					double mergeWeightSum = mergeWeights[tag][partners[tag][i][0]]
							+ mergeWeights[tag][partners[tag][i][1]];
					if (mergeWeightSum == 0)
						mergeWeightSum = 1; // avoid division by zero
					newScores[tag][mapping[tag][i]][word] = ((mergeWeights[tag][partners[tag][i][0]]
							* scores[tag][partners[tag][i][0]][word])
							+ (mergeWeights[tag][partners[tag][i][1]]
									* scores[tag][partners[tag][i][1]][word]))
							/ mergeWeightSum;
				} else {
					// unmerged pair: both substates keep their own scores
					newScores[tag][mapping[tag][i]][word] = scores[tag][i][word];
					newScores[tag][mapping[tag][i + 1]][word] = scores[tag][i + 1][word];
				}
			}
		}
	}
	this.numSubStates = newNumSubStates;
	this.scores = newScores;
	for (int tag = 0; tag < numStates; tag++) {
		this.expectedCounts[tag] = new double[newNumSubStates[tag]][tagWordIndexer[tag].size()];
	}
}

/**
 * Raises every score to the given exponent (the threshold-based pruning is
 * commented out, so the 'threshold' parameter is currently unused).
 */
public void removeUnlikelyTags(double threshold, double exponent) {
	// NOTE(review): loop variable names 'word'/'substate' are swapped relative
	// to the [tag][substate][word] layout, but all entries are visited anyway.
	for (int tag = 0; tag < scores.length; tag++) {
		for (int word = 0; word < scores[tag].length; word++) {
			for (int substate = 0; substate < scores[tag][word].length; substate++) {
				double p = scores[tag][word][substate];
				/* if (p<threshold) p = 0; else */
				if (exponent != 1.0)
					p = Math.pow(p, exponent);
				scores[tag][word][substate] = p;
			}
		}
	}
}

/**
 * Projects this lexicon onto a smaller substate space: each old substate's
 * score contributes to its mapped substate, weighted by its conditional
 * probability.
 */
public SimpleLexicon projectLexicon(double[] condProbs, int[][] mapping, int[][] toSubstateMapping) {
	short[] newNumSubStates = new short[numSubStates.length];
	for (int state = 0; state < numSubStates.length; state++) {
		newNumSubStates[state] = (short) toSubstateMapping[state][0];
	}
	SimpleLexicon newLexicon = this.copyLexicon();
	double[][][] newScores = new double[scores.length][][];
	for (short tag = 0; tag < expectedCounts.length; tag++) {
		newScores[tag] = new double[newNumSubStates[tag]][expectedCounts[tag][0].length];
		for (int word = 0; word < expectedCounts[tag][0].length; word++) {
			for (int substate = 0; substate < numSubStates[tag]; substate++) {
				newScores[tag][toSubstateMapping[tag][substate + 1]][word]
						+= condProbs[mapping[tag][substate]] * scores[tag][substate][word];
			}
		}
	}
	newLexicon.numSubStates = newNumSubStates;
	newLexicon.scores = newScores;
	return newLexicon;
}

public Smoother getSmoother() {
	return smoother;
}

/* (non-Javadoc)
 * @see edu.berkeley.nlp.HDPPCFG.LexiconInterface#getSmoothingParams()
 */
public double[] getSmoothingParams() {
	// TODO Auto-generated method stub
	return null;
}

/* (non-Javadoc)
 * @see edu.berkeley.nlp.HDPPCFG.LexiconInterface#logarithmMode()
 */
public void setSmoother(Smoother smoother) {
	this.smoother = smoother;
}

public double getPruningThreshold() {
	return threshold;
}

/* (non-Javadoc)
 * @see edu.berkeley.nlp.PCFGLA.Lexicon#scoreSignature(java.lang.String, short, int)
 */
public double[] scoreSignature(StateSet stateSet, int tag) {
	// TODO Auto-generated method stub
	return null;
}

/* (non-Javadoc)
 * @see edu.berkeley.nlp.PCFGLA.Lexicon#scoreWord(java.lang.String, short)
 */
public double[] scoreWord(StateSet stateSet, int tag) {
	// TODO Auto-generated method stub
	return null;
}

/**
 * Caches each word's global index on its StateSet (and resets sigIndex to
 * -1) so that later score() lookups don't have to consult the wordIndexer.
 * Words not present in the indexer get wordIndex == -1.
 */
public void labelTrees(StateSetTreeList trainTrees) {
	for (Tree<StateSet> tree : trainTrees) {
		List<StateSet> words = tree.getYield();
		for (StateSet word : words) {
			word.wordIndex = wordIndexer.indexOf(word.getWord());
			word.sigIndex = -1;
		}
	}
}

/**
 * Bidirectional mapping between a sparse set of non-negative ints (global
 * word indices, bounded by the capacity given at construction) and a dense
 * range [0, size()). Both directions return -1 for "not present".
 * Fix over the previous version: get()/indexOf() used to throw
 * ArrayIndexOutOfBoundsException for negative arguments, yet callers pass
 * values that can legitimately be -1 (e.g. wordIndexer.indexOf of an unseen
 * word); all negative inputs now map to the -1 "not found" sentinel.
 */
public class IntegerIndexer implements Serializable {
	private int[] indexTo;   // sparse index -> dense index, -1 if absent
	private int[] indexFrom; // dense index -> sparse index, -1 if unassigned
	private int n;           // number of entries added so far

	IntegerIndexer(int capacity) {
		indexTo = new int[capacity];
		indexFrom = new int[capacity];
		Arrays.fill(indexTo, -1);
		Arrays.fill(indexFrom, -1);
		n = 0;
	}

	/** Adds sparse index i if not already present; negative values (the
	 * "not found" sentinel) are ignored. */
	public void add(int i) {
		if (i < 0) return; // previously only -1 was guarded; -2 etc. crashed
		if (indexTo[i] == -1) {
			indexTo[i] = n;
			indexFrom[n] = i;
			n++;
		}
	}

	/** Dense index -> sparse index, or -1 if out of range. */
	public int get(int i) {
		if (i >= 0 && i < indexFrom.length)
			return indexFrom[i];
		else
			return -1;
	}

	/** Sparse index -> dense index, or -1 if unknown or out of range. */
	public int indexOf(int i) {
		if (i >= 0 && i < indexTo.length)
			return indexTo[i];
		else
			return -1;
	}

	/** Number of distinct sparse indices added. */
	public int size() {
		return n;
	}

	/** Deep copy (arrays cloned, same capacity). */
	public IntegerIndexer copy() {
		IntegerIndexer copy = new IntegerIndexer(indexFrom.length);
		copy.n = n;
		copy.indexFrom = this.indexFrom.clone();
		copy.indexTo = this.indexTo.clone();
		return copy;
	}
}

/* (non-Javadoc)
 * @see edu.berkeley.nlp.PCFGLA.Lexicon#computeScores()
 */
public void explicitlyComputeScores(int finalLevel) {
	// TODO Auto-generated method stub
}

public Counter<String> getWordCounter() {
	return null;
}

public void tieRareWordStats(int threshold) {
	return;
}
}