package cmu.arktweetnlp.impl;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;

import cmu.arktweetnlp.util.BasicFileIO;
import edu.berkeley.nlp.util.ArrayUtil;
import edu.berkeley.nlp.util.Triple;
import edu.stanford.nlp.math.ArrayMath;
import edu.stanford.nlp.util.Pair;

/**
 * This contains
 *
 * (1) Feature and label vocabularies (therefore knowledge of numberization)
 * (2) Model coefficients (and knowledge how to flattenize them for LBFGS's sake)
 * (3) Decoding/posterior and gradient computation
 */
public class Model {
	public Vocabulary labelVocab;
	public Vocabulary featureVocab;

	/** dim: N_labels **/
	public double[] biasCoefs;

	/** dim: (N_labels+1 x N_labels); row index is the previous label, or startMarker() for position 0. **/
	public double[][] edgeCoefs;

	/** dim: (N_base_features x N_labels) **/
	public double[][] observationFeatureCoefs;

	public Model() {
		labelVocab = new Vocabulary();
		featureVocab = new Vocabulary();
	}

	// Initialized in loadModelFromText.
	// NOTE(review): lockdownAfterFeatureExtraction() does NOT set this field;
	// presumably the training driver assigns it separately — confirm against callers.
	public int numLabels;

	/**
	 * The synthetic "start of sentence" label ID: one past the last real label.
	 * Only valid once the label vocabulary is locked.
	 */
	public int startMarker() {
		assert labelVocab.isLocked();
		int lastLabel = labelVocab.size() - 1;
		return lastLabel + 1;
	}

	/** Locks both vocabularies and allocates coefficient arrays sized to them. */
	public void lockdownAfterFeatureExtraction() {
		labelVocab.lock();
		featureVocab.lock();
		allocateCoefs(labelVocab.size(), featureVocab.size());
	}

	/** Allocates (zeroed) bias, edge, and observation coefficient arrays. */
	public void allocateCoefs(int numLabels, int numObsFeats) {
		observationFeatureCoefs = new double[numObsFeats][numLabels];
		edgeCoefs = new double[numLabels + 1][numLabels];
		biasCoefs = new double[numLabels];
	}

	/**
	 * "given labels" i.e. at trainingtime labels are observed.
	 * You hide the current one and predict it given you know the previous.
	 * So you get funny incremental posteriors per position that an MEMM uses at trainingtime.
	 * (They don't have a proper full-model posterior marginal
	 * interpretation like a CRF forward-backward-computed posterior does. no?)
	 *
	 * @param sentence - must have its .labels set
	 * @return posterior marginals, dim (T x N_label)
	 */
	public double[][] inferPosteriorGivenLabels(ModelSentence sentence) {
		double[][] posterior = new double[sentence.T][labelVocab.size()];
		double[] labelScores = new double[numLabels];
		for (int t = 0; t < sentence.T; t++) {
			// start in log space
			computeLabelScores(t, sentence, labelScores);
			// switch to exp space
			ArrayUtil.expInPlace(labelScores);
			double Z = ArrayUtil.sum(labelScores);
			for (int k = 0; k < numLabels; k++) {
				posterior[t][k] = labelScores[k] / Z;
			}
		}
		return posterior;
	}

	/**
	 * THIS CLOBBERS THE LABELS, stores its decoding into them.
	 * Does progressive rolling edge feature extraction: each decoded label
	 * becomes the edge feature of the next position.
	 *
	 * @param storeConfidences if true, also fills sentence.confidences with the
	 *        per-position posterior probability of the chosen label
	 **/
	public void greedyDecode(ModelSentence sentence, boolean storeConfidences) {
		int T = sentence.T;
		sentence.labels = new int[T];
		sentence.edgeFeatures[0] = startMarker();

		if (storeConfidences)
			sentence.confidences = new double[T];

		double[] labelScores = new double[numLabels];
		for (int t = 0; t < T; t++) {
			computeLabelScores(t, sentence, labelScores);
			sentence.labels[t] = ArrayMath.argmax(labelScores);
			if (t < T - 1)
				sentence.edgeFeatures[t + 1] = sentence.labels[t];

			if (storeConfidences) {
				// normalize the (log-space) scores into a distribution
				ArrayMath.expInPlace(labelScores);
				double Z = ArrayMath.sum(labelScores);
				ArrayMath.multiplyInPlace(labelScores, 1.0 / Z);
				sentence.confidences[t] = labelScores[sentence.labels[t]];
			}
		}
	}

	/**
	 * This needs forward-backward I think
	 * @return dim: (T x K) posterior marginals at each position
	 */
	public double[][] inferPosteriorForUnknownLabels(ModelSentence sentence) {
		assert false : "Unimplemented";
		return null;
	}

	/**
	 * vit[t][k] is the max probability such that the sequence
	 * from 0 to t has token t labeled with tag k. (0<=t<T)
	 * bptr[t][k] gives the max prob. tag of token t-1 (t=0->startMarker)
	 */
	public void viterbiDecode(ModelSentence sentence) {
		int T = sentence.T;
		sentence.labels = new int[T];
		int[][] bptr = new int[T][numLabels];
		double[][] vit = new double[T][numLabels];
		double[] labelScores = new double[numLabels];
		computeVitLabelScores(0, startMarker(), sentence, labelScores);
		ArrayUtil.logNormalize(labelScores);
		// initialization
		vit[0] = labelScores;
		for (int k = 0; k < numLabels; k++) {
			bptr[0][k] = startMarker();
		}
		for (int t = 1; t < T; t++) {
			double[][] prevcurr = new double[numLabels][numLabels];
			for (int s = 0; s < numLabels; s++) {
				computeVitLabelScores(t, s, sentence, prevcurr[s]);
				ArrayUtil.logNormalize(prevcurr[s]);
				prevcurr[s] = ArrayUtil.add(prevcurr[s], labelScores[s]);
			}
			for (int s = 0; s < numLabels; s++) {
				double[] sprobs = getColumn(prevcurr, s);
				bptr[t][s] = ArrayUtil.argmax(sprobs);
				vit[t][s] = sprobs[bptr[t][s]];
			}
			labelScores = vit[t];
		}
		sentence.labels[T - 1] = ArrayUtil.argmax(vit[T - 1]);
		int backtrace = bptr[T - 1][sentence.labels[T - 1]];
		for (int i = T - 2; (i >= 0) && (backtrace != startMarker()); i--) { // termination
			sentence.labels[i] = backtrace;
			backtrace = bptr[i][backtrace];
		}
		assert (backtrace == startMarker());
	}

	/**
	 * Extracts column {@code col} of {@code matrix}.
	 * BUG FIX: the loop previously iterated over matrix[0].length (the column
	 * count) while filling an array of length matrix.length (the row count) —
	 * wrong for any non-square matrix.
	 */
	private double[] getColumn(double[][] matrix, int col) {
		double[] column = new double[matrix.length];
		for (int i = 0; i < matrix.length; i++) {
			column[i] = matrix[i][col];
		}
		return column;
	}

	/**
	 * Minimum Bayes risk decoding: picks the argmax of the posterior marginal
	 * at each position.
	 * NOTE(review): inferPosteriorForUnknownLabels() is unimplemented and
	 * returns null, so this currently cannot succeed at runtime.
	 */
	public void mbrDecode(ModelSentence sentence) {
		double[][] posterior = inferPosteriorForUnknownLabels(sentence);
		for (int t = 0; t < sentence.T; t++) {
			sentence.labels[t] = ArrayMath.argmax(posterior[t]);
		}
	}

	/**
	 * Computes unnormalized log-potentials.
	 * CLOBBERS labelScores
	 **/
	public void computeLabelScores(int t, ModelSentence sentence, double[] labelScores) {
		Arrays.fill(labelScores, 0);
		computeBiasScores(labelScores);
		computeEdgeScores(t, sentence, labelScores);
		computeObservedFeatureScores(t, sentence, labelScores);
	}

	/** Like computeLabelScores, but uses an explicit previous label instead of sentence.edgeFeatures. CLOBBERS labelScores. */
	public void computeVitLabelScores(int t, int prior, ModelSentence sentence,
			double[] labelScores) {
		Arrays.fill(labelScores, 0);
		computeBiasScores(labelScores);
		viterbiEdgeScores(prior, sentence, labelScores);
		computeObservedFeatureScores(t, sentence, labelScores);
	}

	/** Adds into labelScores **/
	public void computeBiasScores(double[] labelScores) {
		for (int k = 0; k < numLabels; k++) {
			labelScores[k] += biasCoefs[k];
		}
	}

	/** Adds into labelScores **/
	public void computeEdgeScores(int t, ModelSentence sentence, double[] labelScores) {
		int prev = sentence.edgeFeatures[t];
		for (int k = 0; k < numLabels; k++) {
			labelScores[k] += edgeCoefs[prev][k];
		}
	}

	/** @return dim T array s.t. labelScores[t]+=score of label prior followed by label t **/
	public void viterbiEdgeScores(int prior, ModelSentence sentence, double[] EdgeScores) {
		for (int k = 0; k < numLabels; k++) {
			EdgeScores[k] += edgeCoefs[prior][k];
		}
	}

	/** Adds into labelScores: weighted sum of the active observation features at position t. **/
	public void computeObservedFeatureScores(int t, ModelSentence sentence, double[] labelScores) {
		for (int k = 0; k < numLabels; k++) {
			for (Pair<Integer, Double> pair : sentence.observationFeatures.get(t)) {
				labelScores[k] += observationFeatureCoefs[pair.first][k] * pair.second;
			}
		}
	}

	/**
	 * Elementwise product of three equal-length arrays.
	 * @throws RuntimeException if the lengths differ
	 */
	public double[] ThreewiseMultiply(double[] a, double[] b, double[] c) {
		if ((a.length != b.length) || (b.length != c.length)) {
			throw new RuntimeException("ThreewiseMultiply: array lengths differ");
		}
		double[] result = new double[a.length];
		for (int i = 0; i < result.length; i++) {
			result[i] = a[i] * b[i] * c[i];
		}
		return result;
	}

	/**
	 * Training-only
	 *
	 * add-in loglik gradient (direction of higher likelihood)
	 **/
	public void computeGradient(ModelSentence sentence, double[] grad) {
		assert grad.length == flatIDsize();
		int T = sentence.T;
		double[][] posterior = inferPosteriorGivenLabels(sentence);
		for (int t = 0; t < T; t++) {
			int prevLabel = sentence.edgeFeatures[t];
			int y = sentence.labels[t];
			// add empirical counts, subtract model-expected-counts
			for (int k = 0; k < numLabels; k++) {
				double p = posterior[t][k];
				int empir = y == k ? 1 : 0;
				grad[biasFeature_to_flatID(k)] += empir - p;
				grad[edgeFeature_to_flatID(prevLabel, k)] += empir - p;
				for (Pair<Integer, Double> fv : sentence.observationFeatures.get(t)) {
					grad[observationFeature_to_flatID(fv.first, k)] += (empir - p) * fv.second;
				}
			}
		}
	}

	/** Sum over positions of the log incremental posterior of the gold label. */
	public double computeLogLik(ModelSentence s) {
		double[][] posterior = inferPosteriorGivenLabels(s);
		double loglik = 0;
		for (int t = 0; t < s.T; t++) {
			int y = s.labels[t];
			loglik += Math.log(posterior[t][y]);
		}
		return loglik;
	}

	/////////////////////////////////////////////////////////
	// Flat-version conversion routines
	// (If this was C++ we could do something clever with memory layout instead to avoid this.)
	// (Or we could do said clever things in Java atop a flat representation, but that would be painful.)

	/** Unpacks a flat LBFGS-style coefficient vector into the structured arrays. */
	public void setCoefsFromFlat(double[] flatCoefs) {
		for (int k = 0; k < numLabels; k++) {
			biasCoefs[k] = flatCoefs[biasFeature_to_flatID(k)];
		}
		for (int prevLabel = 0; prevLabel < numLabels + 1; prevLabel++) {
			for (int k = 0; k < numLabels; k++) {
				edgeCoefs[prevLabel][k] = flatCoefs[edgeFeature_to_flatID(prevLabel, k)];
			}
		}
		for (int feat = 0; feat < featureVocab.size(); feat++) {
			for (int k = 0; k < numLabels; k++) {
				observationFeatureCoefs[feat][k] =
						flatCoefs[observationFeature_to_flatID(feat, k)];
			}
		}
	}

	/** Inverse of setCoefsFromFlat: packs the structured arrays into one flat vector. */
	public double[] convertCoefsToFlat() {
		double[] flatCoefs = new double[flatIDsize()];
		for (int k = 0; k < numLabels; k++) {
			flatCoefs[biasFeature_to_flatID(k)] = biasCoefs[k];
		}
		for (int prevLabel = 0; prevLabel < numLabels + 1; prevLabel++) {
			for (int k = 0; k < numLabels; k++) {
				flatCoefs[edgeFeature_to_flatID(prevLabel, k)] = edgeCoefs[prevLabel][k];
			}
		}
		for (int feat = 0; feat < featureVocab.size(); feat++) {
			for (int k = 0; k < numLabels; k++) {
				flatCoefs[observationFeature_to_flatID(feat, k)] =
						observationFeatureCoefs[feat][k];
			}
		}
		return flatCoefs;
	}

	/////////////////////////////////////////////////////////////////////////

	/** Total length of the flat coefficient vector. */
	public int flatIDsize() {
		int K = labelVocab.size();
		int J = featureVocab.size();
		// bias terms + edge features + observation features
		return K + (K + 1) * K + J * K;
	}

	private int biasFeature_to_flatID(int label) {
		return label;
	}

	private int edgeFeature_to_flatID(int before, int current) {
		int K = labelVocab.size();
		return K + before * K + current;
	}

	private int observationFeature_to_flatID(int featID, int label) {
		int K = labelVocab.size();
		return K + (K + 1) * K + featID * K + label;
	}

	//////////////////////////////////////////////////

	/* todo, think about binary format.  idea
	   NumLabels\n[[binary blob for biases]][[binary blob for edge coefs]]
	   NumObsFeats\n[[binary blob for obs feats]]
	   where NumLabels and NumObsFeats are plaintext.
	   there is no separator after the binary blobs, you infer that from NumLabels and NumObsFeats
	*/

	/**
	 * Writes the model in the plaintext format read by loadModelFromText:
	 * bias lines, then edge lines, then one line per nonzero observation coef.
	 * Streams are now closed in a finally block so they are not leaked if a
	 * write fails.
	 */
	public void saveModelAsText(String outputFilename) throws IOException {
		BufferedWriter writer = BasicFileIO.openFileToWriteUTF8(outputFilename);
		PrintWriter out = new PrintWriter(writer);
		try {
			for (int k = 0; k < numLabels; k++) {
				out.printf("***BIAS***\t%s\t%g\n", labelVocab.name(k), biasCoefs[k]);
			}
			for (int prevLabel = 0; prevLabel < numLabels + 1; prevLabel++) {
				for (int curLabel = 0; curLabel < numLabels; curLabel++) {
					out.printf("***EDGE***\t%s %s\t%s\n",
							prevLabel, curLabel, edgeCoefs[prevLabel][curLabel]);
				}
			}
			assert featureVocab.size() == observationFeatureCoefs.length;
			for (int f = 0; f < featureVocab.size(); f++) {
				for (int k = 0; k < numLabels; k++) {
					// sparse output: zeros are implicit
					if (observationFeatureCoefs[f][k] == 0) continue;
					out.printf("%s\t%s\t%g\n",
							featureVocab.name(f), labelVocab.name(k),
							observationFeatureCoefs[f][k]);
				}
			}
		} finally {
			out.close();
			writer.close();
		}
	}

	/**
	 * Reads a model in the format written by saveModelAsText.
	 *
	 * BUG FIX: the original used do-while loops that called line.split() without
	 * checking for end-of-file, so a truncated file (e.g. no observation-feature
	 * section) threw a NullPointerException; the loops below are null-guarded
	 * but process well-formed files identically. The reader is also closed in a
	 * finally block, and Triple is constructed with explicit type arguments
	 * instead of the raw type.
	 */
	public static Model loadModelFromText(String filename) throws IOException {
		Model model = new Model();
		BufferedReader reader = BasicFileIO.openFileOrResource(filename);
		try {
			String line;

			ArrayList<Double> biasCoefs = new ArrayList<Double>();
			ArrayList<Triple<Integer, Integer, Double>> edgeCoefs =
					new ArrayList<Triple<Integer, Integer, Double>>();
			ArrayList<Triple<Integer, Integer, Double>> obsCoefs =
					new ArrayList<Triple<Integer, Integer, Double>>();

			// Section 1: bias lines. Loop exits holding the first non-bias line.
			while ((line = reader.readLine()) != null) {
				String[] parts = line.split("\t");
				if (!parts[0].equals("***BIAS***")) break;
				model.labelVocab.num(parts[1]);
				biasCoefs.add(Double.parseDouble(parts[2]));
			}
			model.labelVocab.lock();
			model.numLabels = model.labelVocab.size();

			// Section 2: edge lines. Loop exits holding the first non-edge line.
			while (line != null) {
				String[] parts = line.split("\t");
				if (!parts[0].equals("***EDGE***")) break;
				String[] edgePair = parts[1].split(" ");
				int prev = Integer.parseInt(edgePair[0]);
				int cur = Integer.parseInt(edgePair[1]);
				edgeCoefs.add(new Triple<Integer, Integer, Double>(
						prev, cur, Double.parseDouble(parts[2])));
				line = reader.readLine();
			}

			// Section 3: observation-feature lines, to end of file.
			while (line != null) {
				String[] parts = line.split("\t");
				int f = model.featureVocab.num(parts[0]);
				int k = model.labelVocab.num(parts[1]);
				obsCoefs.add(new Triple<Integer, Integer, Double>(
						f, k, Double.parseDouble(parts[2])));
				line = reader.readLine();
			}
			model.featureVocab.lock();

			model.allocateCoefs(model.labelVocab.size(), model.featureVocab.size());

			for (int k = 0; k < model.numLabels; k++) {
				model.biasCoefs[k] = biasCoefs.get(k);
			}
			for (Triple<Integer, Integer, Double> x : edgeCoefs) {
				model.edgeCoefs[x.getFirst()][x.getSecond()] = x.getThird();
			}
			for (Triple<Integer, Integer, Double> x : obsCoefs) {
				model.observationFeatureCoefs[x.getFirst()][x.getSecond()] = x.getThird();
			}
		} finally {
			reader.close();
		}
		return model;
	}

	/**
	 * Copies coefs from sourceModel into destModel.
	 * For observation features, only copies features that exist in both.
	 * (Therefore if a feature exists in destModel but not sourceModel, it's not touched.)
	 */
	public static void copyCoefsForIntersectingFeatures(Model sourceModel, Model destModel) {
		int K = sourceModel.numLabels;

		// We could do the name-checking intersection trick for label vocabs, but punt for now
		if (K != destModel.numLabels)
			throw new RuntimeException("label vocabs must be same size for warm-start");
		for (int k = 0; k < K; k++) {
			if (!destModel.labelVocab.name(k).equals(sourceModel.labelVocab.name(k))) {
				throw new RuntimeException("label vocabs must agree for warm-start");
			}
		}

		destModel.biasCoefs = ArrayUtil.copy(sourceModel.biasCoefs);
		destModel.edgeCoefs = ArrayUtil.copy(sourceModel.edgeCoefs);

		// observation features need the intersection
		for (int sourceFeatID = 0; sourceFeatID < sourceModel.featureVocab.size(); sourceFeatID++) {
			String featName = sourceModel.featureVocab.name(sourceFeatID);
			if (destModel.featureVocab.contains(featName)) {
				int destFeatID = destModel.featureVocab.num(featName);
				destModel.observationFeatureCoefs[destFeatID] =
						ArrayUtil.copy(sourceModel.observationFeatureCoefs[sourceFeatID]);
			}
		}
	}
}