AffineProbMetric.java example

Explorer
wekax-master
- weka-3-6-2
- wekaUT
  - GetAllSubPackages.java
  - weka
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    AffineProbMetric.java
 *    Copyright (C) 2002-3 Mikhail Bilenko
 *
 */

package weka.deduping.metrics;

import java.text.*;
import java.io.*;
import java.util.*;
import weka.core.*;
import weka.deduping.*;
  

/** AffineProbMetric class implements a probabilistic model string edit distance with affine-cost gaps
 *
 *   @author Mikhail Bilenko<mbilenko@cs.utexas.edu>
 *   @version 1.1
 **/
public class AffineProbMetric extends StringMetric implements LearnableStringMetric, Serializable, OptionHandler {

  /* Current probabilities, log-probabilities and accumulated expectations
     for each edit operation */
  protected double [][] m_editopProbs;
  protected double [][] m_editopLogProbs;
  protected double [][] m_editopOccs;

  /** Parameters for the generative model */
  protected double m_noopProb, m_noopLogProb, m_noopOccs;  // matching
  protected double m_endAtSubProb, m_endAtSubLogProb, m_endAtSubOccs; // ending the alignment at M state
  protected double m_endAtGapProb, m_endAtGapLogProb, m_endAtGapOccs; // ending the alignment at D/I states
  protected double m_gapStartProb, m_gapStartLogProb, m_gapStartOccs; // starting a gap in the alignment
  protected double m_gapExtendProb, m_gapExtendLogProb, m_gapExtendOccs; // extending a gap in the alignment
  protected double m_gapEndProb, m_gapEndLogProb, m_gapEndOccs;  // ending a gap in the alignment
  protected double m_subProb, m_subLogProb, m_subOccs;  // continuing to match/substitute in state M

  /** parameters for the additive model, obtained from log-probs to speed up
      computations in the "testing" phase after weights have been learned */
  protected double [][] m_editopCosts; 
  protected double m_noopCost;
  protected double m_endAtSubCost; 
  protected double m_endAtGapCost; 
  protected double m_gapStartCost;
  protected double m_gapExtendCost;
  protected double m_gapEndCost;
  protected double m_subCost;

  /** true if we are using a generative model for distance in the "testing" phase after learning the parameters
      By default we want to use the additive model that uses probabilities converted to costs*/
  protected boolean m_useGenerativeModel = false;

  /** Maximum number of iterations for training the model; usually converge in <10 iterations */
  protected int m_numIterations = 20;

  /** Normalization of edit distance by string length; equivalent to
    using the posterior probability in the generative model*/
  protected boolean m_normalized = true;

  /** Minimal value of a probability parameter.  Particularly important when
   training sets are small to prevent zero probabilities. */
  protected double m_clampProb = 1e-5;

  /** A handy constant for insertions/deletions, we treat them as substitution with a null character */
  protected final char blank = 0;

  /** TODO:  given a corpus, populate this array with the characters that are actually encountered */
  protected char [] m_usedChars = {'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p',
				    'q','r','s','t','u','v','w','x','y','z',' ','!','\"','#','$','%',
				    '&','\'','(',')','*','+',',','-','.','/','0','1','2','3','4','5','6',
				    '7','8','9',':',';','<','=','>','?','@','[','\\',']','^','_','`','{',
				    '|','}','~'};

  
  /** We can have different ways of converting from distance to similarity  */
  public static final int CONVERSION_LAPLACIAN = 1;
  public static final int CONVERSION_UNIT = 2;
  public static final int CONVERSION_EXPONENTIAL = 4;
  public static final Tag[] TAGS_CONVERSION = {
    new Tag(CONVERSION_UNIT, "similarity = 1-distance"),
    new Tag(CONVERSION_LAPLACIAN, "similarity=1/(1+distance)"),
    new Tag(CONVERSION_EXPONENTIAL, "similarity=exp(-distance)")
      };
  /** The method of converting, by default laplacian */
  protected int m_conversionType = CONVERSION_EXPONENTIAL;

  protected boolean m_verbose = false; 

  /**
   * set up an instance of AffineProbMetric
   */
  public AffineProbMetric () {
    m_editopProbs = new double[128][128];
    m_editopLogProbs = new double[128][128];
    m_editopOccs = new double[128][128];
    m_editopCosts = new double[128][128];
    initProbs();
    normalizeEmissionProbs();
    normalizeTransitionProbs();
    updateLogProbs();
    initCosts();
  }

  /**
   * Calculate the forward matrices
   * @param _s1 first string
   * @param _s2 second string
   * @return m_endAtSubProb*matrix[l1][l2][0] + m_endAtGapProb(matrix[l1][l2][1] +
   * matrix[l1][l2][2]) extendains the distance value
   */
  protected double[][][] forward (String _s1, String _s2) {
    char [] s1 = _s1.toCharArray();
    char [] s2 = _s2.toCharArray();
    int l1 = s1.length, l2 = s2.length;
    double matrix[][][] = new double[l1 + 1][l2 + 1][3];
    double tmpLog, subProb, tmpLog1;

    // initialization
    for (int i = 0; i <=l1; i++)
      matrix[i][0][0] = matrix[i][0][1] = matrix[i][0][2] = Double.NEGATIVE_INFINITY;
    for (int j = 1; j <=l2; j++)
      matrix[0][j][0] = matrix[0][j][1] = matrix[0][j][2] = Double.NEGATIVE_INFINITY;
    matrix[0][0][0] = 0;
	
    // border rows
    for (int j = 1; j <=l2; j++) {
      tmpLog = logSum(m_gapExtendLogProb + matrix[0][j-1][2], m_gapStartLogProb + matrix[0][j-1][0]); 
      matrix[0][j][2] = m_editopLogProbs[blank][s2[j-1]] + tmpLog;
    }
    for (int i = 1; i <= l1; i++) {
      tmpLog = logSum(m_gapStartLogProb + matrix[i-1][0][0], m_gapExtendLogProb + matrix[i-1][0][1]);
      matrix[i][0][1] = m_editopLogProbs[blank][s1[i-1]] + tmpLog;
    }
    
    // the rest
    for (int i = 1; i <= l1; i++) {
      for (int j = 1; j <= l2; j++) {
	tmpLog = logSum(m_gapStartLogProb + matrix[i-1][j][0], m_gapExtendLogProb + matrix[i-1][j][1]);
	matrix[i][j][1] = m_editopLogProbs[blank][s1[i-1]] + tmpLog;
	tmpLog = logSum(m_gapExtendLogProb + matrix[i][j-1][2], m_gapStartLogProb + matrix[i][j-1][0]);
	matrix[i][j][2] = m_editopLogProbs[blank][s2[j-1]] + tmpLog;
	subProb = ((s1[i-1] == s2[j-1]) ? m_noopLogProb : m_editopLogProbs[s1[i-1]][s2[j-1]]);
	tmpLog1 = logSum(m_subLogProb + matrix[i-1][j-1][0], m_gapEndLogProb + matrix[i-1][j-1][2]);
	tmpLog = logSum(m_gapEndLogProb + matrix[i-1][j-1][1], tmpLog1);
	matrix[i][j][0] =  subProb + tmpLog;
      }
    }
    return matrix;
  }

  /**
   * Calculate the backward matrices
   * @param _s1 first string
   * @param _s2 second string
   * @return matrix[0][0][0] extendains the distance value
   */
  protected double[][][] backward (String _s1, String _s2) {
    char [] s1 = _s1.toCharArray();
    char [] s2 = _s2.toCharArray();
    int l1 = s1.length, l2 = s2.length;
    double matrix[][][] = new double[l1 + 1][l2 + 1][3];
    double sub_pairProb, del_charProb, ins_charProb, tmpLog;

    // initialize
    for (int i = 0; i <=l1; i++)
      matrix[i][l2][0] = matrix[i][l2][1] = matrix[i][l2][2] = Double.NEGATIVE_INFINITY;
    for (int j = 0; j <=l2; j++)
      matrix[l1][j][0] = matrix[l1][j][1] = matrix[l1][j][2] = Double.NEGATIVE_INFINITY;
    matrix[l1][l2][0] = m_endAtSubLogProb;
    matrix[l1][l2][1] = matrix[l1][l2][2] = m_endAtGapLogProb;

    // border rows
    for (int i = l1-1; i >= 0; i--) {
      matrix[i][l2][0] = m_editopLogProbs[blank][s1[i]] + m_gapStartLogProb + matrix[i+1][l2][1];
      matrix[i][l2][1] = m_editopLogProbs[blank][s1[i]] + m_gapExtendLogProb + matrix[i+1][l2][1];
    }
    for (int j = l2-1; j >= 0; j--) {
      matrix[l1][j][0] = m_editopLogProbs[blank][s2[j]] + m_gapStartLogProb + matrix[l1][j+1][2];
      matrix[l1][j][2] = m_editopLogProbs[blank][s2[j]] + m_gapExtendLogProb + matrix[l1][j+1][2];
    }
    // fill the rest of the matrix
    for (int i = l1-1; i >= 0; i--) {
      for (int j = l2-1; j >= 0; j--) {
	ins_charProb = m_editopLogProbs[blank][s1[i]];
	del_charProb = m_editopLogProbs[blank][s2[j]];
	sub_pairProb = ((s1[i] == s2[j]) ? m_noopLogProb : m_editopLogProbs[s1[i]][s2[j]]);
	matrix[i][j][1] = logSum(ins_charProb + m_gapExtendLogProb + matrix[i+1][j][1],
				 sub_pairProb + m_gapEndLogProb + matrix[i+1][j+1][0]);
	matrix[i][j][2] = logSum(del_charProb + m_gapExtendLogProb + matrix[i][j+1][2],
				 sub_pairProb + m_gapEndLogProb + matrix[i+1][j+1][0]);
	tmpLog = logSum(ins_charProb + matrix[i+1][j][1], del_charProb + matrix[i][j+1][2]);
	matrix[i][j][0] = logSum(sub_pairProb + m_subLogProb + matrix[i+1][j+1][0],
				 m_gapStartLogProb + tmpLog);
      }
    }
    return matrix;
  }


    /**
   * print out the three matrices
   */
  public void printMatrices(String s1, String s2) {
    double[][][] forward = forward(s1, s2);
    double[][][] backward = backward(s1, s2);
    int l1 = s1.length(), l2 = s2.length();

    double totalForward = logSum(m_endAtSubLogProb + forward[l1][l2][0], m_endAtGapLogProb + forward[l1][l2][1]);
    totalForward = logSum(totalForward, m_endAtGapLogProb + forward[l1][l2][2]);
    System.out.println("\nB:" + backward[0][0][0] + "\tF:" + totalForward);
    
    System.out.println("\n***FORWARD***\nSUBSTITUTION:");
    printAlignmentMatrix(s1, s2, 0, forward);

    System.out.println("\n\nDELETION:");
    printAlignmentMatrix(s1, s2, 1, forward);

    System.out.println("\n\nINSERTION:");
    printAlignmentMatrix(s1, s2, 2, forward);


    System.out.println("\n***BACKWARD***\nSUBSTITUTION:");
    printAlignmentMatrix(s1, s2, 0, backward);

    System.out.println("\n\nDELETION:");
    printAlignmentMatrix(s1, s2, 1, backward);

    System.out.println("\n\nINSERTION:");
    printAlignmentMatrix(s1, s2, 2, backward);
  }

  public void printAlignmentMatrix(String _s1, String _s2, int idx, double[][][] matrix) {
    DecimalFormat fmt = new DecimalFormat ("0.000");
    char[] s1 = _s1.toCharArray();
    char[] s2 = _s1.toCharArray();
    
    System.out.print('\t');
    for (int i = 0; i < s2.length; i++) {
      System.out.print("\t" + s2[i]);
    }
    System.out.println();
    for (int i = 0; i < matrix.length; i++) {
      if (i > 0) System.out.print(s1[i-1] + "\t");  else System.out.print("\t");
      for (int j = 0; j < matrix[i].length; j++) {
	System.out.print(fmt.format(matrix[i][j][idx]) + "\t");
      }
      System.out.println();
    }
  } 
  
  /**
   * print out some data in case things go wrong
   */
  protected void printOpProbs() {
    System.out.println("extend_gap_op.prob=" + m_gapExtendProb +
		       "  end_gap_op.prob=" + m_gapEndProb +
		       "  subst_op.prob=" + m_subProb);
  }


        
  /**
   *  Train the distance parameters using provided examples using EM
   * @param matched_pairs Each member is a String[] extendaining two matching fields
   * @param matched_pairs Each member is a String[] extendaining two non-matching fields
   */ 
  public void trainMetric (ArrayList pairList) throws Exception {
    initProbs();
    recordCosts(0);

    // convert the training data to lower case
    for (int j = 0; j < pairList.size(); j++) {
      StringPair pair = (StringPair)pairList.get(j);
      pair.str1 = pair.str1.toLowerCase();
      pair.str2 = pair.str2.toLowerCase();
    }
	
    try {
      // dump out the current probablities
      PrintWriter out = new PrintWriter(new FileWriter("/tmp/probs1"));

      double totalProb = 0;
      double prevTotalProb = -Double.MAX_VALUE;
      for (int i = 1; i <= m_numIterations && Math.abs(totalProb - prevTotalProb) > 1; i++) {
	resetOccurrences();
	out.println(i + "\t" + m_endAtSubProb + "\t" + m_subProb + "\t" + m_gapStartProb +
		    "\t" + m_endAtGapProb + "\t" + m_gapEndProb + "\t" + m_gapExtendProb + "\t" + m_noopProb);

	// go through positives
	prevTotalProb = totalProb;
	totalProb = 0;
	for (int j = 0; j < pairList.size(); j++) {
	  StringPair pair = (StringPair)pairList.get(j);
	  if (pair.positive) {
	    totalProb += expectationStep (pair.str1, pair.str2, 1, true);
	  }
	}
	// go through negatives  - TODO - discriminative training
	//	    for (int j = 0; j < negExamples.length; j++)
	//		expectationStep (negExamples[j][1], negExamples[j][0], 1, false);

	System.out.println(i + ". Total likelihood=" + totalProb + ";  prev=" + prevTotalProb);
	System.out.println("************ Accumulated expectations ******************** ");
	System.out.println("End_s=" + m_endAtSubOccs + "\tSub=" + m_subOccs + "\tStGap=" + m_gapStartOccs +
			   "\nEnd_g=" + m_endAtGapOccs + "\tEndGap=" + m_gapEndOccs + " ContGap=" + m_gapExtendOccs +
			   "\nNoop=" + m_noopOccs);
	System.out.println("********************************");
	maximizationStep ();
      }
      out.close();
    } catch (Exception e) { e.printStackTrace();}
    initCosts();
    recordCosts(1);
  }
    
  /**
   * Expectation  part of the EM algorithm
   *  accumulates expectations of editop probabilities over example pairs
   *  Expectation is calculated based on two examples which are either duplicates (pos=true)
   *  or non-duplicates (pos=false).  Lambda is a weighting parameter, 1 by default.
   * @param _s1 first string
   * @param _s2 second string
   * @param lambda learning rate parameter, 1 by default
   * @param pos_training true if strings are matched, false if mismatched
   */
  protected double expectationStep (String _s1, String _s2, int lambda, boolean pos_training) {
    int l1 = _s1.length(), l2 = _s2.length();
    if (l1 == 0 || l2 == 0) {
      return 0;
    }
    char [] s1 = _s1.toCharArray();
    char [] s2 = _s2.toCharArray();
    double fMatrix[][][] = forward (_s1, _s2);
    double bMatrix[][][] = backward (_s1, _s2);
    double stringProb = bMatrix[0][0][0];
//  NB: b[0][0][0]must be equal to endAtSub*f[l1][l2][0] + endAtGap*(f[l1][l2][1]+f[l1][l2][2]); uncomment below for sanity check
//  double totalForward = logSum(m_endAtSubLogProb + fMatrix[l1][l2][0], m_endAtSubLogProb + fMatrix[l1][l2][1]);
//  totalForward = logSum(totalForward, m_endAtSubLogProb + fMatrix[l1][l2][2]);
//  System.out.println("b:" + bMatrix[0][0][0] + "\tf:" + totalForward);    

    double occsSubst, occsStartGap_1, occsStartGap_2, occsExtendGap_1, occsExtendGap_2;
    double occsEndGap_1, occsEndGap_2;
    double sub_pairProb, ins_charProb, del_charProb;
    char s1_i, s2_j;

    if (stringProb == 0.0) {
      System.out.println("TROUBLE!!!!  s1=" + _s1 + " s2=" + _s2);
      printMatrices(_s1,_s2);
      return 0;
    }
    m_endAtSubOccs += lambda; 
    m_endAtGapOccs += 2*lambda;

    for (int i = 1; i < l1; i++) {
      for (int j = 1; j< l2; j++) {
	s1_i = s1[i-1];
	s2_j = s2[j-1];
	if (s1_i == s2_j) {
	  sub_pairProb = m_noopLogProb;
	} else {
	  sub_pairProb = m_editopLogProbs[s1_i][s2_j];
	}
	ins_charProb = m_editopLogProbs[blank][s1_i]; 
	del_charProb = m_editopLogProbs[blank][s2_j];

	// substituting or matching
	occsSubst = Math.exp(fMatrix[i-1][j-1][0] + sub_pairProb + m_subLogProb + bMatrix[i][j][0] - stringProb);
	if (s1_i == s2_j) {
	  m_noopOccs += occsSubst;
	} else { 
	  m_editopOccs[s1_i][s2_j] +=occsSubst;
	}
	m_subOccs += occsSubst;
		
	// starting a gap
	occsStartGap_1 = Math.exp(fMatrix[i-1][j][0] + ins_charProb + m_gapStartLogProb + bMatrix[i][j][1]-stringProb);
	m_editopOccs[blank][s1_i] += occsStartGap_1;
	occsStartGap_2 = Math.exp(fMatrix[i][j-1][0] + del_charProb + m_gapStartLogProb + bMatrix[i][j][2]-stringProb);
	m_editopOccs[blank][s2_j] += occsStartGap_2;
	m_gapStartOccs += occsStartGap_1 + occsStartGap_2;
		
	// extendinuing a gap     
	occsExtendGap_1 = Math.exp(fMatrix[i-1][j][1] + ins_charProb + m_gapExtendLogProb + bMatrix[i][j][1]-stringProb);
	m_editopOccs[blank][s1_i] += occsExtendGap_1;
	occsExtendGap_2 = Math.exp(fMatrix[i][j-1][2] + del_charProb + m_gapExtendLogProb + bMatrix[i][j][2]-stringProb);
	m_editopOccs[blank][s2_j] += occsExtendGap_2;
	m_gapExtendOccs += occsExtendGap_1 + occsExtendGap_2;
		
	// ending a gap
	occsEndGap_1 = Math.exp(fMatrix[i-1][j-1][1] + sub_pairProb + m_gapEndLogProb + bMatrix[i][j][0] - stringProb);
	if (s1_i == s2_j) {
	  m_noopOccs += occsEndGap_1;
	} else { 
	  m_editopOccs[s1_i][s2_j] += occsEndGap_1;
	}
	occsEndGap_2 = Math.exp(fMatrix[i-1][j-1][2] + sub_pairProb + m_gapEndLogProb + bMatrix[i][j][0] - stringProb);
	if (s1_i == s2_j) {
	  m_noopOccs += occsEndGap_2;
	} else { 
	  m_editopOccs[s1_i][s2_j] += occsEndGap_2;
	}
	m_gapEndOccs += occsEndGap_1 + occsEndGap_2;
      }
    }
    // border rows.  We can't end gap, and can start/extend gap only one way
    for (int i = 1; i < l1; i++) {
      s1_i = s1[i-1];
      s2_j = s2[l2-1];
      ins_charProb = m_editopLogProbs[blank][s1_i];
      if (s1_i == s2_j) {
	sub_pairProb = m_noopLogProb;
      } else {
	sub_pairProb = m_editopLogProbs[s1_i][s2_j];
      }

      occsSubst = Math.exp(fMatrix[i-1][l2-1][0] + sub_pairProb + m_subLogProb + bMatrix[i][l2][0] - stringProb);
      if (s1_i == s2_j) {
	m_noopOccs += occsSubst;
      } else { 
	m_editopOccs[s1_i][s2_j] += occsSubst;
      }
      m_subOccs += occsSubst;
		
      occsStartGap_1 = Math.exp(fMatrix[i-1][l2][0] + ins_charProb + m_gapStartLogProb + bMatrix[i][l2][1] - stringProb);
      m_editopOccs[blank][s1_i] += occsStartGap_1;
      m_gapStartOccs += occsStartGap_1;

      occsExtendGap_1 = Math.exp(fMatrix[i-1][l2][1] + ins_charProb + m_gapExtendLogProb + bMatrix[i][l2][1] - stringProb);
      m_editopOccs[blank][s1_i] += occsExtendGap_1;
      m_gapExtendOccs += occsExtendGap_1;   //  DO WE NEED THIS??? WE HAD NO CHOICE!
    }
    for (int j = 1; j < l2; j++) {
      s1_i = s1[l1-1];
      s2_j = s2[j-1];
      del_charProb = m_editopLogProbs[blank][s2_j];
      if (s1_i == s2_j) {
	sub_pairProb = m_noopLogProb;
      } else {
	sub_pairProb = m_editopLogProbs[s1_i][s2_j];
      }

      occsSubst = Math.exp(fMatrix[l1-1][j-1][0] + sub_pairProb + m_subLogProb + bMatrix[l1][j][0] - stringProb);
      if (s1_i == s2_j) {
	m_noopOccs += occsSubst;
      } else { 
	m_editopOccs[s1_i][s2_j] += occsSubst;
      }
      m_subOccs += occsSubst;
	    
      occsStartGap_2 = Math.exp(fMatrix[l1][j-1][0] + del_charProb + m_gapStartLogProb + bMatrix[l1][j][2] - stringProb);
      m_editopOccs[blank][s2_j] += occsStartGap_2;
      m_gapStartOccs += occsStartGap_2;
	    
      occsExtendGap_2 = Math.exp(fMatrix[l1][j-1][2] + del_charProb + m_gapExtendLogProb + bMatrix[l1][j][2] - stringProb);
      m_editopOccs[blank][s2_j] += occsExtendGap_2;
      m_gapExtendOccs += occsExtendGap_2;  //   DO WE NEED THIS??? WE HAD NO CHOICE!
    }
    return stringProb;
  }

  /** 
   *  Maximization step of the EM algorithm
   */
  protected void maximizationStep () {
    double N, N_s, N_id;

    // TODO:  when trying to incorporate discriminative training, see EditDistance.java
    // in old codebase for an earlier attempt to deal with negative expectations

    // Sum up expectations for transitions in substitution state
    N = m_subOccs + 2*m_gapStartOccs + m_endAtSubOccs;
    m_subProb = m_subOccs / N;
    m_gapStartProb = m_gapStartOccs / N;
    m_endAtSubProb = m_endAtSubOccs / N;
    if (m_subProb < m_clampProb) m_subProb = m_clampProb;
    if (m_gapStartProb < m_clampProb) m_gapStartProb = m_clampProb;
    if (m_endAtSubProb < m_clampProb) m_endAtSubProb = m_clampProb;

    // Sum up expectations for occurrences in deletion/insertion states
    N = m_gapExtendOccs + m_gapEndOccs + m_endAtGapOccs;
    m_gapExtendProb = m_gapExtendOccs / N;
    m_gapEndProb = m_gapEndOccs / N;
    m_endAtGapProb = m_endAtGapOccs / N;
    if (m_gapExtendProb < m_clampProb) m_gapExtendProb = m_clampProb;
    if (m_gapEndProb < m_gapEndProb) m_gapEndProb = m_clampProb;
    if (m_endAtGapProb < m_endAtGapProb) m_endAtGapProb = m_clampProb;
	
    // Now let's add up expectations for actual edit operators
    // we add up separately for substitution and deletion/insertion
    N_s = m_noopOccs;
    N_id = 0;
    for (int i = 0; i < m_usedChars.length; i++) {
      N_id += m_editopOccs[blank][m_usedChars[i]];
      for (int j = 0; j < m_usedChars.length; j++) {
	if (i != j) {
	  N_s += m_editopOccs[m_usedChars[i]][m_usedChars[j]];
	} 
      }
    }

    // Recalculate the probabilities
    m_noopProb = m_noopOccs / N_s;
    for (int i = 0; i < m_usedChars.length; i++) {
      m_editopProbs[blank][m_usedChars[i]] = Math.max(m_clampProb, m_editopOccs[blank][m_usedChars[i]] / N_id);
      for (int j = i+1; j < m_usedChars.length; j++) {
	m_editopProbs[m_usedChars[i]][m_usedChars[j]] = Math.max(m_clampProb, 
	  (m_editopOccs[m_usedChars[i]][m_usedChars[j]] + m_editopOccs[m_usedChars[j]][m_usedChars[i]])/ N_s);
	m_editopProbs[m_usedChars[j]][m_usedChars[i]] = m_editopProbs[m_usedChars[i]][m_usedChars[j]];
      }
    }
    normalizeTransitionProbs();
    normalizeEmissionProbs();
    updateLogProbs();
  }
    

  /** 
   * Normalize the probabilities of emission editops so that they sum to 1
   * for each state
   */
  protected void normalizeEmissionProbs() {
    double N_s = m_noopProb, N_id = 0;
    for (int i = 0; i < m_usedChars.length; i++) {
      N_id += m_editopProbs[blank][m_usedChars[i]];
      for (int j = i+1; j < m_usedChars.length; j++) {
	N_s += m_editopProbs[m_usedChars[i]][m_usedChars[j]];
      }
    }
    // Recalculate the probabilities
    m_noopProb = m_noopProb / N_s;
    for (int i = 0; i < m_usedChars.length; i++) {
      m_editopProbs[blank][m_usedChars[i]] /= N_id;
      for (int j = i+1; j < m_usedChars.length; j++) {
	m_editopProbs[m_usedChars[i]][m_usedChars[j]] /= N_s;
	m_editopProbs[m_usedChars[j]][m_usedChars[i]] = m_editopProbs[m_usedChars[i]][m_usedChars[j]];
      }
    }
  }

  /** 
   * Normalize the probabilities of transitions so that they sum to 1
   * for each state
   */
  protected void normalizeTransitionProbs() {
    double P;

    // M-state
    P = m_subProb + 2 * m_gapStartProb + m_endAtSubProb;
    m_subProb /= P;
    m_gapStartProb /= P;
    m_endAtSubProb /= P;

    // I/D states
    P = m_gapExtendProb + m_gapEndProb + m_endAtGapProb;
    m_gapExtendProb /= P;
    m_gapEndProb /= P;
    m_endAtGapProb /= P;
  }
    
  /** 
   *  reset the number of occurrences of all ops in the set
   */
  protected void resetOccurrences () {
    m_noopOccs = 0;
    m_endAtSubOccs = 0;
    m_endAtGapOccs = 0;
    m_gapStartOccs = 0;
    m_gapExtendOccs = 0;
    m_gapEndOccs = 0;
    m_subOccs = 0;
    for (int i = 0; i < m_usedChars.length; i++) {
      m_editopOccs[blank][m_usedChars[i]] = 0;
      for (int j = 0; j < m_usedChars.length; j++) {
	m_editopOccs[m_usedChars[i]][m_usedChars[j]] = 0;
      }
    }
  }

  /** 
   *  initialize the probabilities to some startup values
   */
  protected void initProbs () {
    double probDeletionUniform = 1.0 / m_usedChars.length;
    double probSubUniform = probDeletionUniform * probDeletionUniform;
    m_endAtSubProb = 0.05;
    m_endAtGapProb = 0.1;
    m_gapStartProb = 0.05;
    m_gapExtendProb = 0.5;
    m_gapEndProb = 0.4;
    m_subProb = 0.9;
    m_noopProb = 0.9;
//      m_endAtSubProb = 0.1;
//      m_endAtGapProb = 0.1;
//      m_gapStartProb = 0.4;
//      m_gapExtendProb = 0.6;
//      m_gapEndProb = 0.3;
//      m_subProb = 0.5;
//      m_noopProb = 0.3;
    for (int i = 0; i < m_usedChars.length; i++) {
      m_editopProbs[blank][m_usedChars[i]] = probDeletionUniform;
      for (int j = 0; j < m_usedChars.length; j++) {
	m_editopProbs[m_usedChars[i]][m_usedChars[j]] = probSubUniform;
      }
    }
  }

  /** 
   *  initialize the costs using current values of the probabilities
   */
  protected void initCosts () {
    m_gapStartCost =  -m_gapStartLogProb;
    m_gapExtendCost =   -m_gapExtendLogProb;
    m_endAtSubCost = -m_endAtSubLogProb;
    m_endAtGapCost = -m_endAtGapLogProb;
    m_gapEndCost =    -m_gapEndLogProb;
    m_subCost =        -m_subLogProb;
    m_noopCost =     -m_noopLogProb;

    if (m_verbose) {
      System.out.println("\nScaled by extend cost:\nGapStrt=" + (m_gapStartCost/m_gapExtendCost) +
			 "\tGapExt=" + (m_gapExtendCost/m_gapExtendCost) +
			 "\tGapEnd=" + (m_gapEndCost/m_gapExtendCost) + 
			 "\tSub=" + (m_subCost/m_gapExtendCost) +
			 "\tNoop=" + (m_noopCost/m_gapExtendCost));
      System.out.println("\nActual costs:\nGapStrt=" + (m_gapStartCost) +
			 "\tGapExt=" + (m_gapExtendCost) +
			 "\tGapEnd=" + (m_gapEndCost) + 
			 "\tSub=" + (m_subCost) +
			 "\tNoop=" + (m_noopCost));

    }
    if (!m_useGenerativeModel) {
      for (int i = 0; i < m_usedChars.length; i++) {
	m_editopCosts[blank][m_usedChars[i]] =   -m_editopLogProbs[blank][m_usedChars[i]];
	for (int j = 0; j < m_usedChars.length; j++) {
	  m_editopCosts[m_usedChars[i]][m_usedChars[j]] =
	    -m_editopLogProbs[m_usedChars[i]][m_usedChars[j]];
	}
      }
    } else {
      m_editopCosts = new double[128][128];
    }
  }

  /**
   * store logs of all probabilities in m_editopLogProbs
   */
  protected void updateLogProbs() {
    for (int i = 0; i < 128; i++) {
      for (int j = 0; j < 128; j++) {
	m_editopLogProbs[i][j] = (m_editopProbs[i][j] == 0) ? -Double.MAX_VALUE : Math.log(m_editopProbs[i][j]);
      }
    }
    m_noopLogProb = Math.log(m_noopProb);
    m_endAtSubLogProb = Math.log(m_endAtSubProb);
    m_endAtGapLogProb = Math.log(m_endAtGapProb);
    m_gapStartLogProb = Math.log(m_gapStartProb);
    m_gapExtendLogProb = Math.log(m_gapExtendProb);
    m_gapEndLogProb = Math.log(m_gapEndProb);
    m_subLogProb = Math.log(m_subProb);

    DecimalFormat fmt = new DecimalFormat ("0.0000");
    System.out.println("After update:\tNOOP=" + fmt.format(m_noopProb) + "=" + fmt.format(m_noopLogProb) +
		       "\tSUB=" + fmt.format(m_subProb) + "=" + fmt.format(m_subLogProb) +
		       "\n\t\tGAPst=" + fmt.format(m_gapStartProb) + "=" + fmt.format(m_gapStartLogProb) +
		       "\tGAPcont=" + fmt.format(m_gapExtendProb) + "=" + fmt.format(m_gapExtendLogProb) +
		       "\tGAPend=" + fmt.format(m_gapEndProb) + "=" + fmt.format(m_gapEndLogProb) +
		       "\n\t\tendAtGap=" + fmt.format(m_endAtGapProb) + "=" + fmt.format(m_endAtGapLogProb) +
		       "\tendAtSub=" + fmt.format(m_endAtSubProb) + "=" + fmt.format(m_endAtSubLogProb));
  }

    
  /** 
   * Get the distance between two strings
   * @param s1 first string
   * @param s2 second string
   * @return a value of this distance between these two strings
   */
  public double distance (String s1, String s2) {
    if (m_useGenerativeModel) {
      double d = backward(s1,s2)[0][0][0];
      if (m_normalized) {
	//	for (int i = 0; i < (s1.length() + s2.length()); i++) 
	  // TODO:  fix the posteriors; don't care for now - we always use the additive model
	//	  d -= m_noopLogProb + m_subLogProb;
	//  		for (int i = 0; i < s1.length(); i++) {
	//  		    d -= m_editopLogProbs[blank][s1.charAt(i)];
	//  		}
	//  		for (int i = 0; i < s2.length(); i++) {
	//  		    d -= m_editopLogProbs[blank][s2.charAt(i)];
	//  		}
      }
      return -d;
    } else {
      return costDistance(s1, s2);
    }
  }


  /** Method:  recordCosts
      Record probability matrix for further MatLab use
  */
  void recordCosts(int id) {
    try {
      FileOutputStream fstr = new FileOutputStream ("/tmp/probs/ProbAffineCosts.txt", true);
      DataOutputStream out = new DataOutputStream (fstr);
      char s, t;
      DecimalFormat fmt = new DecimalFormat ("0.00");
	    
      out.writeBytes(id + " " + m_noopCost + "\n" + "char\tblank   ");
      for (int i = 0; i < m_usedChars.length; i++) {
	out.writeBytes("\'" + m_usedChars[i] + "\'" + "\t");
      }
      out.writeBytes("\n");
      for (int i = 0; i < m_usedChars.length; i++) {
	out.writeBytes("\'" + m_usedChars[i] + "\'" + "\t" +
		       fmt.format(m_editopCosts[blank][m_usedChars[i]]) + "\t"); 
	for (int j = 0; j < i; j++) {
	  out.writeBytes(fmt.format(m_editopCosts[m_usedChars[i]][m_usedChars[j]]) + "\t");
	}
	out.writeBytes("\n");
      }
      out.close();
    } catch (Exception x) {}
  }
	    

  static String MatrixToString (double matrix[][]) {
    DecimalFormat fmt = new DecimalFormat ("0.00");
    String s = "";
    for (int i = 0; i < matrix.length; i++) {
      for (int j = 0; j < matrix[0].length; j++)
	s = s + fmt.format(matrix[i][j]) + "  ";
      s = s + "\n";
    }
    return s;
  }

  static String intMatrixToString (int matrix[][]) {
    String s = "";
    for (int i = 0; i < matrix.length; i++) {
      for (int j = 0; j < matrix[0].length; j++)
	s = s + matrix[i][j] + "  ";
      s = s + "\n";
    }
    return s;
  }

  static String doubleMatrixToString (double matrix[][]) {
    String s = "";
    java.text.DecimalFormat de = new java.text.DecimalFormat("0.0E000");
    for (int i = 0; i < matrix.length; i++) {
      for (int j = 0; j < matrix[0].length; j++)
	s = s + de.format(matrix[i][j]) + "  ";
      s = s + "\n";
    }
    return s;
  }

  static String doubleMatrixToString0 (double matrix[][][], int k) {
    String s;
    s = "";
    java.text.DecimalFormat de = new java.text.DecimalFormat("0.0E000");
    for (int i = 0; i < matrix.length; i++) {
      for (int j = 0; j < matrix[0].length; j++)
	s = s + de.format(matrix[i][j][k]) + "  ";
      s = s + "\n";
    }
    return s;
  }

  static String charMatrixToString (char matrix[][]) {
    String s = "";
    for (int i = 0; i < matrix.length; i++) {
      for (int j = 0; j < matrix[0].length; j++)
	s = s + matrix[i][j] + "  ";
      s = s + "\n";
    }
    return s;
  }


  /** Calculation of log(a+b) with a correction for machine precision
   * @param _a number log(a)
   * @param _b number log(b)
   * @returns log(a+b)
   */
  protected double logSum(double _logA, double _logB) {
    double logSum = 0;
    // make logA the smaller of the two
    double logA = (_logA < _logB) ? _logA : _logB;
    double logB = (_logA < _logB) ? _logB : _logA;
    
    if (logA - logB < -324 || logA == Double.NEGATIVE_INFINITY) {
      logSum = logB;
    } else {
      logSum = logA + Math.log(1 + Math.exp(logB - logA));
    }
    return logSum;
  }

  /**
   * Calculate affine gapped distance using learned costs
   * @param s1 first string
   * @param s2 second string
   * @return minimum number of deletions/insertions/substitutions to be performed
   * to transform s1 into s2 (or vice versa)
   */
  public double costDistance(String string1, String string2) {
    char [] s1 = string1.toLowerCase().toCharArray();
    char [] s2 = string2.toLowerCase().toCharArray();
    int l1 = s1.length, l2 = s2.length;
    double T[][] = new double[l1+1][l2+1];
    double I[][] = new double[l1+1][l2+1];
    double D[][] = new double[l1+1][l2+1];
    double subCost = 0, sub_charCost = 0, ins_charCost = 0, del_charCost = 0, ret;
    int i, j;

    if (l1==0 || l2==0) {
      return m_gapStartCost + (l1+l2-1) * m_gapExtendCost;
    }
    for (j = 0; j < l2+1; j++) {
      I[0][j] = Double.MAX_VALUE;
      D[0][j] = Double.MAX_VALUE;
    }
    for (j = 0; j < l1+1; j++) {
      I[j][0] = Double.MAX_VALUE;
      D[j][0] = Double.MAX_VALUE;
    }
    T[0][0] = 0;
    T[0][1] = m_gapStartCost;
    T[1][0] = m_gapStartCost;
    for (j = 2; j < l2+1; j++) {
      ins_charCost = m_editopCosts[blank][s2[j-1]];
      T[0][j] = T[0][j-1] + m_gapExtendCost + ins_charCost;
    }
    for (j = 2; j < l1+1; j++) {
      del_charCost =  m_editopCosts[blank][s1[j-1]];
      T[j][0] = T[j-1][0] + m_gapExtendCost + del_charCost;
    }
    for (i = 1; i < l1+1; i++) {
      for (j = 1; j < l2+1; j++) {
	char c1 = s1[i-1];
	char c2 = s2[j-1];
	del_charCost = m_editopCosts[blank][c1];
	ins_charCost = m_editopCosts[blank][c2];
	sub_charCost = (c1 == c2) ? m_noopCost : m_editopCosts[c1][c2];
	sub_charCost = (c1 == c2) ? 0 : m_editopCosts[c1][c2];  //  ??  do we use noopCost?

	if (D[i-1][j]+m_gapExtendCost > T[i-1][j]+m_gapStartCost) {
	  D[i][j] = T[i-1][j]+m_gapStartCost + del_charCost;
	} else {
	  D[i][j] = D[i-1][j]+m_gapExtendCost + del_charCost;
	}
		
	if (I[i][j-1]+m_gapExtendCost > T[i][j-1]+m_gapStartCost) {
	  I[i][j] = T[i][j-1] + m_gapStartCost + ins_charCost;
	} else {
	  I[i][j] = I[i][j-1] + m_gapExtendCost + ins_charCost;
	}
		
	//subCost = m_subCost + sub_charCost;
	subCost =((c1 == c2) ? 0 : (m_subCost + m_gapEndCost));
	subCost = subCost + sub_charCost;
  	subCost = (c1 == c2) ? 0 : m_subCost;
	//	subCost = m_subCost;
		
	if  ((T[i-1][j-1] + subCost < D[i-1][j-1] + m_gapEndCost) &&    /// d[i][j] or d[i-1][j-1]??
	     (T[i-1][j-1] + subCost < I[i-1][j-1] + m_gapEndCost )) {
	  T[i][j] = T[i-1][j-1] + subCost + sub_charCost;   // ?? do we add subCharCost?
	} else {
	  if (D[i-1][j-1] < I[i-1][j-1]) {
	    T[i][j] = D[i-1][j-1] + m_gapEndCost + sub_charCost;
	  } else {
	    T[i][j] = I[i-1][j-1] + m_gapEndCost + sub_charCost;
	  }
	}
      }
    }
	
    if (T[l1][l2] < D[l1][l2] && T[l1][l2] < I[l1][l2]) {
      ret = T[l1][l2];
    } else if (D[l1][l2] < I[l1][l2]) {
      ret = D[l1][l2];
    } else {
      ret = I[l1][l2];
    }
    if (m_normalized) {
//        // get the normalization factor as P(x,y)=P(x)P(y)
//        double Pxy = 2 * m_gapStartCost; 
//        for (int k = 0; k < l1; k++) {
//  	Pxy += s1[k] + m_gapExtendCost;
//        }
//        for (int k = 0; k < l2; k++) {
//  	Pxy += s2[k] + m_gapExtendCost;
//        }
//        ret /= Pxy;
      ret /= 4*(l1 + l2);
    }
    return ret;
  }

  public static void print3dMatrix(double [][][] matrix) {
    DecimalFormat fmt = new DecimalFormat ("0.0000E00");
    for (int i = 0; i < matrix[0][0].length; i++) {
      System.out.println ("\nMatrix[][][" + i + "]");
      for (int j = 0; j < matrix[0].length; j++) {
	for (int k = 0; k < matrix.length; k++) {
	  System.out.print(fmt.format(matrix[k][j][i]) + "\t");
	}
	System.out.println();
      }
    }
  }

  /** Set the distance to be normalized by the sum of the string's lengths
   * @param normalized if true, distance is normalized by the sum of string's lengths
   */
  public void setNormalized(boolean normalized) {
    m_normalized = normalized;
  } 

  /** Get whether the distance is normalized by the sum of the string's lengths
   * @return if true, distance is normalized by the sum of string's lengths
   */
  public boolean getNormalized() {
    return m_normalized;
  }


  /** Set the distance to use the generative model or convert back to the additive model
   * @param useGenerativeModel if true, the generative model is used
   */
  public void setUseGenerativeModel(boolean useGenerativeModel) {
    m_useGenerativeModel = useGenerativeModel;
  } 


  /** Do we use the generative model or convert back to the additive model?
   * @param useGenerativeModel if true, the generative model is used
   */
  public boolean getUseGenerativeModel() {
    return m_useGenerativeModel;
  } 

  /** Set the clamping probability value
   * @param clampProb a lower bound for all probability values to prevent underflow
   */
  public void setClampProb(double clampProb) {
    m_clampProb = clampProb;
  }
  
  /** Get the clamping probability value
   * @return a lower bound for all probability values to prevent underflow
   */
  public double getClampProb() {
    return m_clampProb;
  }

  /** Set the number of training iterations
   * @param numIterations the number of iterations
   */
  public void setNumIterations(int numIterations) {
    m_numIterations = numIterations;
  } 

  /** Get the number of training iterations
   * @return the number of training iterations
   */
  public int setNumIterations() {
    return m_numIterations;
  } 


  /** Create a copy of this metric
   * @return another AffineMetric with the same exact parameters as this  metric
   */
  public Object clone() {
    AffineProbMetric metric = new AffineProbMetric();
    metric.setNormalized(m_normalized);
    metric.setUseGenerativeModel(m_useGenerativeModel);
    metric.setClampProb(m_clampProb);
    metric.setNumIterations(m_numIterations);
    return metric;
  }


    /**
   * Gets the current settings of WeightedDotP.
   *
   * @return an array of strings suitable for passing to setOptions()
   * TODO!!!!
   */
  public String [] getOptions() {
    String [] options = new String [10];
    int current = 0;

    if (m_normalized) {
      options[current++] = "-N";
    }

    if (m_useGenerativeModel) {
      options[current++] = "-G";
    } else {
      options[current++] = "-A";
    }

    options[current++] = "-c";
    options[current++] = "" + m_clampProb;

    while (current < options.length) {
      options[current++] = "";
    }
    return options;
  }


  /**
   * Parses a given list of options. Valid options are:<p>
   *
   * -N normalize by length
   * -m matchCost
   * -s subCost
   * -g gapStartCost
   * -e gapExtendCost   
   */
  public void setOptions(String[] options) throws Exception {
    setNormalized(Utils.getFlag('N', options));
   
  }

  /**
   * Returns an enumeration describing the available options.
   *
   * @return an enumeration of all the available options.
   * TODO!!!
   */
  public Enumeration listOptions() {
    Vector newVector = new Vector(5);

    newVector.addElement(new Option("\tNormalize by lengths\n",
				    "N", 0, "-N"));

    
    return newVector.elements();
  }
  
  

  /** The computation of a metric can be either based on distance, or on similarity
   * @returns true
   */
  public boolean isDistanceBased() {
    return true;
  }


  /**
   * Returns a similarity estimate between two strings. Similarity is obtained by
   * inverting the distance value using one of three methods:
   * CONVERSION_LAPLACIAN, CONVERSION_EXPONENTIAL, CONVERSION_UNIT.
   * @param string1 First string.
   * @param string2 Second string.
   * @exception Exception if similarity could not be estimated.
   */
  public double similarity(String string1, String string2) throws Exception {
    switch (m_conversionType) {
    case CONVERSION_LAPLACIAN: 
      return 1 / (1 + distance(string1, string2));
    case CONVERSION_UNIT:
      return 2 * (1 - distance(string1, string2));
    case CONVERSION_EXPONENTIAL:
      return Math.exp(-distance(string1, string2));
    default:
      throw new Exception ("Unknown distance to similarity conversion method");
    }
  }

  /**
   * Set the type of similarity to distance conversion. Values other
   * than CONVERSION_LAPLACIAN, CONVERSION_UNIT, or CONVERSION_EXPONENTIAL will be ignored
   * 
   * @param type type of the similarity to distance conversion to use
   */
  public void setConversionType(SelectedTag conversionType) {
    if (conversionType.getTags() == TAGS_CONVERSION) {
      m_conversionType = conversionType.getSelectedTag().getID();
    }
  }

  /**
   * return the type of similarity to distance conversion
   * @return one of CONVERSION_LAPLACIAN, CONVERSION_UNIT, or CONVERSION_EXPONENTIAL
   */
  public SelectedTag getConversionType() {
    return new SelectedTag(m_conversionType, TAGS_CONVERSION);
  }


    
  public static void main(String[] args) {
    try { 
    AffineProbMetric metric = new AffineProbMetric();
    //    metric.trainMetric(new ArrayList());
    String s1 = new String("abcde");
    String s2 = new String("ab");
    metric.printMatrices(s1, s2);
    } catch (Exception e) { e.printStackTrace();}
  }
    
}