package edu.northwestern.at.utils.corpuslinguistics.stringsimilarity; /* Please see the license information in the header below. */ /** Implementation of the Longest Common Subsequence algorithm. * * <p> * That is, given two strings A and B, this program will find the longest sequence * of letters that are common and ordered in A and B. * </p> * * <p> * There are only two reasons you are reading this: * </p> * * <ul> * <li>you don't care what the algorithm is but you need a piece of code * to do it</li> * <li>you're trying to understand the algorithm, and a piece of code * might help</li> * </ul> * * <p> * In either case, you should either read an entire chapter of an * algorithms textbook on the subject of dynamic programming, or you * should consult a webpage that describes this particular algorithm. * It is important, for example, that we use arrays of size * |A|+1 x |B|+1. * </p> * * <p> * This code is provided AS-IS. You may use this code in any way you see * fit, EXCEPT as the answer to a homework problem or as part of a term * project in which you were expected to arrive at this code yourself. * </p> * * <p> * Copyright (C) 2005 Neil Jones. * </p> * * <p> * Similarity computation added by Phiip R. Burns. 2007/10/24. * </p> */ public class LCS implements StringSimilarity { /** "constants" which indicate a direction in the backtracking array. */ private static final int NEITHER = 0; private static final int UP = 1; private static final int LEFT = 2; private static final int UP_AND_LEFT = 3; /** Create LCS instance. */ public LCS() { } public static String LCSAlgorithm( String a , String b ) { int n = a.length(); int m = b.length(); int S[][] = new int[n+1][m+1]; int R[][] = new int[n+1][m+1]; int ii, jj; // It is important to use <=, not <. The next two for-loops are initialization for(ii = 0; ii <= n; ++ii) { S[ii][0] = 0; R[ii][0] = UP; } for(jj = 0; jj <= m; ++jj) { S[0][jj] = 0; R[0][jj] = LEFT; } // This is the main dynamic programming loop that computes the score and // backtracking arrays. for(ii = 1; ii <= n; ++ii) { for(jj = 1; jj <= m; ++jj) { if( a.charAt(ii-1) == b.charAt(jj-1) ) { S[ii][jj] = S[ii-1][jj-1] + 1; R[ii][jj] = UP_AND_LEFT; } else { S[ii][jj] = S[ii-1][jj-1] + 0; R[ii][jj] = NEITHER; } if ( S[ii-1][jj] >= S[ii][jj] ) { S[ii][jj] = S[ii-1][jj]; R[ii][jj] = UP; } if ( S[ii][jj-1] >= S[ii][jj] ) { S[ii][jj] = S[ii][jj-1]; R[ii][jj] = LEFT; } } } // The length of the longest substring is S[n][m] ii = n; jj = m; int pos = S[ii][jj] - 1; char lcs[] = new char[ pos+1 ]; // Trace the backtracking matrix. while( ii > 0 || jj > 0 ) { if( R[ii][jj] == UP_AND_LEFT ) { ii--; jj--; lcs[pos--] = a.charAt(ii); } else if( R[ii][jj] == UP ) { ii--; } else if( R[ii][jj] == LEFT ) { jj--; } } return new String( lcs ); } /** Compute similarity of two strings using longest common subsequence. * * @param s1 First string. * @param s2 Second string. * * @return Similarity measure in the range [0,1] . */ public static double lcsSimilarity( String s1 , String s2 ) { double result = 0.0D; if ( ( s1 != null ) && ( s2 != null ) ) { double dl = s1.length() + s2.length(); if ( dl > 0.0D ) { result = 2.0D * LCSAlgorithm( s1 , s2 ).length() / dl; } } return result; } /** Compute similarity of two strings using longest common subsequence. * * @param s1 First string. * @param s2 Second string. * * @return Similarity measure in the range [0,1] . */ public double similarity( String s1 , String s2 ) { return lcsSimilarity( s1 , s2 ); } }