/************************************************************************
* Copyright (C) 2006-2007 The University of Sheffield *
* Developed by Mark A. Greenwood <m.greenwood@dcs.shef.ac.uk> *
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program; if not, write to the Free Software *
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *
************************************************************************/
package edu.isistan.uima.unified.algorithms.similarity;
import net.didion.jwnl.JWNLException;
import net.didion.jwnl.data.Synset;
/**
* An implementation of the WordNet similarity measure developed by Jiang and
* Conrath. For full details of the measure see:
* <blockquote>Jiang J. and Conrath D. 1997. Semantic similarity based on corpus
* statistics and lexical taxonomy. In Proceedings of International
* Conference on Research in Computational Linguistics, Taiwan.</blockquote>
* @author Mark A. Greenwood
*/
public class JCn extends ICMeasure
{
    /**
     * Not intended to be called directly: obtain instances of this measure
     * through the factory methods of {@link SimilarityMeasure}.
     */
    protected JCn()
    {
        //deliberately empty; protected so that callers go through newInstance
    }

    /**
     * Computes the Jiang and Conrath similarity of two synsets as the
     * multiplicative inverse of the distance
     * <code>IC(s1) + IC(s2) - 2 * IC(LCS)</code>.
     * <p>
     * Synsets with different POS tags yield 0 immediately and the result is
     * not cached. Every other result is looked up in, or stored into, the
     * cache before being returned. A similarity of 0 is cached and returned
     * when either synset has an IC of zero or when no LCS is found.
     *
     * @param s1 the first synset
     * @param s2 the second synset
     * @return the similarity between the two synsets
     * @throws JWNLException if an error occurs while accessing WordNet
     */
    @Override
    public double getSimilarity(Synset s1, Synset s2) throws JWNLException
    {
        //the measure is only defined for synsets sharing a POS tag (two
        //nouns or two verbs), so a mismatch is answered straight away
        if (!s1.getPOS().equals(s2.getPOS()))
        {
            return 0;
        }

        //reuse a previously computed value when one is available
        Double known = getFromCache(s1, s2);
        if (known != null)
        {
            return known.doubleValue();
        }

        //Information Content (IC) of each of the supplied synsets
        double firstIC = getIC(s1);
        double secondIC = getIC(s2);
        if (firstIC == 0 || secondIC == 0)
        {
            //a synset without any information content gives a sim of 0
            return addToCache(s1, s2, 0);
        }

        //Lowest Common Subsumer (LCS) of the pair, selected by IC
        Synset subsumer = getLCSbyIC(s1, s2);
        if (subsumer == null)
        {
            //nothing subsumes both synsets, so the sim is 0
            return addToCache(s1, s2, 0);
        }

        double subsumerIC = getIC(subsumer);

        //the original JCN distance between the two synsets
        double gap = firstIC + secondIC - (2 * subsumerIC);

        if (gap != 0)
        {
            //the normal case: the similarity is simply the inverse distance
            return addToCache(s1, s2, 1 / gap);
        }

        //a zero distance cannot be inverted, so fall back on the root frequency
        return addToCache(s1, s2, zeroDistanceSimilarity(s1));
    }

    /**
     * Produces the similarity used when the distance between two synsets is
     * exactly 0 (i.e. ic1 + ic2 = 2 * icLCS).
     *
     * @param synset a synset whose POS tag selects the root frequency
     * @return a very large value derived from the root frequency of the POS
     *         tag, or 0 when that frequency is not greater than 0.01
     * @throws JWNLException if an error occurs while accessing WordNet
     */
    private double zeroDistanceSimilarity(Synset synset) throws JWNLException
    {
        double posFrequency = getFrequency(synset.getPOS());

        if (posFrequency > 0.01)
        {
            return 1 / -Math.log((posFrequency - 0.01) / posFrequency);
        }

        return 0;
    }
}