/************************************************************************
* Copyright (C) 2006-2007 The University of Sheffield *
* Developed by Mark A. Greenwood <m.greenwood@dcs.shef.ac.uk> *
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program; if not, write to the Free Software *
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *
************************************************************************/
package edu.isistan.uima.unified.algorithms.similarity;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
import net.didion.jwnl.JWNLException;
import net.didion.jwnl.data.IndexWord;
import net.didion.jwnl.data.POS;
import net.didion.jwnl.data.Synset;
import net.didion.jwnl.dictionary.Dictionary;
/**
 * An abstract notion of a similarity measure that all provided implementations
 * extend.
 * <p>
 * Instances are created through the static {@code newInstance} factories,
 * which load the concrete implementation named by the {@code simType}
 * parameter, apply the optional {@code cache} and {@code mapping} parameters
 * and then hand the remaining parameters to {@link #config(Map)}.
 *
 * @author Mark A. Greenwood
 */
public abstract class SimilarityMeasure {

    /**
     * A mapping of terms to specific synsets. Usually used to map domain terms
     * to a restricted set of synsets but can also be used to map named entity
     * tags to appropriate synsets.
     */
    private final Map<String, Set<Synset>> domainMappings = new HashMap<String, Set<Synset>>();

    /**
     * The maximum size the cache can grow to. A negative value means the
     * cache is never trimmed.
     */
    private int cacheSize = 5000;

    /**
     * To speed up computation of the similarity between two synsets we cache
     * each similarity that is computed so we only have to do each one once.
     * The map is created in access order, so when it outgrows
     * {@link #cacheSize} the least recently accessed entry is dropped.
     */
    @SuppressWarnings("serial")
    private final Map<String, Double> cache = new LinkedHashMap<String, Double>(16, 0.75f, true) {
        @Override
        protected boolean removeEldestEntry(Map.Entry<String, Double> eldest) {
            // a negative limit means the user asked for an unbounded cache;
            // otherwise evict once the cache has grown past its maximum size
            return cacheSize >= 0 && size() > cacheSize;
        }
    };

    /**
     * Builds the key under which the similarity of an ordered pair of synsets
     * is cached.
     *
     * @param s1
     *            the first synset of the pair.
     * @param s2
     *            the second synset of the pair.
     * @return the cache key; note that it depends on the argument order.
     */
    private static String cacheKey(Synset s1, Synset s2) {
        return s1.getKey() + "-" + s2.getKey();
    }

    /**
     * Get a previously computed similarity between two synsets from the cache.
     *
     * @param s1
     *            the first synset between which we are looking for the
     *            similarity.
     * @param s2
     *            the other synset between which we are looking for the
     *            similarity.
     * @return The similarity between the two synsets or null if it is not in
     *         the cache.
     */
    protected final Double getFromCache(Synset s1, Synset s2) {
        return cache.get(cacheKey(s1, s2));
    }

    /**
     * Add a computed similarity between two synsets to the cache so that we
     * don't have to compute it if it is needed in the future.
     *
     * @param s1
     *            one of the synsets between which we are storing a similarity.
     * @param s2
     *            the other synset between which we are storing a similarity.
     * @param sim
     *            the similarity between the two supplied synsets.
     * @return the similarity score just added to the cache.
     */
    protected final double addToCache(Synset s1, Synset s2, double sim) {
        cache.put(cacheKey(s1, s2), sim);
        return sim;
    }

    /**
     * Configures the similarity measure using the supplied parameters.
     *
     * @param params
     *            a set of key-value pairs that are used to configure the
     *            similarity measure. See concrete implementations for details
     *            of expected/possible parameters.
     * @throws Exception
     *             if an error occurs while configuring the similarity measure.
     */
    protected abstract void config(Map<String, String> params) throws Exception;

    /**
     * Create a new instance of a similarity measure.
     *
     * @param confURL
     *            the URL of a configuration file. Parameters are specified one
     *            per line as key:value pairs. Blank lines are ignored and
     *            lines without a colon are reported on standard output and
     *            skipped.
     * @return a new instance of a similarity measure as defined by the
     *         supplied configuration URL.
     * @throws Exception
     *             if an error occurs while creating the similarity measure.
     */
    public static SimilarityMeasure newInstance(URL confURL) throws Exception {
        // holds the key-value pairs read from the configuration file
        Map<String, String> params = new HashMap<String, String>();

        BufferedReader in = null;
        try {
            in = new BufferedReader(new InputStreamReader(confURL.openStream()));
            for (String line = in.readLine(); line != null; line = in.readLine()) {
                line = line.trim();
                if (line.equals("")) {
                    continue;
                }

                // split at the first colon only, so that values which
                // themselves contain colons (e.g. URLs) stay in one piece
                String[] data = line.split("\\s*:\\s*", 2);
                if (data.length == 2) {
                    params.put(data[0], data[1]);
                } else {
                    // tell the user about the invalid line but carry on
                    // with the rest of the file
                    System.out.println("Config Line is Malformed: " + line);
                }
            }
        } finally {
            // close the config file if it got opened
            if (in != null) {
                in.close();
            }
        }

        // create and return the similarity measure the config file describes
        return newInstance(params);
    }

    /**
     * Creates a new instance of a similarity measure using the supplied
     * parameters.
     * <p>
     * The keys {@code simType} (required, the fully qualified class name of
     * the implementation), {@code cache} (optional, the maximum cache size)
     * and {@code mapping} (optional, the URL of a domain mapping file) are
     * consumed here; everything else is passed on to {@link #config(Map)}.
     * The supplied map itself is not modified.
     *
     * @param params
     *            a set of key-value pairs which define the similarity measure.
     * @return the newly created similarity measure.
     * @throws Exception
     *             if an error occurs while creating the similarity measure.
     */
    public static SimilarityMeasure newInstance(Map<String, String> params)
            throws Exception {
        // work on a private copy so that the caller's map is left untouched
        // and unmodifiable maps can be supplied
        Map<String, String> settings = new HashMap<String, String>(params);

        // the class name of the implementation we need to load
        String name = settings.remove("simType");
        if (name == null) {
            throw new Exception("Must specifiy the similarity measure to use");
        }

        // asSubclass rejects a class that is not a SimilarityMeasure before
        // any of its constructors are run
        Class<? extends SimilarityMeasure> c = Class.forName(name).asSubclass(
                SimilarityMeasure.class);
        SimilarityMeasure sim = c.newInstance();

        // if a cache size was specified then set it
        String cSize = settings.remove("cache");
        if (cSize != null) {
            sim.cacheSize = Integer.parseInt(cSize.trim());
        }

        // if a mapping file has been provided then load it
        String mapURL = settings.remove("mapping");
        if (mapURL != null) {
            loadDomainMappings(sim, mapURL);
        }

        // let the implementation configure itself from what is left
        sim.config(settings);
        return sim;
    }

    /**
     * Reads a domain mapping file into the supplied similarity measure.
     * <p>
     * Each non-blank line that does not start with {@code #} holds a term
     * followed by one or more whitespace separated words (optionally encoded
     * as word#pos#sense) whose synsets the term is mapped to. Terms for which
     * no synsets are found are not stored.
     *
     * @param sim
     *            the similarity measure whose domain mappings are filled in.
     * @param mapURL
     *            the URL of the mapping file.
     * @throws Exception
     *             if the file cannot be read or WordNet cannot be accessed.
     */
    private static void loadDomainMappings(SimilarityMeasure sim, String mapURL)
            throws Exception {
        BufferedReader in = null;
        try {
            in = new BufferedReader(new InputStreamReader(
                    (new URL(mapURL)).openStream()));
            for (String line = in.readLine(); line != null; line = in.readLine()) {
                // trim first so that indented comments are recognised too
                line = line.trim();
                if (line.length() == 0 || line.startsWith("#")) {
                    continue;
                }

                // first token is the term, the rest are what it maps to
                String[] data = line.split("\\s+");
                Set<Synset> mappedTo = new HashSet<Synset>();
                for (int i = 1; i < data.length; ++i) {
                    mappedTo.addAll(sim.getSynsets(data[i]));
                }

                // only store the term if we found some actual synsets
                if (!mappedTo.isEmpty()) {
                    sim.domainMappings.put(data[0], mappedTo);
                }
            }
        } finally {
            // always release the mappings file, even if a lookup failed
            if (in != null) {
                in.close();
            }
        }
    }

    /**
     * This is the method responsible for computing the similarity between two
     * specific synsets. The method is implemented differently for each
     * similarity measure so see the subclasses for detailed information.
     *
     * @param s1
     *            one of the synsets between which we want to know the
     *            similarity.
     * @param s2
     *            the other synset between which we want to know the similarity.
     * @return the similarity between the two synsets.
     * @throws JWNLException
     *             if an error occurs accessing WordNet.
     */
    public abstract double getSimilarity(Synset s1, Synset s2)
            throws JWNLException;

    /**
     * Get the similarity between two words. The words can be specified either
     * as just the word or in an encoded form including the POS tag and possibly
     * the sense number, i.e. cat#n#1 would specify the 1st sense of the noun
     * cat.
     *
     * @param w1
     *            one of the words to compute similarity between.
     * @param w2
     *            the other word to compute similarity between.
     * @return a SimilarityInfo instance detailing the highest scoring pair of
     *         synsets for the two words, or null if no synsets were found for
     *         at least one of the words.
     * @throws JWNLException
     *             if an error occurs accessing WordNet.
     */
    public final SimilarityInfo getSimilarity(String w1, String w2)
            throws JWNLException {
        // get the (possibly) multiple synsets associated with each word
        Set<Synset> ss1 = getSynsets(w1);
        Set<Synset> ss2 = getSynsets(w2);

        // stays null unless at least one pair of synsets is compared
        SimilarityInfo best = null;
        for (Synset s1 : ss1) {
            for (Synset s2 : ss2) {
                double score = getSimilarity(s1, s2);
                if (best == null || score > best.getSimilarity()) {
                    // better than anything seen so far, so remember it
                    best = new SimilarityInfo(w1, s1, w2, s2, score);
                }
            }
        }

        // the maximum similarity we have found
        return best;
    }

    /**
     * Finds all the synsets associated with a specific word.
     *
     * @param word
     *            the word we are interested in. Note that this may be encoded
     *            to include information on POS tag and sense index
     *            (word#pos#sense).
     * @return a set of synsets that are associated with the supplied word;
     *         empty if none are found. When the word has a domain mapping the
     *         stored mapping set itself is returned, so callers must not
     *         modify the result.
     * @throws JWNLException
     *             if the POS tag is invalid or an error occurs accessing
     *             WordNet.
     */
    private Set<Synset> getSynsets(String word) throws JWNLException {
        // split the word on the # characters so we can get at the up to
        // three components that could be present: word, POS tag, sense index
        String[] data = word.split("#");

        // input made up solely of '#' characters splits into nothing at all
        if (data.length == 0) {
            return new HashSet<Synset>();
        }

        // if the word is in the domainMappings then simply return the mappings
        Set<Synset> mapped = domainMappings.get(data[0]);
        if (mapped != null) {
            return mapped;
        }

        // get a handle on the WordNet dictionary
        Dictionary dict = Dictionary.getInstance();
        Set<Synset> synsets = new HashSet<Synset>();

        if (data.length == 1) {
            // just the word: collect the senses of every matching index word
            for (IndexWord iw : dict.lookupAllIndexWords(data[0])
                    .getIndexWordArray()) {
                synsets.addAll(Arrays.asList(iw.getSenses()));
            }
            return synsets;
        }

        // the caller specified a POS tag as well so resolve that
        POS pos = POS.getPOSForKey(data[1]);
        if (pos == null) {
            throw new JWNLException("Invalid POS Tag: " + data[1]);
        }

        // get the word with the specified POS tag from WordNet; the lookup
        // yields null when WordNet has no such entry, in which case there
        // are simply no synsets to report
        IndexWord iw = dict.getIndexWord(pos, data[0]);
        if (iw == null) {
            return synsets;
        }

        if (data.length > 2) {
            // a sense index was specified so add just that synset
            synsets.add(iw.getSense(Integer.parseInt(data[2])));
        } else {
            // no sense index was specified so add all the senses of the word
            synsets.addAll(Arrays.asList(iw.getSenses()));
        }
        return synsets;
    }
}