SimilarityMeasure.java example

Explorer
sad-analyzer-master
- SADAnalyzer
/************************************************************************
 *         Copyright (C) 2006-2007 The University of Sheffield          *
 *      Developed by Mark A. Greenwood <m.greenwood@dcs.shef.ac.uk>     *
 *                                                                      *
 * This program is free software; you can redistribute it and/or modify *
 * it under the terms of the GNU General Public License as published by *
 * the Free Software Foundation; either version 2 of the License, or    *
 * (at your option) any later version.                                  *
 *                                                                      *
 * This program is distributed in the hope that it will be useful,      *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of       *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the        *
 * GNU General Public License for more details.                         *
 *                                                                      *
 * You should have received a copy of the GNU General Public License    *
 * along with this program; if not, write to the Free Software          *
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.            *
 ************************************************************************/

package edu.isistan.uima.unified.algorithms.similarity;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;

import net.didion.jwnl.JWNLException;
import net.didion.jwnl.data.IndexWord;
import net.didion.jwnl.data.POS;
import net.didion.jwnl.data.Synset;
import net.didion.jwnl.dictionary.Dictionary;

/**
 * An abstract notion of a similarity measure that all provided implementations
 * extend.
 * 
 * @author Mark A. Greenwood
 */
public abstract class SimilarityMeasure {
	/**
	 * A mapping of terms to specific synsets. Usually used to map domain terms
	 * to a restricted set of synsets but can also be used to map named entity
	 * tags to appropriate synsets.
	 */
	private final Map<String, Set<Synset>> domainMappings = new HashMap<String, Set<Synset>>();

	/**
	 * The maximum size the cache can grow to. A negative value means the cache
	 * is not limited in size.
	 */
	private int cacheSize = 5000;

	/**
	 * To speed up computation of the similarity between two synsets we cache
	 * each similarity that is computed so we only have to do each one once.
	 * The map is created in access order, so when the size limit is exceeded
	 * the least recently accessed entry is the one that is evicted. The map is
	 * not synchronized.
	 */
	@SuppressWarnings("serial")
	private final Map<String, Double> cache = new LinkedHashMap<String, Double>(
			16, 0.75f, true) {
		@Override
		protected boolean removeEldestEntry(Map.Entry<String, Double> eldest) {
			// if the size is less than zero then the user is asking us
			// not to limit the size of the cache so return false
			if (cacheSize < 0)
				return false;

			// if the cache has grown bigger than its max size return true
			return size() > cacheSize;
		}
	};

	/**
	 * Builds the key under which the similarity of an ordered pair of synsets
	 * is cached. The part of speech of each synset is included alongside the
	 * value of {@code Synset.getKey()} so that synsets belonging to different
	 * parts of speech can never share a cache entry.
	 * 
	 * @param s1
	 *            the first synset of the pair.
	 * @param s2
	 *            the second synset of the pair.
	 * @return the cache key for the pair (s1, s2). Note that the key is order
	 *         sensitive, i.e. (s1, s2) and (s2, s1) are cached separately.
	 */
	private static String cacheKey(Synset s1, Synset s2) {
		return s1.getPOS().getKey() + s1.getKey() + "-"
				+ s2.getPOS().getKey() + s2.getKey();
	}

	/**
	 * Get a previously computed similarity between two synsets from the cache.
	 * 
	 * @param s1
	 *            the first synset between which we are looking for the
	 *            similarity.
	 * @param s2
	 *            the other synset between which we are looking for the
	 *            similarity.
	 * @return The similarity between the two sets or null if it is not in the
	 *         cache.
	 */
	protected final Double getFromCache(Synset s1, Synset s2) {
		return cache.get(cacheKey(s1, s2));
	}

	/**
	 * Add a computed similarity between two synsets to the cache so that we
	 * don't have to compute it if it is needed in the future.
	 * 
	 * @param s1
	 *            one of the synsets between which we are storing a similarity.
	 * @param s2
	 *            the other synset between which we are storing a similarity.
	 * @param sim
	 *            the similarity between the two supplied synsets.
	 * @return the similarity score just added to the cache.
	 */
	protected final double addToCache(Synset s1, Synset s2, double sim) {
		cache.put(cacheKey(s1, s2), sim);

		return sim;
	}

	/**
	 * Configures the similarity measure using the supplied parameters.
	 * 
	 * @param params
	 *            a set of key-value pairs that are used to configure the
	 *            similarity measure. See concrete implementations for details
	 *            of expected/possible parameters.
	 * @throws Exception
	 *             if an error occurs while configuring the similarity measure.
	 */
	protected abstract void config(Map<String, String> params) throws Exception;

	/**
	 * Create a new instance of a similarity measure.
	 * 
	 * @param confURL
	 *            the URL of a configuration file. Parameters are specified one
	 *            per line as key:value pairs. Blank lines are skipped and
	 *            lines without a colon are reported on standard output and
	 *            otherwise ignored.
	 * @return a new instance of a similarity measure as defined by the supplied
	 *         configuration URL.
	 * @throws Exception
	 *             if an error occurs while creating the similarity measure.
	 */
	public static SimilarityMeasure newInstance(URL confURL) throws Exception {
		// create map to hold the key-value pairs we are going to read from
		// the configuration file
		Map<String, String> params = new HashMap<String, String>();

		// create a reader for the config file
		BufferedReader in = null;

		try {
			// open the config file
			in = new BufferedReader(new InputStreamReader(confURL.openStream()));

			String line = in.readLine();
			while (line != null) {
				line = line.trim();

				if (!line.equals("")) {
					// if the line contains something then

					// split the data at the first colon only so that values
					// (e.g. URLs) may themselves contain colons
					String[] data = line.split("\\s*:\\s*", 2);

					if (data.length == 2) {
						// if the line is valid add the two parts to the map
						params.put(data[0], data[1]);
					} else {
						// if the line isn't valid tell the user but continue on
						// with the rest of the file
						System.out.println("Config Line is Malformed: " + line);
					}
				}

				// get the next line ready to process
				line = in.readLine();
			}
		} finally {
			// close the config file if it got opened
			if (in != null)
				in.close();
		}

		// create and return a new instance of the similarity measure specified
		// by the config file
		return newInstance(params);
	}

	/**
	 * Creates a new instance of a similarity measure using the supplied
	 * parameters.
	 * <p>
	 * The keys <code>simType</code> (fully qualified class name of the
	 * implementation, required), <code>cache</code> (maximum cache size,
	 * optional) and <code>mapping</code> (URL of a domain mapping file,
	 * optional) are consumed here and <b>removed</b> from the supplied map;
	 * whatever remains is handed to {@link #config(Map)}.
	 * 
	 * @param params
	 *            a set of key-value pairs which define the similarity measure.
	 *            The map is modified by this method and so must be mutable.
	 * @return the newly created similarity measure.
	 * @throws Exception
	 *             if an error occurs while creating the similarity measure.
	 */
	public static SimilarityMeasure newInstance(Map<String, String> params)
			throws Exception {
		// get the class name of the implementation we need to load
		String name = params.remove("simType");

		// if the name hasn't been specified then throw an exception
		if (name == null)
			throw new Exception("Must specifiy the similarity measure to use");

		// Get hold of the class we need to load, checking up front that it
		// really is a similarity measure
		Class<? extends SimilarityMeasure> c = Class.forName(name).asSubclass(
				SimilarityMeasure.class);

		// create a new instance of the similarity measure using its
		// no-argument constructor
		SimilarityMeasure sim = c.getDeclaredConstructor().newInstance();

		// get the cache parameter from the config params
		String cSize = params.remove("cache");

		// if a cache size was specified then set it
		if (cSize != null)
			sim.cacheSize = Integer.parseInt(cSize);

		// get the url of the domain mapping file
		String mapURL = params.remove("mapping");

		// if a mapping file has been provided then load it
		if (mapURL != null)
			loadDomainMappings(sim, mapURL);

		// make sure it is configured properly
		sim.config(params);

		// then return it
		return sim;
	}

	/**
	 * Reads a domain mapping file and stores its contents in the supplied
	 * similarity measure. Each non-comment line holds a term followed by one
	 * or more whitespace separated (possibly encoded) words; the term is mapped
	 * to the union of the synsets of those words. Blank lines and lines whose
	 * first non-whitespace character is # are ignored, as are terms for which
	 * no synsets could be found.
	 * 
	 * @param sim
	 *            the similarity measure whose domain mappings are populated.
	 * @param mapURL
	 *            the URL of the mapping file.
	 * @throws Exception
	 *             if the file cannot be read or an error occurs accessing
	 *             WordNet.
	 */
	private static void loadDomainMappings(SimilarityMeasure sim, String mapURL)
			throws Exception {
		BufferedReader in = null;

		try {
			// open a reader over the file
			in = new BufferedReader(new InputStreamReader(
					(new URL(mapURL)).openStream()));

			// get the first line ready for processing
			String line = in.readLine();

			while (line != null) {
				String trimmed = line.trim();

				if (trimmed.length() > 0 && !trimmed.startsWith("#")) {
					// if the line isn't blank or a comment then...

					// split the line at the white space
					String[] data = trimmed.split("\\s+");

					// create a new set to hold the mapped synsets
					Set<Synset> mappedTo = new HashSet<Synset>();

					for (int i = 1; i < data.length; ++i) {
						// for each synset mapped to get the actual Synsets
						// and store them in the set
						mappedTo.addAll(sim.getSynsets(data[i]));
					}

					// if we have found some actual synsets then
					// store them in the domain mappings
					if (mappedTo.size() > 0)
						sim.domainMappings.put(data[0], mappedTo);
				}

				// get the next line from the file
				line = in.readLine();
			}
		} finally {
			// always close the mappings file, even if reading it or looking
			// up a synset failed part way through
			if (in != null)
				in.close();
		}
	}

	/**
	 * This is the method responsible for computing the similarity between two
	 * specific synsets. The method is implemented differently for each
	 * similarity measure so see the subclasses for detailed information.
	 * 
	 * @param s1
	 *            one of the synsets between which we want to know the
	 *            similarity.
	 * @param s2
	 *            the other synset between which we want to know the similarity.
	 * @return the similarity between the two synsets.
	 * @throws JWNLException
	 *             if an error occurs accessing WordNet.
	 */
	public abstract double getSimilarity(Synset s1, Synset s2)
			throws JWNLException;

	/**
	 * Get the similarity between two words. The words can be specified either
	 * as just the word or in an encoded form including the POS tag and possibly
	 * the sense number, i.e. cat#n#1 would specify the 1st sense of the noun
	 * cat. Every synset of the first word is compared against every synset of
	 * the second and the highest scoring pair is reported.
	 * 
	 * @param w1
	 *            one of the words to compute similarity between.
	 * @param w2
	 *            the other word to compute similarity between.
	 * @return a SimilarityInfo instance detailing the similarity between the
	 *         two words specified, or null if no synsets could be found for
	 *         at least one of the words.
	 * @throws JWNLException
	 *             if an error occurs accessing WordNet.
	 */
	public final SimilarityInfo getSimilarity(String w1, String w2)
			throws JWNLException {
		// Get the (possibly) multiple synsets associated with each word
		Set<Synset> ss1 = getSynsets(w1);
		Set<Synset> ss2 = getSynsets(w2);

		// nothing has been compared yet; this stays null if either word
		// has no synsets
		SimilarityInfo sim = null;

		for (Synset s1 : ss1) {
			for (Synset s2 : ss2) {
				// for each pair of synsets get the similarity
				double score = getSimilarity(s1, s2);

				if (sim == null || score > sim.getSimilarity()) {
					// if the similarity is better than we have seen before
					// then create and store an info object describing the
					// similarity between the two synsets
					sim = new SimilarityInfo(w1, s1, w2, s2, score);
				}
			}
		}

		// return the maximum similarity we have found
		return sim;
	}

	/**
	 * Finds all the synsets associated with a specific word.
	 * 
	 * @param word
	 *            the word we are interested in. Note that this may be encoded
	 *            to include information on POS tag and sense index, in the
	 *            form word#pos#sense.
	 * @return a set of synsets that are associated with the supplied word;
	 *         empty if the word is unknown. If the word has a domain mapping
	 *         the mapped set itself is returned, so callers must not modify
	 *         the result.
	 * @throws JWNLException
	 *             if the POS tag is invalid or an error occurs accessing
	 *             WordNet
	 */
	private final Set<Synset> getSynsets(String word) throws JWNLException {
		// create an empty set to hold any synsets we find
		Set<Synset> synsets = new HashSet<Synset>();

		// split the word on the # characters so we can get at the
		// up to three components that could be present: word, POS tag, sense
		// index
		String[] data = word.split("#");

		// a word made up only of # characters splits into nothing at all,
		// in which case there is nothing to look up
		if (data.length == 0)
			return synsets;

		// if the word is in the domainMappings then simply return the mappings
		if (domainMappings.containsKey(data[0]))
			return domainMappings.get(data[0]);

		// get a handle on the WordNet dictionary
		Dictionary dict = Dictionary.getInstance();

		if (data.length == 1) {
			// if there is just the word

			for (IndexWord iw : dict.lookupAllIndexWords(data[0])
					.getIndexWordArray()) {
				// for each matching word in WordNet add all its senses to
				// the set we are building up
				synsets.addAll(Arrays.asList(iw.getSenses()));
			}

			// we have finished so return the synsets we found
			return synsets;
		}

		// the calling method specified a POS tag as well so get that
		POS pos = POS.getPOSForKey(data[1]);

		// if the POS tag isn't valid throw an exception
		if (pos == null)
			throw new JWNLException("Invalid POS Tag: " + data[1]);

		// get the word with the specified POS tag from WordNet
		IndexWord iw = dict.getIndexWord(pos, data[0]);

		// WordNet has no entry for this word with the requested POS tag so
		// there are no synsets to return
		if (iw == null)
			return synsets;

		if (data.length > 2) {
			// if the calling method specified a sense index then
			// add just that synset to the set we are creating
			// NOTE(review): the index is passed to IndexWord.getSense()
			// unchecked; presumably it is 1-based - confirm against JWNL
			synsets.add(iw.getSense(Integer.parseInt(data[2])));
		} else {
			// no sense index was specified so add all the senses of
			// the word to the set we are creating
			synsets.addAll(Arrays.asList(iw.getSenses()));
		}

		// return the set of synsets we found for the specified word
		return synsets;
	}
}