/************************************************************************ * Copyright (C) 2006-2007 The University of Sheffield * * Developed by Mark A. Greenwood <m.greenwood@dcs.shef.ac.uk> * * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation; either version 2 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program; if not, write to the Free Software * * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * ************************************************************************/ package edu.isistan.uima.unified.algorithms.similarity; import java.io.BufferedReader; import java.io.InputStreamReader; import java.net.URL; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.Map; import java.util.Set; import net.didion.jwnl.JWNLException; import net.didion.jwnl.data.IndexWord; import net.didion.jwnl.data.POS; import net.didion.jwnl.data.Synset; import net.didion.jwnl.dictionary.Dictionary; /** * An abstract notion of a similarity measure that all provided implementations * extend. * * @author Mark A. Greenwood */ public abstract class SimilarityMeasure { /** * A mapping of terms to specific synsets. Usually used to map domain terms * to a restricted set of synsets but can also be used to map named entity * tags to appropriate synsets. */ private Map<String, Set<Synset>> domainMappings = new HashMap<String, Set<Synset>>(); /** * The maximum size the cache can grow to */ private int cacheSize = 5000; /** * To speed up computation of the similarity between two synsets we cache * each similarity that is computed so we only have to do each one once. */ @SuppressWarnings("serial") private Map<String, Double> cache = new LinkedHashMap<String, Double>(16, 0.75f, true) { public boolean removeEldestEntry(Map.Entry<String, Double> eldest) { // if the size is less than zero then the user is asking us // not to limit the size of the cache so return false if (cacheSize < 0) return false; // if the cache has crown bigger than it's max size return true return size() > cacheSize; } }; /** * Get a previously computed similarity between two synsets from the cache. * * @param s1 * the first synset between which we are looking for the * similarity. * @param s2 * the other synset between which we are looking for the * similarity. * @return The similarity between the two sets or null if it is not in the * cache. */ protected final Double getFromCache(Synset s1, Synset s2) { return cache.get(s1.getKey() + "-" + s2.getKey()); } /** * Add a computed similarity between two synsets to the cache so that we * don't have to compute it if it is needed in the future. * * @param s1 * one of the synsets between which we are storring a similarity. * @param s2 * the other synset between which we are storring a similarity. * @param sim * the similarity between the two supplied synsets. * @return the similarity score just added to the cache. */ protected final double addToCache(Synset s1, Synset s2, double sim) { cache.put(s1.getKey() + "-" + s2.getKey(), sim); return sim; } /** * Configures the similarity measure using the supplied parameters. * * @param params * a set of key-value pairs that are used to configure the * similarity measure. See concrete implementations for details * of expected/possible parameters. * @throws Exception * if an error occurs while configuring the similarity measure. */ protected abstract void config(Map<String, String> params) throws Exception; /** * Create a new instance of a similarity measure. * * @param confURL * the URL of a configuration file. Parameters are specified one * per line as key:value pairs. * @return a new instance of a similairy measure as defined by the supplied * configuration URL. * @throws Exception * if an error occurs while creating the similarity measure. */ public static SimilarityMeasure newInstance(URL confURL) throws Exception { // create map to hold the key-value pairs we are going to read from // the configuration file Map<String, String> params = new HashMap<String, String>(); // create a reader for the config file BufferedReader in = null; try { // open the config file in = new BufferedReader(new InputStreamReader(confURL.openStream())); String line = in.readLine(); while (line != null) { line = line.trim(); if (!line.equals("")) { // if the line contains something then // split the data so we get the key and value String[] data = line.split("\\s*:\\s*", 2); if (data.length == 2) { // if the line is valid add the two parts to the map params.put(data[0], data[1]); } else { // if the line isn't valid tell the user but continue on // with the rest of the file System.out.println("Config Line is Malformed: " + line); } } // get the next line ready to process line = in.readLine(); } } finally { // close the config file if it got opened if (in != null) in.close(); } // create and return a new instance of the similarity measure specified // by the config file return newInstance(params); } /** * Creates a new instance of a similarity measure using the supplied * parameters. * * @param params * a set of key-value pairs which define the similarity measure. * @return the newly created similarity measure. * @throws Exception * if an error occurs while creating the similarity measure. */ public static SimilarityMeasure newInstance(Map<String, String> params) throws Exception { // get the class name of the implementation we need to load String name = params.remove("simType"); // if the name hasn't been specified then throw an exception if (name == null) throw new Exception("Must specifiy the similarity measure to use"); // Get hold of the class we need to load @SuppressWarnings("unchecked") Class<SimilarityMeasure> c = (Class<SimilarityMeasure>) Class .forName(name); // create a new instance of the similarity measure SimilarityMeasure sim = c.newInstance(); // get the cache parameter from the config params String cSize = params.remove("cache"); // if a cache size was specified then set it if (cSize != null) sim.cacheSize = Integer.parseInt(cSize); // get the url of the domain mapping file String mapURL = params.remove("mapping"); if (mapURL != null) { // if a mapping file has been provided then // open a reader over the file BufferedReader in = new BufferedReader(new InputStreamReader( (new URL(mapURL)).openStream())); // get the first line ready for processing String line = in.readLine(); while (line != null) { if (!line.startsWith("#")) { // if the line isn't a comment (i.e. it doesn't start with // #) then... // split the line at the white space String[] data = line.trim().split("\\s+"); // create a new set to hold the mapped synsets Set<Synset> mappedTo = new HashSet<Synset>(); for (int i = 1; i < data.length; ++i) { // for each synset mapped to get the actual Synsets // and store them in the set mappedTo.addAll(sim.getSynsets(data[i])); } // if we have found some actual synsets then // store them in the domain mappings if (mappedTo.size() > 0) sim.domainMappings.put(data[0], mappedTo); } // get the next line from the file line = in.readLine(); } // we have finished with the mappings file so close it in.close(); } // make sure it is configured properly sim.config(params); // then return it return sim; } /** * This is the method responsible for computing the similarity between two * specific synsets. The method is implemented differently for each * similarity measure so see the subclasses for detailed information. * * @param s1 * one of the synsets between which we want to know the * similarity. * @param s2 * the other synset between which we want to know the similarity. * @return the similarity between the two synsets. * @throws JWNLException * if an error occurs accessing WordNet. */ public abstract double getSimilarity(Synset s1, Synset s2) throws JWNLException; /** * Get the similarity between two words. The words can be specified either * as just the word or in an encoded form including the POS tag and possibly * the sense number, i.e. cat#n#1 would specifiy the 1st sense of the noun * cat. * * @param w1 * one of the words to compute similarity between. * @param w2 * the other word to compute similarity between. * @return a SimilarityInfo instance detailing the similarity between the * two words specified. * @throws JWNLException * if an error occurs accessing WordNet. */ public final SimilarityInfo getSimilarity(String w1, String w2) throws JWNLException { // Get the (possibly) multiple synsets associated with each word Set<Synset> ss1 = getSynsets(w1); Set<Synset> ss2 = getSynsets(w2); // assume the words are not at all similar SimilarityInfo sim = null; for (Synset s1 : ss1) { for (Synset s2 : ss2) { // for each pair of synsets get the similarity double score = getSimilarity(s1, s2); if (sim == null || score > sim.getSimilarity()) { // if the similarity is better than we have seen before // then create and store an info object describing the // similarity between the two synsets sim = new SimilarityInfo(w1, s1, w2, s2, score); } } } // return the maximum similarity we have found return sim; } /** * Finds all the synsets associated with a specific word. * * @param word * the word we are interested. Note that this may be encoded to * include information on POS tag and sense index. * @return a set of synsets that are associated with the supplied word * @throws JWNLException * if an error occurs accessing WordNet */ private final Set<Synset> getSynsets(String word) throws JWNLException { // get a handle on the WordNet dictionary Dictionary dict = Dictionary.getInstance(); // create an emptuy set to hold any synsets we find Set<Synset> synsets = new HashSet<Synset>(); // split the word on the # characters so we can get at the // upto three componets that could be present: word, POS tag, sense // index String[] data = word.split("#"); // if the word is in the domainMappings then simply return the mappings if (domainMappings.containsKey(data[0])) return domainMappings.get(data[0]); if (data.length == 1) { // if there is just the word for (IndexWord iw : dict.lookupAllIndexWords(data[0]) .getIndexWordArray()) { // for each matching word in WordNet add all it's senses to // the set we are building up synsets.addAll(Arrays.asList(iw.getSenses())); } // we have finihsed so return the synsets we found return synsets; } // the calling method specified a POS tag as well so get that POS pos = POS.getPOSForKey(data[1]); // if the POS tag isn't valid throw an exception if (pos == null) throw new JWNLException("Invalid POS Tag: " + data[1]); // get the word with the specified POS tag from WordNet IndexWord iw = dict.getIndexWord(pos, data[0]); if (data.length > 2) { // if the calling method specified a sense index then // add just that sysnet to the set we are creating synsets.add(iw.getSense(Integer.parseInt(data[2]))); } else { // no sense index was specified so add all the senses of // the word to the set we are creating synsets.addAll(Arrays.asList(iw.getSenses())); } // return the set of synsets we found for the specified word return synsets; } }