/************************************************************************ * Copyright (C) 2006-2007 The University of Sheffield * * Developed by Mark A. Greenwood <m.greenwood@dcs.shef.ac.uk> * * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation; either version 2 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program; if not, write to the Free Software * * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * ************************************************************************/ package edu.isistan.uima.unified.algorithms.similarity; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.net.URL; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import net.didion.jwnl.JWNL; import net.didion.jwnl.JWNLException; import net.didion.jwnl.data.POS; import net.didion.jwnl.data.Pointer; import net.didion.jwnl.data.PointerUtils; import net.didion.jwnl.data.Synset; import net.didion.jwnl.data.Word; import net.didion.jwnl.data.list.PointerTargetNode; /** * An abstract class that addes information content based methods to the * top level similarity measure class but doesn't itself define a * similarity measure. * @author Mark A. Greenwood */ public abstract class ICMeasure extends PathMeasure { /** * This map stores the synset IDs and there associated frequencies * as read from the supplied information content file. */ private Map<String,Double> freq = new HashMap<String,Double>(); protected void config(Map<String,String> params) throws Exception { super.config(params); //a handle to the infocontent file BufferedReader in = null; try { URL url = new URL(params.remove("infocontent")); //open the info content file for reading in = new BufferedReader(new InputStreamReader(url.openStream())); //get the first line from the file (should be the WordNet version info) String line = in.readLine(); //Check that what we have is actually a file of IC values if (line == null || !line.startsWith("wnver::")) throw new IOException("Malformed InfoContent file"); //Check that the IC file is meant for use with the version //of WordNet we are currently using if (!line.endsWith("::"+JWNL.getVersion().getNumber())) throw new Exception("InfoContent file version doesn't match WordNet version"); //Initially set the IC values of the noun and verb roots to 0 freq.put("n",0d); freq.put("v",0d); //Get the first line of real data ready for use line = in.readLine(); while (line != null && !line.equals("")) { //while there is still data in the file to process... //split the line on the whitespace String[] data = line.split("\\s+"); //store the frequency (2nd column) against the synset ID (1st column) freq.put(data[0],new Double(data[1])); if (data.length == 3 && data[2].equals("ROOT")) { //if there are three columns on this line and the //last one is ROOT then... //get the POS tag of the synset String pos = data[0].substring(data[0].length()-1); //updated the node frequency for the POS tag freq.put(pos, Double.parseDouble(data[1])+freq.get(pos)); } //read in the next line from the file ready for processing line = in.readLine(); } } finally { //if we managed to open the file then close it if (in != null) in.close(); } } /** * Generates the key to access the frequency count data loaded * from the information content file. * @param synset the synset for which to generate the key. * @return the key to access the frequency count map. */ protected String getFreqKey(Synset synset) { //the keys used by the infomation content files are simply //the offsets in the wordnet database (minus leading zeros) //followed by the single character POS tag. So simply build //a key of this type... return synset.getOffset()+synset.getPOS().getKey(); } /** * Gets the Information Content (IC) value associated with the given synset. * @param synset the synset for which to calcualte IC. * @return the IC of the given synset. */ protected double getIC(Synset synset) { //get the POS tag of this synset POS pos = synset.getPOS(); //Information Content is only defined for nouns and verbs //so return 0 if the POS tag is something else if (!pos.equals(POS.NOUN) && !pos.equals(POS.VERB)) return 0; //Get the frequency of this synset from the storred data Double synFreq = freq.get(getFreqKey(synset)); //if the frequency isn't defined or it's 0 then simlpy return 0 if (synFreq == null || synFreq.doubleValue() == 0) return 0; //Get the frequency of the root node for this POS tage Double rootFreq = freq.get(synset.getPOS().getKey()); //calcualte the probability for this synset double prob = synFreq.doubleValue() / rootFreq.doubleValue(); //if the probability is valid then use it to return the IC value if (prob > 0) return -Math.log(prob); //something went wrong so assume IC of 0 return 0; } /** * Returns the frequency of the root node of the hierarchy for the * given POS tag. * @param pos the POS tag of the root node to access * @return the frequency of the root node for the given POS tag */ protected double getFrequency(POS pos) { return freq.get(pos.getKey()); } /** * Returns the frequency of the given synset. * @param synset the synset to retrieve the frequency of * @return the frequency of the supplied synset */ protected double getFrequency(Synset synset) { Double f = freq.get(getFreqKey(synset)); if (f == null || f.doubleValue() == 0) return 0; return f.doubleValue(); } /** * Finds the lowerst common subsumer of the two synsets using information content. * @param s1 the first synset * @param s2 the second synset * @return the lowest common subsumer of the two provided synsets * @throws JWNLException if an error occurs accessing WordNet */ protected Synset getLCSbyIC(Synset s1, Synset s2) throws JWNLException { //TODO Handle the different types of LCS handled by the perl version which are // 1) Largest IC value // 2) Results in shortest path // 3) Greatest depth (i.e. the LCS whose shortest path to root is longest) //Although in here we only need the IC based one @SuppressWarnings("unchecked") List<List<PointerTargetNode>> trees1 = PointerUtils.getInstance().getHypernymTree(s1).toList(); @SuppressWarnings("unchecked") List<List<PointerTargetNode>> trees2 = PointerUtils.getInstance().getHypernymTree(s2).toList(); Set<Synset> pLCS = new HashSet<Synset>(); for (List<PointerTargetNode> t1 : trees1) { for (List<PointerTargetNode> t2 : trees2) { for (PointerTargetNode node : t1) { if (contains(t2,node.getSynset())) { pLCS.add(node.getSynset()); break; } } for (PointerTargetNode node : t2) { if (contains(t1,node.getSynset())) { pLCS.add(node.getSynset()); break; } } } } Synset lcs = null; double score = 0; for (Synset s : pLCS) { if (lcs == null) { lcs = s; score = getIC(s); } else { double ic = getIC(s); if (ic > score) { score = ic; lcs = s; } } } if (lcs == null && useSingleRoot()) { //link the two synsets by a fake root node //TODO: Should probably create one of these for each POS tag and cache them so that we can always return the same one lcs = new Synset(s1.getPOS(),0l,new Word[0],new Pointer[0],"",new java.util.BitSet()); } return lcs; } }