/*
 *    Context.java
 *    Copyright (C) 2007 David Milne, d.n.milne@gmail.com
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

package org.wikipedia.miner.annotation;

import java.text.*;
import java.util.*;

import org.wikipedia.miner.model.*;
import org.wikipedia.miner.util.*;

/**
 * A selection of unambiguous terms and their corresponding articles, which are used to resolve ambiguous terms.
 *
 * @author David Milne
 */
public class Context {

    /** The context articles chosen during construction, ordered by decreasing weight. */
    private Vector<Article> contextArticles;

    /** Sum of the weights of all context articles; used to normalize relatedness scores. */
    private float totalWeight;

    /** Shared cache so pairwise relatedness measures are computed at most once. */
    private RelatednessCache relatednessCache;

    /**
     * Initialises a collection of context articles from the given set of (possibly ambiguous) labels.
     * The most useful sense candidates — judged first by a blend of link probability and prior
     * probability, then re-weighted by their average relatedness to each other — are retained.
     *
     * @param labels a set of labels, the most useful of which will be used to disambiguate other terms
     * @param relatednessCache a cache in which relatedness measures will be saved so they aren't repeatedly calculated
     * @param maxSize the maximum number of context articles that will be used (the more there are, the longer
     *                disambiguation takes, but the more accurate it is likely to be)
     * @param minSenseLimit the minimum prior probability of a label's sense for it to be considered as context
     * @throws Exception if relatedness measures cannot be calculated
     */
    public Context(Collection<Label> labels, RelatednessCache relatednessCache, int maxSize, double minSenseLimit) throws Exception {

        this.relatednessCache = relatednessCache;

        // Maintain a bounded candidate pool (5x the final size) so we never sort/relate all senses.
        int maxCandidates = maxSize * 5;

        // Gather senses, kept sorted via binary-search insertion. Each sense's initial weight is
        // the average of its label's link probability and its own prior probability.
        ArrayList<Article> articles = new ArrayList<Article>();

        for (Label label : labels) {
            double lp = label.getLinkProbability();

            for (Label.Sense sense : label.getSenses()) {
                double sp = sense.getPriorProbability();

                // Below the threshold: stop scanning this label entirely.
                // NOTE(review): this assumes getSenses() returns senses in decreasing prior
                // probability, so every remaining sense is also below the limit — confirm in Label.
                if (sp < minSenseLimit)
                    break;

                // Dates (e.g. "January 1") make poor context articles; skip them.
                if (isDate(sense))
                    continue;

                sense.setWeight((lp + sp) / 2);

                int index = Collections.binarySearch(articles, sense);

                // Already in the candidate list; skip.
                if (index >= 0)
                    continue;

                // Convert the binarySearch "not found" result to an insertion point.
                index = (-1 * index) - 1;

                // Would fall beyond the bounded pool; skip.
                if (index >= maxCandidates)
                    continue;

                articles.add(index, sense);

                // FIX: evict the *last* (lowest-ranked) candidate. The original removed index
                // maxCandidates-1, which left the overflow element at index maxCandidates in place
                // and evicted the second-to-last candidate instead.
                if (articles.size() > maxCandidates)
                    articles.remove(articles.size() - 1);
            }
        }

        // Re-weight each candidate by its average relatedness to all other candidates,
        // blended 1:4 with its initial weight.
        // FIX: guard against a single-candidate pool, where the original divided by zero
        // and produced a NaN weight that poisoned totalWeight and every later score.
        if (articles.size() > 1) {
            for (Article art : articles) {

                double avgRelatedness = 0;
                for (Article art2 : articles) {
                    if (art.getId() != art2.getId())
                        avgRelatedness += relatednessCache.getRelatedness(art, art2);
                }
                avgRelatedness = avgRelatedness / (articles.size() - 1);

                art.setWeight((art.getWeight() + (4 * avgRelatedness)) / 5);
            }
        }

        Collections.sort(articles);

        // Keep the top maxSize articles and accumulate their total weight.
        contextArticles = new Vector<Article>();

        int c = 0;
        for (Article art : articles) {
            // FIX: was `c++ > maxSize`, which admitted maxSize+1 articles,
            // contradicting the documented meaning of maxSize.
            if (c++ >= maxSize)
                break;

            totalWeight += art.getWeight();
            contextArticles.add(art);
        }
    }

    /**
     * @return the quality (size and homogeneity) of the available context
     */
    public float getQuality() {
        return totalWeight;
    }

    /**
     * Compares the given article to all context articles.
     *
     * @param art the article to be compared
     * @return the weighted average relatedness between the article and the context articles,
     *         or 0 if there is no usable context
     * @throws Exception if relatedness measures cannot be calculated
     */
    public double getRelatednessTo(Article art) throws Exception {

        if (contextArticles.size() == 0 || totalWeight == 0)
            return 0;

        double relatedness = 0;

        for (Article contextArt : contextArticles) {
            double r = relatednessCache.getRelatedness(art, contextArt);

            // Weight each comparison by how strongly the context article was trusted.
            r = r * contextArt.getWeight();
            relatedness = relatedness + r;
        }

        return relatedness / totalWeight;
    }

    /**
     * Decides whether the given article represents a date (e.g. "January 1").
     *
     * NOTE(review): SimpleDateFormat.parse ignores trailing text, so any title that merely
     * *starts* with a "MMMM d" date (e.g. "January 1 massacre") is also treated as a date —
     * confirm this prefix-matching behavior is intended.
     *
     * @param art the article to test
     * @return true if the article's title parses as a month-day date, otherwise false
     */
    private boolean isDate(Article art) {

        // SimpleDateFormat is not thread-safe, so a fresh instance is created per call.
        SimpleDateFormat sdf = new SimpleDateFormat("MMMM d");

        Date date = null;
        try {
            date = sdf.parse(art.getTitle());
        } catch (ParseException e) {
            return false;
        }

        return (date != null);
    }
}