/*
 *    Topic.java
 *    Copyright (C) 2007 David Milne, d.n.milne@gmail.com
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

package org.wikipedia.miner.annotation;

import java.util.Vector;

import org.wikipedia.miner.model.*;
import org.wikipedia.miner.util.*;

/**
 * This class represents a topic that was automatically detected and disambiguated in a document.
 * <p>
 * A topic accumulates the positions of the ngrams that refer to it (see
 * {@link #addReference(TopicReference, double)}) together with running totals and maxima of their
 * link probabilities and disambiguation confidences.
 *
 * @author David Milne
 */
public class Topic extends Article {

	/** Multiplier applied to a link probability before taking its logarithm. */
	private static final double LINK_PROBABILITY_SCALE = 1000 ;

	/** Divisor applied to the logarithm of a scaled link probability. */
	private static final double LINK_PROBABILITY_LOG_DIVISOR = 4 ;

	/** Positions of the references to this topic, in the order in which they were added. */
	Vector<Position> positions ;

	private double relatednessToContext ;

	/** Negative until {@link #setRelatednessToOtherTopics(float)} stores a non-negative value. */
	private double relatednessToAllTopics ;

	private double totalLinkProbability ;
	private double maxLinkProbability ;

	private double totalDisambigConfidence ;
	private double maxDisambigConfidence ;

	private double docLength ;

	/**
	 * Initializes a new topic with no references.
	 *
	 * @param wikipedia an active instance of Wikipedia
	 * @param id the id of the article that this topic represents
	 * @param relatednessToContext the extent to which this topic relates to the surrounding unambiguous context
	 * @param docLength the length of the document, in characters
	 */
	public Topic(Wikipedia wikipedia, int id, double relatednessToContext, double docLength) {
		super(wikipedia.getEnvironment(), id) ;

		this.relatednessToContext = relatednessToContext ;
		// negative sentinel: "not calculated yet"
		this.relatednessToAllTopics = -1 ;
		this.docLength = docLength ;

		positions = new Vector<Position>() ;

		totalLinkProbability = 0 ;
		maxLinkProbability = 0 ;

		totalDisambigConfidence = 0 ;
		maxDisambigConfidence = 0 ;
	}

	/**
	 * Adds an ngram occurrence in the document that refers to this topic.
	 *
	 * @param reference the referring ngram (and its location)
	 * @param disambigConfidence the confidence with which the disambiguator chose this topic as the correct sense for the ngram
	 */
	public void addReference(TopicReference reference, double disambigConfidence) {
		positions.add(reference.getPosition()) ;

		double prob = reference.getLabel().getLinkProbability() ;
		totalLinkProbability = totalLinkProbability + prob ;
		if (prob > maxLinkProbability) {
			maxLinkProbability = prob ;
		}

		totalDisambigConfidence = totalDisambigConfidence + disambigConfidence ;
		if (disambigConfidence > maxDisambigConfidence) {
			maxDisambigConfidence = disambigConfidence ;
		}
	}

	/**
	 * @return the locations in this document that refer to this topic. This is the internal
	 *         vector itself, not a copy.
	 */
	public Vector<Position> getPositions() {
		return positions ;
	}

	/**
	 * @return the number of times this topic is referred to.
	 */
	public int getOccurances() {
		return positions.size() ;
	}

	/**
	 * @return the natural logarithm of (number of occurrences + 1).
	 */
	public double getNormalizedOccurances() {
		return Math.log(positions.size() + 1) ;
	}

	/**
	 * @return the extent to which this topic relates to surrounding unambiguous context.
	 */
	public double getRelatednessToContext() {
		return relatednessToContext ;
	}

	/**
	 * @return the extent to which this topic relates to all other topics detected in the document.
	 * @throws Exception (an {@link IllegalStateException}) if this has not been calculated yet
	 *         (this is the last step performed by the topic detector). The broad
	 *         <code>throws Exception</code> clause is retained so that existing callers keep compiling.
	 */
	public double getRelatednessToOtherTopics() throws Exception {
		if (relatednessToAllTopics < 0) {
			throw new IllegalStateException("Relatedness to other topics not calculated yet!") ;
		}

		return relatednessToAllTopics ;
	}

	/**
	 * Sets the relatedness of this topic to all other topics detected in the document.
	 * A negative value leaves the topic in the "not calculated yet" state.
	 *
	 * @param r the extent to which this topic relates to all other topics detected in the document.
	 */
	protected void setRelatednessToOtherTopics(float r) {
		this.relatednessToAllTopics = r ;
	}

	/**
	 * @return the maximum probability that the ngrams which refer to this topic would be links
	 *         (rather than plain text) if found in a random wikipedia article; 0 if no reference
	 *         has been added.
	 */
	public double getMaxLinkProbability() {
		return maxLinkProbability ;
	}

	/**
	 * @return the maximum link probability, rescaled as <code>ln(p * 1000 + 1) / 4</code>.
	 */
	public double getNormalizedMaxLinkProbability() {
		return normalizeLinkProbability(getMaxLinkProbability()) ;
	}

	/**
	 * @return the average probability that the ngrams which refer to this topic would be links
	 *         (rather than plain text) if found in a random wikipedia article; 0 if no reference
	 *         has been added.
	 */
	public double getAverageLinkProbability() {
		int count = positions.size() ;
		if (count == 0) {
			// avoid 0.0/0 == NaN; stay consistent with getMaxLinkProbability()
			return 0 ;
		}

		return totalLinkProbability / count ;
	}

	/**
	 * @return the average link probability, rescaled as <code>ln(p * 1000 + 1) / 4</code>.
	 */
	public double getNormalizedAverageLinkProbability() {
		return normalizeLinkProbability(getAverageLinkProbability()) ;
	}

	/**
	 * @return the maximum confidence with which the disambiguator chose this topic as the correct
	 *         sense for the ngrams from which it was mined; 0 if no reference has been added.
	 */
	public double getMaxDisambigConfidence() {
		return maxDisambigConfidence ;
	}

	/**
	 * @return the average confidence with which the disambiguator chose this topic as the correct
	 *         sense for the ngrams from which it was mined; 0 if no reference has been added.
	 */
	public double getAverageDisambigConfidence() {
		int count = positions.size() ;
		if (count == 0) {
			// avoid 0.0/0 == NaN; stay consistent with getMaxDisambigConfidence()
			return 0 ;
		}

		return totalDisambigConfidence / count ;
	}

	/**
	 * Note: this uses the first reference that was added, so it presumably relies on references
	 * being added in document order.
	 *
	 * @return the start of the first reference to this topic, divided by the document length
	 * @throws java.util.NoSuchElementException if no reference has been added
	 */
	public double getFirstOccurance() {
		Position start = positions.firstElement() ;
		return ((double) start.getStart()) / docLength ;
	}

	/**
	 * Note: this uses the reference that was added last, so it presumably relies on references
	 * being added in document order.
	 *
	 * @return the start of the last reference to this topic, divided by the document length
	 * @throws java.util.NoSuchElementException if no reference has been added
	 */
	public double getLastOccurance() {
		Position end = positions.lastElement() ;
		return ((double) end.getStart()) / docLength ;
	}

	/**
	 * @return the distance between the first and last occurrences of this topic, normalized by document length
	 * @throws java.util.NoSuchElementException if no reference has been added
	 */
	public double getSpread() {
		return getLastOccurance() - getFirstOccurance() ;
	}

	/**
	 * Rescales a link probability as <code>ln(p * 1000 + 1) / 4</code>.
	 */
	private static double normalizeLinkProbability(double probability) {
		double scaled = Math.log((probability * LINK_PROBABILITY_SCALE) + 1) ;
		return scaled / LINK_PROBABILITY_LOG_DIVISOR ;
	}
}