/*
* TopicDetector.java
* Copyright (C) 2007 David Milne, d.n.milne@gmail.com
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
package org.wikipedia.miner.annotation;
import java.io.*;
import java.util.*;
import java.util.regex.*;
import opennlp.tools.sentdetect.SentenceDetector;
import opennlp.tools.util.Span;
import org.wikipedia.miner.model.*;
import org.wikipedia.miner.model.Page.PageType;
import org.wikipedia.miner.util.*;
import org.wikipedia.miner.util.NGrammer.NGramSpan;
import org.wikipedia.miner.annotation.preprocessing.*;
/**
* This class detects topics that occur in plain text, using a Disambiguator to resolve ambiguous terms and phrases.
* Many of the detected topics will be rubbish (extracted from unhelpful terms, such as <em>and</em> or <em>the</em>), so you will probably want to use either a LinkDetector or
* some simple heuristics to weed out the least useful ones (see Topic for the features available for separating important topics from less helpful ones).
* <p>
* This class also doesn't resolve collisions (e.g. "united states" collides with "states of america" within "united states of america").
* The DocumentTagger provides methods to resolve these.
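* <p>
* A minimal usage sketch (construction of the Wikipedia instance and the trained
* Disambiguator is environment-specific and merely assumed here):
* <pre>{@code
* TopicDetector detector = new TopicDetector(wikipedia, disambiguator);
* detector.setDisambiguationPolicy(TopicDetector.DisambiguationPolicy.STRICT);
* Collection<Topic> topics = detector.getTopics("Plain text to mine for topics.", null);
* }</pre>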
*
* @author David Milne
*/
public class TopicDetector {
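/**
* STRICT resolves each reference to at most one (the most probable) sense;
* LOOSE retains every sense that the disambiguator considers plausible.
*/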
public enum DisambiguationPolicy {STRICT, LOOSE}
private Wikipedia wikipedia ;
private Disambiguator disambiguator ;
private DisambiguationPolicy disambigPolicy = DisambiguationPolicy.STRICT;
private boolean allowDisambiguations = false ;
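//how many of the most significant topics to consider when measuring each topic's relatedness to the others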
private int maxTopicsForRelatedness = 25 ;
private NGrammer nGrammer ;
/**
* Initializes a new topic detector.
*
* @param wikipedia an initialized instance of Wikipedia
* @param disambiguator a trained Disambiguator for resolving ambiguous labels
* @throws IOException if the resources required for sentence detection and tokenization cannot be read
*/
public TopicDetector(Wikipedia wikipedia, Disambiguator disambiguator) throws IOException {
this.wikipedia = wikipedia ;
this.disambiguator = disambiguator ;
this.nGrammer = new NGrammer(wikipedia.getConfig().getSentenceDetector(), wikipedia.getConfig().getTokenizer()) ;
this.nGrammer.setMaxN(disambiguator.getMaxLabelLength()) ;
//TODO:Check caching
/*
if (!wikipedia.getEnvironment().isGeneralityCached())
System.err.println("TopicDetector | Warning: generality has not been cached, so this will run significantly slower than it needs to.") ;
*/
}
public DisambiguationPolicy getDisambiguationPolicy() {
return disambigPolicy ;
}
public void setDisambiguationPolicy(DisambiguationPolicy dp) {
disambigPolicy = dp ;
}
public boolean areDisambiguationsAllowed() {
return allowDisambiguations ;
}
public void allowDisambiguations(boolean val) {
allowDisambiguations = val ;
}
/**
* Gathers a collection of topics from the given document.
*
* @param doc a document that has been preprocessed so that markup (HTML, MediaWiki, etc.) is safely ignored.
* @param rc a cache in which relatedness measures will be saved so they aren't repeatedly calculated. This may be null.
* @return a vector of topics that were mined from the document.
* @throws Exception
*/
public Vector<Topic> getTopics(PreprocessedDocument doc, RelatednessCache rc) throws Exception {
if (rc == null)
rc = new RelatednessCache(disambiguator.getArticleComparer()) ;
Vector<TopicReference> references = getReferences(doc.getPreprocessedText()) ;
Collection<Topic> temp = getTopics(references, doc.getContextText(), doc.getOriginalText().length(), rc).values() ;
calculateRelatedness(temp, rc) ;
Vector<Topic> topics = new Vector<Topic>() ;
for (Topic t:temp) {
if (!doc.isTopicBanned(t.getId()))
topics.add(t) ;
}
return topics ;
}
/**
* Gathers a collection of topics from the given document.
*
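* A sketch of typical use, assuming a constructed TopicDetector called <code>detector</code>
* (only methods defined within this package are used):
* <pre>{@code
* Collection<Topic> topics = detector.getTopics("Plain text to mine.", null);
* for (Topic t : topics)
*     System.out.println(t.getId() + " occurs " + t.getOccurances() + " time(s)");
* }</pre>
*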
* @param text text to mine topics from. This must be plain text, without any form of markup.
* @param rc a cache in which relatedness measures will be saved so they aren't repeatedly calculated. This may be null.
* @return a collection of topics that were mined from the document.
* @throws Exception
*/
public Collection<Topic> getTopics(String text, RelatednessCache rc) throws Exception {
if (rc == null)
rc = new RelatednessCache(disambiguator.getArticleComparer()) ;
Vector<TopicReference> references = getReferences(text) ;
HashMap<Integer,Topic> topicsById = getTopics(references, "", text.length(), rc) ;
Collection<Topic> topics = topicsById.values() ;
calculateRelatedness(topics, rc) ;
return topics ;
}
private void calculateRelatedness(Collection<Topic> topics, RelatednessCache cache) throws Exception{
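//Weight each article topic by its average link probability multiplied by its number of
//occurrences; the most heavily weighted topics then serve as reference points below.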
TreeSet<Article> weightedTopics = new TreeSet<Article>() ;
for (Topic t:topics) {
if (t.getType() != PageType.article)
continue ;
Article art = (Article)wikipedia.getPageById(t.getId()) ;
art.setWeight(t.getAverageLinkProbability() * t.getOccurances()) ;
weightedTopics.add(art) ;
}
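//Each topic's relatedness score is the weighted average of its relatedness to the most
//significant article topics gathered above (at most maxTopicsForRelatedness of them).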
for (Topic topic: topics) {
double totalWeight = 0 ;
double totalWeightedRelatedness = 0 ;
int count = 0 ;
for (Article art: weightedTopics) {
if (count++ >= maxTopicsForRelatedness)
break ;
double weightedRelatedness = art.getWeight() * cache.getRelatedness(topic, art) ;
totalWeight = totalWeight + art.getWeight();
totalWeightedRelatedness = totalWeightedRelatedness + weightedRelatedness;
}
//avoid a divide-by-zero (and a NaN score) when no article topics were available for comparison
if (totalWeight > 0)
topic.setRelatednessToOtherTopics((float)(totalWeightedRelatedness/totalWeight)) ;
else
topic.setRelatednessToOtherTopics(0f) ;
}
}
private Vector<TopicReference> getReferences(String text) {
Vector<TopicReference> references = new Vector<TopicReference>() ;
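//consider every ngram that matches a known label, keeping only those that are likely
//enough to be used as links within Wikipedia itself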
for (NGramSpan span:nGrammer.ngramPosDetect(text)) {
Label label = wikipedia.getLabel(span, text) ;
//System.out.println(" - " + label.getText() + ", " + label.exists() + ", " + label.getLinkProbability() + "," + label.getLinkDocCount()) ;
if (!label.exists())
continue ;
if (label.getLinkProbability() < disambiguator.getMinLinkProbability())
continue ;
//if (label.getLinkDocCount() < wikipedia.getConfig().getMinLinksIn())
// continue ;
//System.out.println("adding ref: " + label.getText()) ;
TopicReference ref = new TopicReference(label, new Position(span.getStart(), span.getEnd())) ;
references.add(ref) ;
}
return references ;
}
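/**
* Resolves the given references against their possible senses, using the surrounding
* labels (plus any additional context text) as disambiguation evidence, and groups
* the results into topics keyed by article id.
*/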
private HashMap<Integer,Topic> getTopics(Vector<TopicReference> references, String contextText, int docLength, RelatednessCache cache) throws Exception{
HashMap<Integer,Topic> chosenTopics = new HashMap<Integer,Topic>() ;
HashSet<String> detectedLabels = new HashSet<String>() ;
Vector<Label> labels = new Vector<Label>() ;
for (TopicReference ref:references) {
if (detectedLabels.contains(ref.getLabel().getText()))
continue ;
labels.add(ref.getLabel()) ;
detectedLabels.add(ref.getLabel().getText()) ;
}
//get context articles from additional context text
for (TopicReference ref:getReferences(contextText)){
if (detectedLabels.contains(ref.getLabel().getText()))
continue ;
labels.add(ref.getLabel()) ;
detectedLabels.add(ref.getLabel().getText()) ;
}
Context context ;
if (cache == null)
context = new Context(labels, new RelatednessCache(disambiguator.getArticleComparer()), disambiguator.getMaxContextSize(), disambiguator.getMinSenseProbability() * 5) ;
else
context = new Context(labels, cache, disambiguator.getMaxContextSize(), disambiguator.getMinSenseProbability()) ;
labels = null ;
//now disambiguate all references
//unambig references are still processed here, because we need to calculate relatedness to context anyway.
// build a cache of valid senses for each phrase, since the same phrase may occur more than once, but will always be disambiguated the same way
HashMap<String, ArrayList<CachedSense>> disambigCache = new HashMap<String, ArrayList<CachedSense>>() ;
for (TopicReference ref:references) {
//System.out.println("disambiguating ref: " + ref.getLabel().getText()) ;
ArrayList<CachedSense> validSenses = disambigCache.get(ref.getLabel().getText()) ;
if (validSenses == null) {
// we haven't seen this label in this document before
validSenses = new ArrayList<CachedSense>() ;
for (Label.Sense sense: ref.getLabel().getSenses()) {
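//senses are returned in order of decreasing prior probability, so once one falls
//below the threshold, all of the remaining senses will too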
if (sense.getPriorProbability() < disambiguator.getMinSenseProbability()) break ;
if (!allowDisambiguations && sense.getType() == PageType.disambiguation)
continue ;
double relatedness = context.getRelatednessTo(sense) ;
double commonness = sense.getPriorProbability() ;
double disambigProb = disambiguator.getProbabilityOfSense(commonness, relatedness, context) ;
//System.out.println(" - sense " + sense + ", " + disambigProb) ;
if (disambigProb > 0.1) {
// there is at least a chance that this is a valid sense for the link (there may be more than one)
CachedSense vs = new CachedSense(sense.getId(), commonness, relatedness, disambigProb) ;
validSenses.add(vs) ;
}
}
Collections.sort(validSenses) ;
disambigCache.put(ref.getLabel().getText(), validSenses) ;
}
if (disambigPolicy == DisambiguationPolicy.STRICT) {
//just get top sense
if (!validSenses.isEmpty()) {
CachedSense sense = validSenses.get(0) ;
Topic topic = chosenTopics.get(sense.id) ;
if (topic == null) {
// we haven't seen this topic before
topic = new Topic(wikipedia, sense.id, sense.relatedness, docLength) ;
chosenTopics.put(sense.id, topic) ;
}
topic.addReference(ref, sense.disambigConfidence) ;
}
} else {
//get all senses
for (CachedSense sense: validSenses) {
Topic topic = chosenTopics.get(sense.id) ;
if (topic == null) {
// we haven't seen this topic before
topic = new Topic(wikipedia, sense.id, sense.relatedness, docLength) ;
chosenTopics.put(sense.id, topic) ;
}
topic.addReference(ref, sense.disambigConfidence) ;
}
}
}
return chosenTopics ;
}
private class CachedSense implements Comparable<CachedSense>{
int id ;
double commonness ;
double relatedness ;
double disambigConfidence ;
/**
* Initializes a new CachedSense
*
* @param id the id of the article that represents this sense
* @param commonness the prior probability of this sense given a source ngram (label)
* @param relatedness the relatedness of this sense to the surrounding unambiguous topics
* @param disambigConfidence the probability that this sense is valid, as defined by the disambiguator.
*/
public CachedSense(int id, double commonness, double relatedness, double disambigConfidence) {
this.id = id ;
this.commonness = commonness ;
this.relatedness = relatedness ;
this.disambigConfidence = disambigConfidence ;
}
public int compareTo(CachedSense sense) {
// order by disambiguation confidence, highest first
return Double.compare(sense.disambigConfidence, disambigConfidence) ;
}
}
}