/*
 *    TopicDetector.java
 *    Copyright (C) 2007 David Milne, d.n.milne@gmail.com
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

package org.wikipedia.miner.annotation;

import java.io.*;
import java.util.*;
import java.util.regex.*;

import opennlp.tools.sentdetect.SentenceDetector;
import opennlp.tools.util.Span;

import org.wikipedia.miner.model.*;
import org.wikipedia.miner.model.Page.PageType;
import org.wikipedia.miner.util.*;
import org.wikipedia.miner.util.NGrammer.NGramSpan;
import org.wikipedia.miner.annotation.preprocessing.*;

/**
 * This class detects topics that occur in plain text, using a Disambiguator to resolve ambiguous terms and phrases.
 * Many of the detected topics will be rubbish (extracted from unhelpful terms, such as <em>and</em> or <em>the</em>),
 * so you will probably want to use either a LinkDetector or some simple heuristics to weed out the least useful ones
 * (see Topic for the features that are available for separating important topics from less helpful ones).
 * <p>
 * This also doesn't resolve collisions (e.g. "united states" collides with "states of america" in "united states of america").
 * The DocumentTagger has methods to resolve these.
 * <p>
 * A minimal usage sketch appears at the end of this file.
 *
 * @author David Milne
 */
public class TopicDetector {

	public enum DisambiguationPolicy {STRICT, LOOSE}

	private Wikipedia wikipedia ;
	private Disambiguator disambiguator ;

	private DisambiguationPolicy disambigPolicy = DisambiguationPolicy.STRICT ;
	private boolean allowDisambiguations = false ;

	private int maxTopicsForRelatedness = 25 ;

	private NGrammer nGrammer ;

	/**
	 * Initializes a new topic detector.
	 *
	 * @param wikipedia an initialized instance of Wikipedia
	 * @param disambiguator a trained disambiguator
	 * @throws IOException
	 */
	public TopicDetector(Wikipedia wikipedia, Disambiguator disambiguator) throws IOException {

		this.wikipedia = wikipedia ;
		this.disambiguator = disambiguator ;

		this.nGrammer = new NGrammer(wikipedia.getConfig().getSentenceDetector(), wikipedia.getConfig().getTokenizer()) ;
		this.nGrammer.setMaxN(disambiguator.getMaxLabelLength()) ;

		//TODO: check caching
		/*
		if (!wikipedia.getEnvironment().isGeneralityCached())
			System.err.println("TopicDetector | Warning: generality has not been cached, so this will run significantly slower than it needs to.") ;
		*/
	}

	public DisambiguationPolicy getDisambiguationPolicy() {
		return disambigPolicy ;
	}

	public void setDisambiguationPolicy(DisambiguationPolicy dp) {
		disambigPolicy = dp ;
	}

	public boolean areDisambiguationsAllowed() {
		return allowDisambiguations ;
	}

	public void allowDisambiguations(boolean val) {
		allowDisambiguations = val ;
	}
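	/*
	 * Processing pipeline, summarized from the private methods below:
	 *  1. getReferences() runs the n-grammer over the text and keeps every span
	 *     that matches a known label with sufficient link probability.
	 *  2. getTopics() gathers the distinct labels (plus any additional context
	 *     text) into a Context, then disambiguates each reference against it.
	 *  3. calculateRelatedness() scores how strongly each chosen topic relates
	 *     to the other significant topics in the document.
	 */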
	/**
	 * Gathers a collection of topics from the given document.
	 *
	 * @param doc a document that has been preprocessed so that markup (html, mediawiki, etc) is safely ignored.
	 * @param rc a cache in which relatedness measures will be saved so they aren't repeatedly calculated. This may be null.
	 * @return a vector of topics that were mined from the document.
	 * @throws Exception
	 */
	public Vector<Topic> getTopics(PreprocessedDocument doc, RelatednessCache rc) throws Exception {

		if (rc == null)
			rc = new RelatednessCache(disambiguator.getArticleComparer()) ;

		Vector<TopicReference> references = getReferences(doc.getPreprocessedText()) ;

		Collection<Topic> temp = getTopics(references, doc.getContextText(), doc.getOriginalText().length(), rc).values() ;
		calculateRelatedness(temp, rc) ;

		Vector<Topic> topics = new Vector<Topic>() ;
		for (Topic t:temp) {
			if (!doc.isTopicBanned(t.getId()))
				topics.add(t) ;
		}

		return topics ;
	}

	/**
	 * Gathers a collection of topics from the given text.
	 *
	 * @param text text to mine topics from. This must be plain text, without any form of markup.
	 * @param rc a cache in which relatedness measures will be saved so they aren't repeatedly calculated. This may be null.
	 * @return a collection of topics that were mined from the text.
	 * @throws Exception
	 */
	public Collection<Topic> getTopics(String text, RelatednessCache rc) throws Exception {

		if (rc == null)
			rc = new RelatednessCache(disambiguator.getArticleComparer()) ;

		Vector<TopicReference> references = getReferences(text) ;

		HashMap<Integer,Topic> topicsById = getTopics(references, "", text.length(), rc) ;
		Collection<Topic> topics = topicsById.values() ;

		calculateRelatedness(topics, rc) ;

		return topics ;
	}
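	/*
	 * The method below weights each article topic by (average link probability x
	 * occurrence count), then scores every topic by a weighted average of its
	 * relatedness to the most significant topics. Illustrative arithmetic with
	 * hypothetical numbers: given two weighted articles, weight 4.0 at
	 * relatedness 0.8 and weight 1.0 at relatedness 0.2, a topic is assigned
	 * (4.0*0.8 + 1.0*0.2) / (4.0 + 1.0) = 3.4 / 5.0 = 0.68.
	 */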
	private void calculateRelatedness(Collection<Topic> topics, RelatednessCache cache) throws Exception {

		// weight each article topic by how prominent it is in the document
		TreeSet<Article> weightedTopics = new TreeSet<Article>() ;
		for (Topic t:topics) {

			if (t.getType() != PageType.article)
				continue ;

			Article art = (Article)wikipedia.getPageById(t.getId()) ;
			art.setWeight(t.getAverageLinkProbability() * t.getOccurances()) ;
			weightedTopics.add(art) ;
		}

		// score each topic against the most heavily weighted topics
		// (note: if no article topics were found, totalWeight stays zero and the result is NaN)
		for (Topic topic: topics) {

			double totalWeight = 0 ;
			double totalWeightedRelatedness = 0 ;

			int count = 0 ;
			for (Article art: weightedTopics) {

				if (count++ > maxTopicsForRelatedness)
					break ;

				double weightedRelatedness = art.getWeight() * cache.getRelatedness(topic, art) ;

				totalWeight = totalWeight + art.getWeight() ;
				totalWeightedRelatedness = totalWeightedRelatedness + weightedRelatedness ;
			}

			topic.setRelatednessToOtherTopics((float)(totalWeightedRelatedness/totalWeight)) ;
		}
	}

	private Vector<TopicReference> getReferences(String text) {

		Vector<TopicReference> references = new Vector<TopicReference>() ;

		for (NGramSpan span:nGrammer.ngramPosDetect(text)) {

			Label label = wikipedia.getLabel(span, text) ;

			if (!label.exists())
				continue ;

			if (label.getLinkProbability() < disambiguator.getMinLinkProbability())
				continue ;

			//if (label.getLinkDocCount() < wikipedia.getConfig().getMinLinksIn())
			//	continue ;

			TopicReference ref = new TopicReference(label, new Position(span.getStart(), span.getEnd())) ;
			references.add(ref) ;
		}

		return references ;
	}

	private HashMap<Integer,Topic> getTopics(Vector<TopicReference> references, String contextText, int docLength, RelatednessCache cache) throws Exception {

		HashMap<Integer,Topic> chosenTopics = new HashMap<Integer,Topic>() ;

		/*
		// get context articles from unambiguous labels
		Vector<Label> unambigLabels = new Vector<Label>() ;
		for (TopicReference ref:references) {
			Label label = ref.getLabel() ;
			Label.Sense[] senses = label.getSenses() ;

			if (senses.length > 0) {
				if (senses.length == 1 || senses[0].getPriorProbability() > 1-disambiguator.getMinSenseProbability())
					unambigLabels.add(label) ;
			}
		}

		// get context articles from additional context text
		for (TopicReference ref:getReferences(contextText)) {
			Label label = ref.getLabel() ;
			Label.Sense[] senses = label.getSenses() ;

			if (senses.length > 0) {
				if (senses.length == 1 || senses[0].getPriorProbability() > 1-disambiguator.getMinSenseProbability())
					unambigLabels.add(label) ;
			}
		}
		*/

		// gather the distinct labels detected in the document
		HashSet<String> detectedLabels = new HashSet<String>() ;
		Vector<Label> labels = new Vector<Label>() ;

		for (TopicReference ref:references) {
			if (detectedLabels.contains(ref.getLabel().getText()))
				continue ;

			labels.add(ref.getLabel()) ;
			detectedLabels.add(ref.getLabel().getText()) ;
		}

		// get context articles from additional context text
		for (TopicReference ref:getReferences(contextText)) {
			if (detectedLabels.contains(ref.getLabel().getText()))
				continue ;

			labels.add(ref.getLabel()) ;
			detectedLabels.add(ref.getLabel().getText()) ;
		}

		Context context ;
		if (cache == null)
			context = new Context(labels, new RelatednessCache(disambiguator.getArticleComparer()), disambiguator.getMaxContextSize(), disambiguator.getMinSenseProbability() * 5) ;
		else
			context = new Context(labels, cache, disambiguator.getMaxContextSize(), disambiguator.getMinSenseProbability()) ;

		labels = null ;

		// now disambiguate all references
		// unambiguous references are still processed here, because we need to calculate relatedness to context anyway.

		// build a cache of valid senses for each phrase, since the same phrase may occur more than once, but will always be disambiguated the same way
		HashMap<String, ArrayList<CachedSense>> disambigCache = new HashMap<String, ArrayList<CachedSense>>() ;

		for (TopicReference ref:references) {

			ArrayList<CachedSense> validSenses = disambigCache.get(ref.getLabel().getText()) ;

			if (validSenses == null) {
				// we haven't seen this label in this document before
				validSenses = new ArrayList<CachedSense>() ;

				for (Label.Sense sense: ref.getLabel().getSenses()) {

					// senses arrive in order of prior probability, so stop once it drops too low
					if (sense.getPriorProbability() < disambiguator.getMinSenseProbability())
						break ;

					if (!allowDisambiguations && sense.getType() == PageType.disambiguation)
						continue ;

					double relatedness = context.getRelatednessTo(sense) ;
					double commonness = sense.getPriorProbability() ;

					double disambigProb = disambiguator.getProbabilityOfSense(commonness, relatedness, context) ;

					if (disambigProb > 0.1) {
						// there is at least a chance that this is a valid sense for the link (there may be more than one)
						CachedSense vs = new CachedSense(sense.getId(), commonness, relatedness, disambigProb) ;
						validSenses.add(vs) ;
					}
				}

				Collections.sort(validSenses) ;
				disambigCache.put(ref.getLabel().getText(), validSenses) ;
			}

			if (disambigPolicy == DisambiguationPolicy.STRICT) {
				// just get the top sense
				if (!validSenses.isEmpty()) {
					CachedSense sense = validSenses.get(0) ;

					Topic topic = chosenTopics.get(sense.id) ;
					if (topic == null) {
						// we haven't seen this topic before
						topic = new Topic(wikipedia, sense.id, sense.relatedness, docLength) ;
						chosenTopics.put(sense.id, topic) ;
					}
					topic.addReference(ref, sense.disambigConfidence) ;
				}
			} else {
				// get all valid senses
				for (CachedSense sense: validSenses) {

					Topic topic = chosenTopics.get(sense.id) ;
					if (topic == null) {
						// we haven't seen this topic before
						topic = new Topic(wikipedia, sense.id, sense.relatedness, docLength) ;
						chosenTopics.put(sense.id, topic) ;
					}
					topic.addReference(ref, sense.disambigConfidence) ;
				}
			}
		}

		return chosenTopics ;
	}
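	/*
	 * Note on the two policies applied above: under STRICT, each detected label
	 * contributes at most one topic (the cached sense with the highest
	 * disambiguation confidence), while under LOOSE every sense that passed the
	 * 0.1 probability threshold becomes a topic, so a single ambiguous phrase
	 * can yield several competing topics.
	 */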
	private class CachedSense implements Comparable<CachedSense> {

		int id ;
		double commonness ;
		double relatedness ;
		double disambigConfidence ;

		/**
		 * Initializes a new CachedSense
		 *
		 * @param id the id of the article that represents this sense
		 * @param commonness the prior probability of this sense given a source ngram (label)
		 * @param relatedness the relatedness of this sense to the surrounding unambiguous topics
		 * @param disambigConfidence the probability that this sense is valid, as defined by the disambiguator.
		 */
		public CachedSense(int id, double commonness, double relatedness, double disambigConfidence) {
			this.id = id ;
			this.commonness = commonness ;
			this.relatedness = relatedness ;
			this.disambigConfidence = disambigConfidence ;
		}

		// sort in descending order of disambiguation confidence
		public int compareTo(CachedSense sense) {
			return -1 * Double.valueOf(disambigConfidence).compareTo(Double.valueOf(sense.disambigConfidence)) ;
		}
	}
}
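/*
 * Minimal usage sketch (added for illustration; not part of the original class).
 * It assumes a fully initialized Wikipedia and a trained Disambiguator, whose
 * construction is deployment-specific and is therefore left to the caller. The
 * example class and method names are hypothetical; getTitle() and getId() are
 * inherited from Page elsewhere in the library.
 */
class TopicDetectorExample {

	static void printTopics(Wikipedia wikipedia, Disambiguator disambiguator, String plainText) throws Exception {

		TopicDetector detector = new TopicDetector(wikipedia, disambiguator) ;

		// STRICT keeps only the best sense of each ambiguous phrase
		detector.setDisambiguationPolicy(TopicDetector.DisambiguationPolicy.STRICT) ;

		// passing null lets getTopics create its own RelatednessCache
		for (Topic topic : detector.getTopics(plainText, null))
			System.out.println(topic.getTitle() + " (" + topic.getId() + ")") ;
	}
}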