Wikipedia.java example

Explorer

wikipediaminer-master
- src
  - main
    - java
      - org
        wikipedia_miner
        wikipedia_miner
        App.java
  - test
    - java
      - org
        wikipedia_miner
        wikipedia_miner
        AppTest.java
- wikipedia-miner-core
  - src
    - main
      - java
        org
        wikipedia
        miner
        annotation
        ArticleCleaner.java
        Context.java
        Disambiguator.java
        Topic.java
        TopicDetector.java
        TopicReference.java
        preprocessing
        DocumentPreprocessor.java
        HtmlPreprocessor.java
        PreprocessedDocument.java
        WikiPreprocessor.java
        tagging
        DocumentTagger.java
        HtmlTagger.java
        WikiTagger.java
        weighting
        LinkDetector.java
        SimpleDocumentIndexer.java
        TopicIndexer.java
        TopicWeighter.java
        comparison
        ArticleComparer.java
        ArticleComparison.java
        ComparisonDataSet.java
        ConnectionSnippet.java
        ConnectionSnippetWeighter.java
        LabelComparer.java
        db
        IntObjectDatabase.java
        LabelDatabase.java
        MarkupDatabase.java
        PageLinkCountDatabase.java
        RecordBinding.java
        TitleDatabase.java
        WDatabase.java
        WDatabaseFactory.java
        WEntry.java
        WEnvironment.java
        WIterator.java
        struct
        DbIntList.java
        DbIntPair.java
        DbLabel.java
        DbLabelForPage.java
        DbLabelForPageList.java
        DbLinkLocation.java
        DbLinkLocationList.java
        DbPage.java
        DbPageLinkCounts.java
        DbSenseForLabel.java
        DbSentenceSplitList.java
        DbTranslations.java
        model
        Article.java
        Category.java
        Disambiguation.java
        Label.java
        Page.java
        Redirect.java
        Template.java
        Wikipedia.java
        util
        ArticleSet.java
        ArticleSetBuilder.java
        CorrelationCalculator.java
        EmphasisResolver.java
        EnvironmentBuilder.java
        LabelIterator.java
        MarkupStripper.java
        NGrammer.java
        PageIterator.java
        Position.java
        ProgressTracker.java
        RelatednessCache.java
        Result.java
        TopicIndexingSet.java
        WikipediaConfiguration.java
        text
        CaseAccentSimpleTextProcessor.java
        CaseFolder.java
        Cleaner.java
        PlingStemmer.java
        PorterStemmer.java
        SimpleStemmer.java
        StopwordRemover.java
        TextProcessor.java
        yagoUtils
        FinalMap.java
        FinalSet.java
    - test
      - java
        org
        wikipedia
        miner
        core
        AppTest.java
- wikipedia-miner-examples
  - src
    - main
      - java
        org
        wikipedia
        miner
        examples
        AnnotationWorkbench.java
        ComparisonWorkbench.java
        WikipediaDefiner.java
    - test
      - java
        org
        wikipedia
        miner
        examples
        AppTest.java
- wikipedia-miner-extract
  - src
    - main
      - java
        org
        wikipedia
        miner
        extract
        DumpExtractor.java
        model
        DumpLink.java
        DumpLinkParser.java
        DumpPage.java
        DumpPageParser.java
        struct
        LabelOccurrences.java
        LabelSense.java
        LabelSenseList.java
        LabelSummary.java
        LinkSummary.java
        PageDepthSummary.java
        PageDetail.java
        PageKey.java
        PageSummary.java
        PrimaryLabels.java
        steps
        IterativeStep.java
        LocalStep.java
        Step.java
        finalSummary
        FinalSummaryStep.java
        labelOccurrences
        CombinerOrReducer.java
        LabelCache.java
        LabelOccurrenceStep.java
        Mapper.java
        labelSenses
        CombinerOrReducer.java
        LabelSensesStep.java
        Mapper.java
        pageDepth
        DepthCombinerOrReducer.java
        InitialDepthMapper.java
        PageDepthStep.java
        SubsequentDepthMapper.java
        pageSummary
        CombinerOrReducer.java
        InitialMapper.java
        PageSummaryStep.java
        SubsequentMapper.java
        primaryLabel
        PrimaryLabelStep.java
        sortedPages
        PageSortingStep.java
        util
        Languages.java
        PageSentenceExtractor.java
        SiteInfo.java
        UncompletedStepException.java
        Util.java
        XmlInputFormat.java
    - test
      - java
        org
        wikipedia
        miner
        extract
        LinkMarkupHandling.java
        MarkupTestCase.java
        TestMarkupHandling.java
- wikipedia-miner-web
  - src
    - main
      - java
        org
        wikipedia
        miner
        web
        service
        CompareService.java
        CorrectService.java
        ExploreArticleService.java
        ExploreCategoryService.java
        ListWikipediasService.java
        ProgressService.java
        SearchService.java
        StatsService.java
        SuggestService.java
        WMHub.java
        WMService.java
        WikifyService.java
        util
        CharsetFilter.java
        HubConfiguration.java
        ImageRetriever.java
        MarkupFormatter.java
        UtilityMessages.java
        WebContentRetriever.java
        xjsfParameters
        StringListParameter.java

/*
 *    Wikipedia.java
 *    Copyright (C) 2007 David Milne, d.n.milne@gmail.com
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

package org.wikipedia.miner.model;

import java.io.File;
import java.io.IOException;

import javax.xml.parsers.ParserConfigurationException;


import org.apache.log4j.Logger;
import org.wikipedia.miner.db.WEnvironment;
import org.wikipedia.miner.db.WIterator;
import org.wikipedia.miner.db.WEnvironment.StatisticName;
import org.wikipedia.miner.db.struct.DbLabel;
import org.wikipedia.miner.model.Page.PageType;
import org.wikipedia.miner.util.LabelIterator;
import org.wikipedia.miner.util.NGrammer.CaseContext;
import org.wikipedia.miner.util.NGrammer.NGramSpan;
import org.wikipedia.miner.util.PageIterator;
import org.wikipedia.miner.util.ProgressTracker;
import org.wikipedia.miner.util.WikipediaConfiguration;
import org.wikipedia.miner.util.text.TextProcessor;
import org.xml.sax.SAXException;

import com.sleepycat.je.EnvironmentLockedException;


/**
 * Represents a single dump or instance of Wikipedia
 */
public class Wikipedia {

	private WEnvironment env ;

	/**
	 * Initialises a newly created Wikipedia according to the given configuration. 
	 * 
	 * This can be a time consuming process if the given configuration specifies databases that need to be cached to memory.
	 * 
	 * This preparation can be done in a separate thread if required, in which case progress can be tracked using {@link #getProgress()}, {@link #getPreparationTracker()} and {@link #isReady()}.
	 *  
	 * @param conf a configuration that describes where the databases are located, etc. 
	 * @param threadedPreparation true if preparation (connecting to databases, caching data to memory) should be done in a separate thread, otherwise false
	 * @throws EnvironmentLockedException if the underlying database environment is unavailable.
	 */
	public Wikipedia(WikipediaConfiguration conf, boolean threadedPreparation) throws EnvironmentLockedException{
		this.env = new WEnvironment(conf, threadedPreparation) ; 
	}

	/**
	 * Initialises a newly created Wikipedia according to the given configuration file. 
	 * 
	 * This can be a time consuming process if the given configuration specifies databases that need to be cached to memory.
	 * 
	 * This preparation can be done in a separate thread if required, in which case progress can be tracked using {@link #getProgress()}, {@link #getPreparationTracker()} and {@link #isReady()}.
	 *  
	 * @param confFile an xml file that describes where the databases are located, etc. 
	 * @param threadedPreparation true if preparation (connecting to databases, caching data to memory) should be done in a separate thread, otherwise false
	 * @throws EnvironmentLockedException if the underlying database environment is unavailable.
	 */
	public Wikipedia(File confFile, boolean threadedPreparation) throws EnvironmentLockedException, ParserConfigurationException, SAXException, IOException, ClassNotFoundException, InstantiationException, IllegalAccessException{
		WikipediaConfiguration conf = new WikipediaConfiguration(confFile) ;
		this.env = new WEnvironment(conf, threadedPreparation) ; 
	}


	/**
	 * Returns the environment that this is connected to
	 * 
	 * @return the environment that this is connected to
	 */
	public WEnvironment getEnvironment() {
		return env ;
	}

	/**
	 * Returns the configuration of this wikipedia dump
	 * 
	 * @return the configuration of this wikipedia dump
	 */
	public WikipediaConfiguration getConfig() {
		return env.getConfiguration() ;
	}

	/**
	 * Returns true if the preparation work has been completed, otherwise false
	 * 
	 * @return true if the preparation work has been completed, otherwise false
	 */
	public boolean isReady() {
		return env.isReady() ;

	}

	/**
	 * Returns a number between 0 (just started) and 1 (completed) indicating progress of the preparation work.
	 * 
	 * @return a number between 0 (just started) and 1 (completed) indicating progress of the preparation work. 
	 */
	public double getProgress() {
		return env.getProgress() ;
	}

	/**
	 * Returns a tracker for progress of the preparation work. 
	 * 
	 * @return a tracker for progress of the preparation work. 
	 */
	public ProgressTracker getPreparationTracker() {
		return env.getPreparationTracker() ;
	}

	/**
	 * Returns the root Category from which all other categories can be browsed.
	 * 
	 * @return the root category
	 */
	public Category getRootCategory() {

		return new Category(env, env.retrieveStatistic(StatisticName.rootCategoryId).intValue()) ;
	}

	/**
	 * Returns the Page referenced by the given id. The page can be cast into the appropriate type for 
	 * more specific functionality. 
	 *  
	 * @param id	the id of the Page to retrieve.
	 * @return the Page referenced by the given id, or null if one does not exist. 
	 */
	public Page getPageById(int id) {
		return Page.createPage(env, id) ;
	}

	/**
	 * Returns the Article referenced by the given (case sensitive) title. If the title
	 * matches a redirect, this will be resolved to return the redirect's target.
	 * <p>
	 * The given title must be matched exactly to return an article. If you want some more lee-way,
	 * use getMostLikelyArticle() instead. 
	 *  
	 * @param title	the title of an Article (or its redirect).
	 * @return the Article referenced by the given title, or null if one does not exist
	 */
	public Article getArticleByTitle(String title) {

		if (title == null || title.length() == 0)
			return null ;

		title = title.substring(0,1).toUpperCase() + title.substring(1) ;

		Integer id = env.getDbArticlesByTitle().retrieve(title) ;

		if (id == null)
			return null ;

		Page page = Page.createPage(env, id) ;
		if (!page.exists())
			return null ;

		if (page.getType() == PageType.redirect)
			return ((Redirect)page).getTarget() ;
		else
			return (Article)page ;
	}

	/**
	 * Returns the Category referenced by the given (case sensitive) title. 
	 * 
	 * The given title must be matched exactly to return a Category. 
	 *  
	 * @param title	the title of an Article (or it's redirect).
	 * @return the Article referenced by the given title, or null if one does not exist
	 */
	public Category getCategoryByTitle(String title) {

		title = title.substring(0,1).toUpperCase() + title.substring(1) ;

		Integer id = env.getDbCategoriesByTitle().retrieve(title) ;

		if (id == null)
			return null ;

		Page page = Page.createPage(env, id) ;

		if (page.getType() == PageType.category)
			return (Category) page ;
		else
			return null ;
	}

	/**
	 * Returns the Template referenced by the given (case sensitive) title. 
	 * 
	 * The given title must be matched exactly to return a Template. 
	 *  
	 * @param title the title of a Template.
	 * @return the Template referenced by the given title, or null if one does not exist
	 */
	public Template getTemplateByTitle(String title) {

		title = title.substring(0,1).toUpperCase() + title.substring(1) ;

		Integer id = env.getDbTemplatesByTitle().retrieve(title) ;

		if (id == null)
			return null ;

		Page page = Page.createPage(env, id) ;

		if (page.getType() == PageType.template)
			return (Template) page ;
		else
			return null ;
	}



	/**
	 * Returns the most likely article for a given term. For example, searching for "tree" will return
	 * the article "30579: Tree", rather than "30806: Tree (data structure)" or "7770: Christmas tree"
	 * This is defined by the number of times the term is used as an anchor for links to each of these 
	 * destinations. 
	 *  <p>
	 * An optional text processor (may be null) can be used to alter the way labels are 
	 * retrieved (e.g. via stemming or case folding) 
	 * 
	 * @param term	the term to obtain articles for
	 * @param tp	an optional TextProcessor to modify how the term is searched for. 
	 * 
	 * @return the most likely sense of the given term.
	 * 
	 * for the given text processor.
	 */
	public Article getMostLikelyArticle(String term, TextProcessor tp){

		Label label = new Label(env, term, tp) ;

		if (!label.exists()) 
			return null ;

		return label.getSenses()[0] ;
	}

	/**
	 * A convenience method for quickly finding out if the given text is ever used as a label
	 * in Wikipedia. If this returns false, then all of the getArticle methods will return null or empty sets. 
	 * 
	 * @param text the text to search for
	 * @param tp an optional TextProcessor (may be null)
	 * @return true if there is an anchor corresponding to the given text, otherwise false
	 */
	public boolean isLabel(String text, TextProcessor tp)  {
		DbLabel lbl = env.getDbLabel(tp).retrieve(text) ; 

		return lbl != null ;
	}
	
	public Label getLabel(NGramSpan span, String sourceText) {
		
		//System.out.println("context: " + span.getCaseContext()) ;
		
		String ngram = span.getNgram(sourceText) ;
		
		Label bestLabel = getLabel(ngram) ;
		
		//don't bother trying out different casing variations if we are using casefolder as text processor
		//TextProcessor tp = getConfig().getDefaultTextProcessor() ;
		//if (tp != null && (tp.class. == TextProcessor.class)
		///	return bestLabel ;
		
		//if this starts with capital letter and is at start of sentence, try lower-casing that first letter.
		if (span.getCaseContext() == CaseContext.mixed && span.isSentenceStart() && Character.isUpperCase(ngram.charAt(0))) {
			//System.out.println("trying lower first letter first token") ;
			char tmpNgram[] = ngram.toCharArray() ;
			tmpNgram[0] = Character.toLowerCase(tmpNgram[0]) ;
			
			Label label = getLabel(new String(tmpNgram)) ;
			
			//System.out.println(label.getText()) ;
			
			if (label.exists() && (!bestLabel.exists() || label.getLinkOccCount() > bestLabel.getLinkOccCount())) {
				bestLabel = label ;
				//System.out.println("using lower first letter first token") ;
			}
		}
			
		
		
		//if surrounding text is all lower case or ALL UPPER CASE, try with First Letter Of Each Token Uppercased.
		if (span.getCaseContext() == CaseContext.lower || span.getCaseContext() == CaseContext.upper) {
			//System.out.println("trying upper first letter all tokens") ;
			Label label = getLabel(span.getNgramUpperFirst(sourceText)) ;
			
			if (label.exists() && (!bestLabel.exists() || label.getLinkOccCount() > bestLabel.getLinkOccCount())) {
				bestLabel = label ;
				//System.out.println("using upper first letter all tokens") ;
			}
		}
		
		//if surrounding text is ALL UPPER CASE or Has First Letter Of Each Token Uppercased, try with all lower case 
		if (span.getCaseContext() == CaseContext.upperFirst || span.getCaseContext() == CaseContext.upper) {
			//System.out.println("trying lower") ;
			Label label = getLabel(ngram.toLowerCase()) ;
			
			if (label.exists() && (!bestLabel.exists() || label.getLinkOccCount() > bestLabel.getLinkOccCount())) {
				bestLabel = label ;
				//System.out.println("using lower") ;
			}
		}
		
		//if surrounding text is all lower case, try with ALL UPPER CASE
		if (span.getCaseContext() == CaseContext.lower) {
			//System.out.println("trying upper") ;
			Label label = getLabel(ngram.toUpperCase()) ;
			
			if (label.exists() && (!bestLabel.exists() || label.getLinkOccCount() > bestLabel.getLinkOccCount())) {
				//System.out.println("using upper") ;
				bestLabel = label ;
			}
		}
		
		return bestLabel ;
	}

	public Label getLabel(String text)  {
		return new Label(env, text) ;
	}

	public Label getLabel(String text, TextProcessor tp)  {

		return new Label(env, text, tp) ;
	}


	/**
	 * Returns an iterator for all pages in the database, in order of ascending ids.
	 * 
	 * @return an iterator for all pages in the database, in order of ascending ids.
	 */
	public PageIterator getPageIterator() {
		return new PageIterator(env) ;
	}

	/**
	 * Returns an iterator for all pages in the database of the given type, in order of ascending ids.
	 * 
	 * @param type the type of page of interest
	 * @return an iterator for all pages in the database of the given type, in order of ascending ids.
	 */
	public PageIterator getPageIterator(PageType type) {
		return new PageIterator(env, type) ;		
	}

	/**
	 * Returns an iterator for all labels in the database, processed according to the given text processor (may be null), in alphabetical order.
	 * 
	 * @param tp the text processor
	 * @return an iterator for all labels in the database, processed according to the given text processor (may be null), in alphabetical order.
	 */
	public LabelIterator getLabelIterator(TextProcessor tp) {
		return new LabelIterator(env, tp) ;
	}

	/**
	 * Tidily closes the database environment behind this wikipedia instance. This should be done whenever
	 * one is finished using it. 
	 */
	public void close() {
		env.close();
		this.env = null ;
	}

	@Override
	public void finalize() {
		if (this.env != null)
			Logger.getLogger(WIterator.class).warn("Unclosed wikipedia. You may be causing a memory leak.") ;
	}
}