/*
* Wikipedia.java
* Copyright (C) 2007 David Milne, d.n.milne@gmail.com
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
package org.wikipedia.miner.model;
import java.io.File;
import java.io.IOException;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.log4j.Logger;
import org.wikipedia.miner.db.WEnvironment;
import org.wikipedia.miner.db.WIterator;
import org.wikipedia.miner.db.WEnvironment.StatisticName;
import org.wikipedia.miner.db.struct.DbLabel;
import org.wikipedia.miner.model.Page.PageType;
import org.wikipedia.miner.util.LabelIterator;
import org.wikipedia.miner.util.NGrammer.CaseContext;
import org.wikipedia.miner.util.NGrammer.NGramSpan;
import org.wikipedia.miner.util.PageIterator;
import org.wikipedia.miner.util.ProgressTracker;
import org.wikipedia.miner.util.WikipediaConfiguration;
import org.wikipedia.miner.util.text.TextProcessor;
import org.xml.sax.SAXException;
import com.sleepycat.je.EnvironmentLockedException;
/**
* Represents a single dump or instance of Wikipedia
*/
public class Wikipedia {
private WEnvironment env ;
/**
* Initialises a newly created Wikipedia according to the given configuration.
*
* This can be a time consuming process if the given configuration specifies databases that need to be cached to memory.
*
* This preparation can be done in a separate thread if required, in which case progress can be tracked using {@link #getProgress()}, {@link #getPreparationTracker()} and {@link #isReady()}.
*
* @param conf a configuration that describes where the databases are located, etc.
* @param threadedPreparation true if preparation (connecting to databases, caching data to memory) should be done in a separate thread, otherwise false
* @throws EnvironmentLockedException if the underlying database environment is unavailable.
*/
public Wikipedia(WikipediaConfiguration conf, boolean threadedPreparation) throws EnvironmentLockedException{
this.env = new WEnvironment(conf, threadedPreparation) ;
}
/**
* Initialises a newly created Wikipedia according to the given configuration file.
*
* This can be a time consuming process if the given configuration specifies databases that need to be cached to memory.
*
* This preparation can be done in a separate thread if required, in which case progress can be tracked using {@link #getProgress()}, {@link #getPreparationTracker()} and {@link #isReady()}.
*
* @param confFile an xml file that describes where the databases are located, etc.
* @param threadedPreparation true if preparation (connecting to databases, caching data to memory) should be done in a separate thread, otherwise false
* @throws EnvironmentLockedException if the underlying database environment is unavailable.
*/
public Wikipedia(File confFile, boolean threadedPreparation) throws EnvironmentLockedException, ParserConfigurationException, SAXException, IOException, ClassNotFoundException, InstantiationException, IllegalAccessException{
WikipediaConfiguration conf = new WikipediaConfiguration(confFile) ;
this.env = new WEnvironment(conf, threadedPreparation) ;
}
/**
* Returns the environment that this is connected to
*
* @return the environment that this is connected to
*/
public WEnvironment getEnvironment() {
return env ;
}
/**
* Returns the configuration of this wikipedia dump
*
* @return the configuration of this wikipedia dump
*/
public WikipediaConfiguration getConfig() {
return env.getConfiguration() ;
}
/**
* Returns true if the preparation work has been completed, otherwise false
*
* @return true if the preparation work has been completed, otherwise false
*/
public boolean isReady() {
return env.isReady() ;
}
/**
* Returns a number between 0 (just started) and 1 (completed) indicating progress of the preparation work.
*
* @return a number between 0 (just started) and 1 (completed) indicating progress of the preparation work.
*/
public double getProgress() {
return env.getProgress() ;
}
/**
* Returns a tracker for progress of the preparation work.
*
* @return a tracker for progress of the preparation work.
*/
public ProgressTracker getPreparationTracker() {
return env.getPreparationTracker() ;
}
/**
* Returns the root Category from which all other categories can be browsed.
*
* @return the root category
*/
public Category getRootCategory() {
return new Category(env, env.retrieveStatistic(StatisticName.rootCategoryId).intValue()) ;
}
/**
* Returns the Page referenced by the given id. The page can be cast into the appropriate type for
* more specific functionality.
*
* @param id the id of the Page to retrieve.
* @return the Page referenced by the given id, or null if one does not exist.
*/
public Page getPageById(int id) {
return Page.createPage(env, id) ;
}
/**
* Returns the Article referenced by the given (case sensitive) title. If the title
* matches a redirect, this will be resolved to return the redirect's target.
* <p>
* The given title must be matched exactly to return an article. If you want some more lee-way,
* use getMostLikelyArticle() instead.
*
* @param title the title of an Article (or its redirect).
* @return the Article referenced by the given title, or null if one does not exist
*/
public Article getArticleByTitle(String title) {
if (title == null || title.length() == 0)
return null ;
title = title.substring(0,1).toUpperCase() + title.substring(1) ;
Integer id = env.getDbArticlesByTitle().retrieve(title) ;
if (id == null)
return null ;
Page page = Page.createPage(env, id) ;
if (!page.exists())
return null ;
if (page.getType() == PageType.redirect)
return ((Redirect)page).getTarget() ;
else
return (Article)page ;
}
/**
* Returns the Category referenced by the given (case sensitive) title.
*
* The given title must be matched exactly to return a Category.
*
* @param title the title of an Article (or it's redirect).
* @return the Article referenced by the given title, or null if one does not exist
*/
public Category getCategoryByTitle(String title) {
title = title.substring(0,1).toUpperCase() + title.substring(1) ;
Integer id = env.getDbCategoriesByTitle().retrieve(title) ;
if (id == null)
return null ;
Page page = Page.createPage(env, id) ;
if (page.getType() == PageType.category)
return (Category) page ;
else
return null ;
}
/**
* Returns the Template referenced by the given (case sensitive) title.
*
* The given title must be matched exactly to return a Template.
*
* @param title the title of a Template.
* @return the Template referenced by the given title, or null if one does not exist
*/
public Template getTemplateByTitle(String title) {
title = title.substring(0,1).toUpperCase() + title.substring(1) ;
Integer id = env.getDbTemplatesByTitle().retrieve(title) ;
if (id == null)
return null ;
Page page = Page.createPage(env, id) ;
if (page.getType() == PageType.template)
return (Template) page ;
else
return null ;
}
/**
* Returns the most likely article for a given term. For example, searching for "tree" will return
* the article "30579: Tree", rather than "30806: Tree (data structure)" or "7770: Christmas tree"
* This is defined by the number of times the term is used as an anchor for links to each of these
* destinations.
* <p>
* An optional text processor (may be null) can be used to alter the way labels are
* retrieved (e.g. via stemming or case folding)
*
* @param term the term to obtain articles for
* @param tp an optional TextProcessor to modify how the term is searched for.
*
* @return the most likely sense of the given term.
*
* for the given text processor.
*/
public Article getMostLikelyArticle(String term, TextProcessor tp){
Label label = new Label(env, term, tp) ;
if (!label.exists())
return null ;
return label.getSenses()[0] ;
}
/**
* A convenience method for quickly finding out if the given text is ever used as a label
* in Wikipedia. If this returns false, then all of the getArticle methods will return null or empty sets.
*
* @param text the text to search for
* @param tp an optional TextProcessor (may be null)
* @return true if there is an anchor corresponding to the given text, otherwise false
*/
public boolean isLabel(String text, TextProcessor tp) {
DbLabel lbl = env.getDbLabel(tp).retrieve(text) ;
return lbl != null ;
}
public Label getLabel(NGramSpan span, String sourceText) {
//System.out.println("context: " + span.getCaseContext()) ;
String ngram = span.getNgram(sourceText) ;
Label bestLabel = getLabel(ngram) ;
//don't bother trying out different casing variations if we are using casefolder as text processor
//TextProcessor tp = getConfig().getDefaultTextProcessor() ;
//if (tp != null && (tp.class. == TextProcessor.class)
/// return bestLabel ;
//if this starts with capital letter and is at start of sentence, try lower-casing that first letter.
if (span.getCaseContext() == CaseContext.mixed && span.isSentenceStart() && Character.isUpperCase(ngram.charAt(0))) {
//System.out.println("trying lower first letter first token") ;
char tmpNgram[] = ngram.toCharArray() ;
tmpNgram[0] = Character.toLowerCase(tmpNgram[0]) ;
Label label = getLabel(new String(tmpNgram)) ;
//System.out.println(label.getText()) ;
if (label.exists() && (!bestLabel.exists() || label.getLinkOccCount() > bestLabel.getLinkOccCount())) {
bestLabel = label ;
//System.out.println("using lower first letter first token") ;
}
}
//if surrounding text is all lower case or ALL UPPER CASE, try with First Letter Of Each Token Uppercased.
if (span.getCaseContext() == CaseContext.lower || span.getCaseContext() == CaseContext.upper) {
//System.out.println("trying upper first letter all tokens") ;
Label label = getLabel(span.getNgramUpperFirst(sourceText)) ;
if (label.exists() && (!bestLabel.exists() || label.getLinkOccCount() > bestLabel.getLinkOccCount())) {
bestLabel = label ;
//System.out.println("using upper first letter all tokens") ;
}
}
//if surrounding text is ALL UPPER CASE or Has First Letter Of Each Token Uppercased, try with all lower case
if (span.getCaseContext() == CaseContext.upperFirst || span.getCaseContext() == CaseContext.upper) {
//System.out.println("trying lower") ;
Label label = getLabel(ngram.toLowerCase()) ;
if (label.exists() && (!bestLabel.exists() || label.getLinkOccCount() > bestLabel.getLinkOccCount())) {
bestLabel = label ;
//System.out.println("using lower") ;
}
}
//if surrounding text is all lower case, try with ALL UPPER CASE
if (span.getCaseContext() == CaseContext.lower) {
//System.out.println("trying upper") ;
Label label = getLabel(ngram.toUpperCase()) ;
if (label.exists() && (!bestLabel.exists() || label.getLinkOccCount() > bestLabel.getLinkOccCount())) {
//System.out.println("using upper") ;
bestLabel = label ;
}
}
return bestLabel ;
}
public Label getLabel(String text) {
return new Label(env, text) ;
}
public Label getLabel(String text, TextProcessor tp) {
return new Label(env, text, tp) ;
}
/**
* Returns an iterator for all pages in the database, in order of ascending ids.
*
* @return an iterator for all pages in the database, in order of ascending ids.
*/
public PageIterator getPageIterator() {
return new PageIterator(env) ;
}
/**
* Returns an iterator for all pages in the database of the given type, in order of ascending ids.
*
* @param type the type of page of interest
* @return an iterator for all pages in the database of the given type, in order of ascending ids.
*/
public PageIterator getPageIterator(PageType type) {
return new PageIterator(env, type) ;
}
/**
* Returns an iterator for all labels in the database, processed according to the given text processor (may be null), in alphabetical order.
*
* @param tp the text processor
* @return an iterator for all labels in the database, processed according to the given text processor (may be null), in alphabetical order.
*/
public LabelIterator getLabelIterator(TextProcessor tp) {
return new LabelIterator(env, tp) ;
}
/**
* Tidily closes the database environment behind this wikipedia instance. This should be done whenever
* one is finished using it.
*/
public void close() {
env.close();
this.env = null ;
}
@Override
public void finalize() {
if (this.env != null)
Logger.getLogger(WIterator.class).warn("Unclosed wikipedia. You may be causing a memory leak.") ;
}
}