package org.wikipedia.miner.db;
import gnu.trove.set.hash.TIntHashSet;
import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.util.HashMap;
import javax.xml.stream.XMLStreamException;
import com.sleepycat.je.*;
import org.apache.log4j.Logger;
import org.wikipedia.miner.db.WDatabase.DatabaseType;
import org.wikipedia.miner.db.struct.*;
import org.wikipedia.miner.model.Wikipedia;
import org.wikipedia.miner.util.ProgressTracker;
import org.wikipedia.miner.util.WikipediaConfiguration;
import org.wikipedia.miner.util.text.TextProcessor;
/**
* A wrapper for {@link Environment}, that keeps track of all of the databases required for a single dump of Wikipedia.
*
* It is unlikely that you will want to work with this class directly: use {@link Wikipedia} instead.
*/
public class WEnvironment {
/**
* Statistics available about a wikipedia dump
*/
public enum StatisticName
{
/**
* The number of articles (not disambiguations or redirects) available
*/
articleCount,
/**
* The number of categories available
*/
categoryCount,
/**
* The number of disambiguation pages available
*/
disambiguationCount,
/**
* The number of redirects available
*/
redirectCount,
/**
* A long value representation of the date and time this dump was last edited -- use new Date(long) to get to parse
*/
lastEdit,
/**
* The maximum path length between articles and the root category
*/
maxCategoryDepth,
/**
* The id of root category, below which all articles should be organized
*/
rootCategoryId
}
private WikipediaConfiguration conf ;
private Environment env ;
private PreparationThread prepThread ;
private WDatabase<Integer, DbPage> dbPage ;
private LabelDatabase dbLabel ;
private HashMap<String, LabelDatabase> processedLabelDbs ;
private WDatabase<Integer, DbLabelForPageList> dbLabelsForPage ;
private WDatabase<String,Integer> dbArticlesByTitle ;
private WDatabase<String,Integer> dbCategoriesByTitle ;
private WDatabase<String,Integer> dbTemplatesByTitle ;
private WDatabase<Integer,Integer> dbRedirectTargetBySource ;
private WDatabase<Integer,DbIntList> dbRedirectSourcesByTarget ;
private WDatabase<Integer, DbLinkLocationList> dbPageLinkIn ;
private WDatabase<Integer, DbIntList> dbPageLinkInNoSentences ;
private WDatabase<Integer, DbLinkLocationList> dbPageLinkOut ;
private WDatabase<Integer, DbIntList> dbPageLinkOutNoSentences ;
private PageLinkCountDatabase dbPageLinkCounts ;
private WDatabase<Integer, DbIntList> dbCategoryParents ;
private WDatabase<Integer, DbIntList> dbArticleParents ;
private WDatabase<Integer, DbIntList> dbChildCategories ;
private WDatabase<Integer, DbIntList> dbChildArticles ;
private MarkupDatabase dbMarkup ;
private WDatabase<Integer, DbIntList> dbSentenceSplits ;
private WDatabase<Integer, DbTranslations> dbTranslations ;
private WDatabase<Integer, Long> dbStatistics ;
@SuppressWarnings("unchecked")
private HashMap<DatabaseType, WDatabase> databasesByType ;
/**
* Returns the configuration of this environment
*
* @return the configuration of this environment
*/
public WikipediaConfiguration getConfiguration() {
return conf ;
}
/**
* Returns the {@link DatabaseType#page} database
*
* @return see {@link DatabaseType#page}
*/
public WDatabase<Integer, DbPage> getDbPage() {
return dbPage;
}
/**
* Returns the {@link DatabaseType#label} database for the given text processor
*
* @param textProcessor the text processor that should be applied to labels before indexing or searching (or null if the original label database is required)
* @return see {@link DatabaseType#label}
*/
public LabelDatabase getDbLabel(TextProcessor textProcessor) {
if (textProcessor == null)
return dbLabel;
else {
LabelDatabase db = processedLabelDbs.get(textProcessor.getName()) ;
if (db == null) {
db = new LabelDatabase(this, textProcessor) ;
processedLabelDbs.put(textProcessor.getName(), db) ;
}
return db ;
}
}
/**
* Returns the {@link DatabaseType#pageLabel} database
*
* @return see {@link DatabaseType#pageLabel}
*/
public WDatabase<Integer, DbLabelForPageList> getDbLabelsForPage() {
return dbLabelsForPage ;
}
/**
* Returns the {@link DatabaseType#articlesByTitle} database
*
* @return see {@link DatabaseType#articlesByTitle}
*/
public WDatabase<String, Integer> getDbArticlesByTitle() {
return dbArticlesByTitle ;
}
/**
* Returns the {@link DatabaseType#categoriesByTitle} database
*
* @return see {@link DatabaseType#categoriesByTitle}
*/
public WDatabase<String, Integer> getDbCategoriesByTitle() {
return dbCategoriesByTitle ;
}
/**
* Returns the {@link DatabaseType#templatesByTitle} database
*
* @return see {@link DatabaseType#templatesByTitle}
*/
public WDatabase<String, Integer> getDbTemplatesByTitle() {
return dbTemplatesByTitle ;
}
/**
* Returns the {@link DatabaseType#redirectTargetBySource} database
*
* @return see {@link DatabaseType#redirectTargetBySource}
*/
public WDatabase<Integer, Integer> getDbRedirectTargetBySource() {
return dbRedirectTargetBySource ;
}
/**
* Returns the {@link DatabaseType#redirectSourcesByTarget} database
*
* @return see {@link DatabaseType#redirectSourcesByTarget}
*/
public WDatabase<Integer, DbIntList> getDbRedirectSourcesByTarget() {
return dbRedirectSourcesByTarget ;
}
/**
* Returns the {@link DatabaseType#pageLinksIn} database
*
* @return see {@link DatabaseType#pageLinksIn}
*/
public WDatabase<Integer, DbLinkLocationList> getDbPageLinkIn() {
return dbPageLinkIn;
}
/**
* Returns the {@link DatabaseType#pageLinksInNoSentences} database
*
* @return see {@link DatabaseType#pageLinksInNoSentences}
*/
public WDatabase<Integer, DbIntList> getDbPageLinkInNoSentences() {
return dbPageLinkInNoSentences;
}
/**
* Returns the {@link DatabaseType#pageLinksOut} database
*
* @return see {@link DatabaseType#pageLinksOut}
*/
public WDatabase<Integer, DbLinkLocationList> getDbPageLinkOut() {
return dbPageLinkOut;
}
/**
* Returns the {@link DatabaseType#pageLinksOutNoSentences} database
*
* @return see {@link DatabaseType#pageLinksOutNoSentences}
*/
public WDatabase<Integer, DbIntList> getDbPageLinkOutNoSentences() {
return dbPageLinkOutNoSentences;
}
/**
* Returns the {@link DatabaseType#pageLinkCounts} database
*
* @return see {@link DatabaseType#pageLinkCounts}
*/
public WDatabase<Integer, DbPageLinkCounts> getDbPageLinkCounts() {
return dbPageLinkCounts;
}
/**
* Returns the {@link DatabaseType#categoryParents} database
*
* @return see {@link DatabaseType#categoryParents}
*/
public WDatabase<Integer, DbIntList> getDbCategoryParents() {
return dbCategoryParents;
}
/**
* Returns the {@link DatabaseType#articleParents} database
*
* @return see {@link DatabaseType#articleParents}
*/
public WDatabase<Integer, DbIntList> getDbArticleParents() {
return dbArticleParents;
}
/**
* Returns the {@link DatabaseType#childCategories} database
*
* @return see {@link DatabaseType#childCategories}
*/
public WDatabase<Integer, DbIntList> getDbChildCategories() {
return dbChildCategories;
}
/**
* Returns the {@link DatabaseType#childArticles} database
*
* @return see {@link DatabaseType#childArticles}
*/
public WDatabase<Integer, DbIntList> getDbChildArticles() {
return dbChildArticles;
}
/**
* Returns the {@link DatabaseType#markup} database
*
* @return see {@link DatabaseType#markup}
*/
public MarkupDatabase getDbMarkup() {
return dbMarkup;
}
/**
* Returns the {@link DatabaseType#sentenceSplits} database
*
* @return see {@link DatabaseType#sentenceSplits}
*/
public WDatabase<Integer, DbIntList> getDbSentenceSplits() {
return dbSentenceSplits;
}
/**
* Returns the {@link DatabaseType#translations} database
*
* @return see {@link DatabaseType#translations}
*/
public WDatabase<Integer, DbTranslations> getDbTranslations() {
return dbTranslations ;
}
/**
* Intitializes the environment defined in the given configuration, and immediately begins connecting to databases and caching them to memory.
*
* This preparation can be done in a separate thread if required, in which case progress can be tracked using {@link #getProgress()}, {@link #getPreparationTracker()} and {@link #isReady()}.
*
* @param conf configuration options
* @param threaded true if this should be prepared (e.g. cached to memory) in a separate thread, otherwise false
* @throws EnvironmentLockedException if the underlying {@link Environment} is unavailable
*/
public WEnvironment(WikipediaConfiguration conf, boolean threaded) throws EnvironmentLockedException {
this.conf = conf ;
EnvironmentConfig envConf = new EnvironmentConfig() ;
envConf.setAllowCreate(false) ;
envConf.setReadOnly(true) ;
envConf.setCachePercent(10) ;
env = new Environment(conf.getDatabaseDirectory(), envConf) ;
initDatabases() ;
prepThread = new PreparationThread(conf) ;
if (threaded)
prepThread.start() ;
else
prepThread.doPreparation() ;
}
private WEnvironment(WikipediaConfiguration conf) {
this.conf = conf ;
initDatabases() ;
EnvironmentConfig envConf = new EnvironmentConfig() ;
envConf.setCachePercent(10) ;
envConf.setAllowCreate(true) ;
envConf.setReadOnly(false) ;
env = new Environment(conf.getDatabaseDirectory(), envConf) ;
}
@SuppressWarnings("unchecked")
private void initDatabases() {
WDatabaseFactory dbFactory = new WDatabaseFactory(this) ;
databasesByType = new HashMap<DatabaseType, WDatabase>() ;
dbPage = dbFactory.buildPageDatabase() ;
databasesByType.put(DatabaseType.page, dbPage) ;
dbLabel = dbFactory.buildLabelDatabase() ;
databasesByType.put(DatabaseType.label, dbLabel) ;
processedLabelDbs = new HashMap<String, LabelDatabase>() ;
dbLabelsForPage = dbFactory.buildPageLabelDatabase() ;
databasesByType.put(DatabaseType.pageLabel, dbLabelsForPage) ;
dbArticlesByTitle = dbFactory.buildTitleDatabase(DatabaseType.articlesByTitle) ;
databasesByType.put(DatabaseType.articlesByTitle, dbArticlesByTitle) ;
dbCategoriesByTitle = dbFactory.buildTitleDatabase(DatabaseType.categoriesByTitle) ;
databasesByType.put(DatabaseType.categoriesByTitle, dbCategoriesByTitle) ;
dbTemplatesByTitle = dbFactory.buildTitleDatabase(DatabaseType.templatesByTitle) ;
databasesByType.put(DatabaseType.templatesByTitle, dbTemplatesByTitle) ;
dbPageLinkIn = dbFactory.buildPageLinkDatabase(DatabaseType.pageLinksIn) ;
databasesByType.put(DatabaseType.pageLinksIn, dbPageLinkIn) ;
dbPageLinkInNoSentences = dbFactory.buildPageLinkNoSentencesDatabase(DatabaseType.pageLinksInNoSentences) ;
databasesByType.put(DatabaseType.pageLinksInNoSentences, dbPageLinkInNoSentences) ;
dbPageLinkOut = dbFactory.buildPageLinkDatabase(DatabaseType.pageLinksOut) ;
databasesByType.put(DatabaseType.pageLinksOut, dbPageLinkOut) ;
dbPageLinkOutNoSentences = dbFactory.buildPageLinkNoSentencesDatabase(DatabaseType.pageLinksOutNoSentences) ;
databasesByType.put(DatabaseType.pageLinksOutNoSentences, dbPageLinkOutNoSentences) ;
dbPageLinkCounts = dbFactory.buildPageLinkCountDatabase() ;
databasesByType.put(DatabaseType.pageLinkCounts, dbPageLinkCounts) ;
dbCategoryParents = dbFactory.buildIntIntListDatabase(DatabaseType.categoryParents) ;
databasesByType.put(DatabaseType.categoryParents, dbCategoryParents) ;
dbArticleParents = dbFactory.buildIntIntListDatabase(DatabaseType.articleParents) ;
databasesByType.put(DatabaseType.articleParents, dbArticleParents) ;
dbChildCategories = dbFactory.buildIntIntListDatabase(DatabaseType.childCategories) ;
databasesByType.put(DatabaseType.childCategories, dbChildCategories) ;
dbChildArticles = dbFactory.buildIntIntListDatabase(DatabaseType.childArticles) ;
databasesByType.put(DatabaseType.childArticles, dbChildArticles) ;
dbRedirectSourcesByTarget = dbFactory.buildIntIntListDatabase(DatabaseType.redirectSourcesByTarget) ;
databasesByType.put(DatabaseType.redirectSourcesByTarget, dbRedirectSourcesByTarget) ;
dbRedirectTargetBySource = dbFactory.buildRedirectTargetBySourceDatabase() ;
databasesByType.put(DatabaseType.redirectTargetBySource, dbRedirectTargetBySource) ;
dbMarkup = new MarkupDatabase(this) ;
databasesByType.put(DatabaseType.markup, dbMarkup) ;
dbSentenceSplits = dbFactory.buildIntIntListDatabase(DatabaseType.sentenceSplits) ;
databasesByType.put(DatabaseType.sentenceSplits, dbSentenceSplits) ;
dbTranslations = dbFactory.buildTranslationsDatabase() ;
databasesByType.put(DatabaseType.translations, dbTranslations) ;
dbStatistics = dbFactory.buildStatisticsDatabase() ;
databasesByType.put(DatabaseType.statistics, dbStatistics) ;
}
/**
* @return true if the preparation work has been completed, otherwise false
*/
public boolean isReady() {
return prepThread.isCompleted() ;
}
/**
* @return a number between 0 (just started) and 1 (completed) indicating progress of the preparation work.
*/
public double getProgress() {
return prepThread.getProgress() ;
}
/**
* @return a tracker for progress of the preparation work.
*/
public ProgressTracker getPreparationTracker() {
return prepThread.getTracker() ;
}
/**
* @param sn the name of the desired statistic
* @return the value of the desired statistic
*/
public Long retrieveStatistic(StatisticName sn) {
return dbStatistics.retrieve(sn.ordinal()) ;
}
/**
* @param tp a text processor
* @return true if the environment is ready to be searched for labels using the given text processor, otherwise false
*/
public boolean isPreparedFor(TextProcessor tp) {
LabelDatabase db = getDbLabel(tp) ;
return db.exists() ;
}
/**
* Identifies the set of valid article ids which fit the given constrains. Useful for specifying a subset of
* articles that we are interested in caching.
*
* @param minLinkCount the minimum number of links that an article must receive from other articles to be included
* @param tracker an optional progress notifier
* @return the set of valid ids which fit the given constrains.
*/
public TIntHashSet getValidArticleIds(int minLinkCount, ProgressTracker tracker) {
//TODO: ideally this should advance a page iterator at the same time, to check page type
//TODO: ideally this should advance a pageLinkOut iterator at the same time, to check minimum outlinks
TIntHashSet pageIds = new TIntHashSet() ;
if (tracker == null) tracker = new ProgressTracker(1, WEnvironment.class) ;
tracker.startTask(dbPageLinkIn.getDatabaseSize(), "gathering valid page ids") ;
WIterator<Integer, DbLinkLocationList> iter = dbPageLinkIn.getIterator() ;
while (iter.hasNext()) {
WEntry<Integer, DbLinkLocationList> e = iter.next() ;
if (e.getValue().getLinkLocations().size() > minLinkCount)
pageIds.add(e.getKey()) ;
tracker.update();
}
iter.close() ;
return pageIds ;
}
protected void cleanAndCheckpoint() throws DatabaseException{
Logger.getLogger(WEnvironment.class).info("Starting cleaning") ;
boolean anyCleaned = false;
while (env.cleanLog() > 0) {
System.out.println("cleaning") ;
anyCleaned = true;
}
Logger.getLogger(WEnvironment.class).info("Finished cleaning") ;
if (anyCleaned) {
Logger.getLogger(WEnvironment.class).info("Starting checkpoint") ;
CheckpointConfig force = new CheckpointConfig();
force.setForce(true);
env.checkpoint(force);
Logger.getLogger(WEnvironment.class).info("Finished checkpoint") ;
}
}
@SuppressWarnings("unchecked")
private WDatabase getDatabase(DatabaseType dbType) {
return databasesByType.get(dbType) ;
}
private class PreparationThread extends Thread {
WikipediaConfiguration conf ;
private ProgressTracker tracker ;
private boolean completed = false ;
private Exception failureCause = null ;
PreparationThread(WikipediaConfiguration conf) {
this.conf = conf ;
}
public boolean isCompleted() {
return completed ;
}
public boolean failed() {
return (failureCause != null) ;
}
public double getProgress() {
if (completed)
return 1 ;
if (tracker == null)
return 0 ;
return tracker.getGlobalProgress() ;
}
public ProgressTracker getTracker() {
return tracker ;
}
@Override
public void run() {
doPreparation() ;
}
public void doPreparation() {
boolean mustGatherIds = (conf.getMinLinksIn() > 0 && !conf.getDatabasesToCache().isEmpty()) && conf.getArticlesOfInterest() == null ;
int taskCount = conf.getDatabasesToCache().size() + 1;
if (mustGatherIds)
taskCount++ ;
tracker = new ProgressTracker(taskCount, WEnvironment.class) ;
try {
tracker.startTask(1, "Connecting to database") ;
dbStatistics.cache(conf, null) ;
tracker.update();
if (mustGatherIds)
conf.setArticlesOfInterest(getValidArticleIds(conf.getMinLinksIn(), tracker)) ;
for(DatabaseType dbName:conf.getDatabasesToCache()) {
if (dbName == DatabaseType.label)
getDbLabel(conf.getDefaultTextProcessor()).cache(conf, tracker) ;
else
getDatabase(dbName).cache(conf, tracker) ;
}
conf.setArticlesOfInterest(null) ;
System.gc() ;
} catch (Exception e) {
failureCause = e ;
} ;
completed = true ;
}
}
public Exception getCachingFailureReason() {
if (this.prepThread == null)
return null ;
return this.prepThread.failureCause ;
}
/**
* Tidily closes the environment, and all databases within it. This should always be called once you are finished with the environment.
*/
@SuppressWarnings("unchecked")
public void close() {
for (WDatabase<String, DbLabel> dbProcessedLabel: processedLabelDbs.values()) {
dbProcessedLabel.close();
}
for (WDatabase db:this.databasesByType.values()) {
db.close() ;
}
}
@Override
public void finalize() {
if (env != null) {
Logger.getLogger(WIterator.class).warn("Unclosed enviroment. You may be causing a memory leak.") ;
}
}
/**
* Builds a WEnvironment, by loading all of the data files stored in the given directory into persistent databases.
*
* It will not create the environment or any databases unless all of the required files are found in the given directory.
*
* It will not delete any existing databases, and will only overwrite them if explicitly specified (even if they are incomplete).
*
* @param conf a configuration specifying where the databases are to be stored, etc.
* @param dataDirectory a directory containing the a single XML dump of wikipedia, and all of the CSV files produced by {@link DumpExtractor}
* @param overwrite true if existing databases should be overwritten, otherwise false
* @throws IOException if any of the required files cannot be read
* @throws XMLStreamException if the XML dump of wikipedia cannot be parsed
*/
public static void buildEnvironment(WikipediaConfiguration conf, File dataDirectory, boolean overwrite) throws IOException, XMLStreamException {
//check all files exist and are readable before doing anything
File statistics = getDataFile(dataDirectory, "stats.csv") ;
File page = getDataFile(dataDirectory, "page.csv") ;
File label = getDataFile(dataDirectory, "label.csv") ;
File pageLabel = getDataFile(dataDirectory, "pageLabel.csv") ;
File pageLinksIn = getDataFile(dataDirectory, "pageLinkIn.csv") ;
File pageLinksOut = getDataFile(dataDirectory, "pageLinkOut.csv") ;
File categoryParents = getDataFile(dataDirectory, "categoryParents.csv") ;
File articleParents = getDataFile(dataDirectory, "articleParents.csv") ;
File childCategories = getDataFile(dataDirectory, "childCategories.csv") ;
File childArticles = getDataFile(dataDirectory, "childArticles.csv") ;
File redirectTargetBySource = getDataFile(dataDirectory, "redirectTargetsBySource.csv") ;
File redirectSourcesByTarget = getDataFile(dataDirectory, "redirectSourcesByTarget.csv") ;
File sentenceSplits = getDataFile(dataDirectory, "sentenceSplits.csv") ;
File translations = getDataFile(dataDirectory, "translations.csv") ;
File markup = getMarkupDataFile(dataDirectory) ;
//now load databases
if (!conf.getDatabaseDirectory().exists())
conf.getDatabaseDirectory().mkdirs() ;
WEnvironment env = new WEnvironment(conf) ;
env.dbStatistics.loadFromCsvFile(statistics, overwrite, null) ;
env.dbPage.loadFromCsvFile(page, overwrite, null) ;
env.dbLabel.loadFromCsvFile(label, overwrite, null) ;
env.dbLabelsForPage.loadFromCsvFile(pageLabel, overwrite, null) ;
env.dbArticlesByTitle.loadFromCsvFile(page, overwrite, null) ;
env.dbCategoriesByTitle.loadFromCsvFile(page, overwrite, null) ;
env.dbTemplatesByTitle.loadFromCsvFile(page, overwrite, null) ;
env.dbRedirectTargetBySource.loadFromCsvFile(redirectTargetBySource, overwrite, null) ;
env.dbRedirectSourcesByTarget.loadFromCsvFile(redirectSourcesByTarget, overwrite, null) ;
env.dbPageLinkIn.loadFromCsvFile(pageLinksIn, overwrite, null) ;
env.dbPageLinkInNoSentences.loadFromCsvFile(pageLinksIn, overwrite, null) ;
env.dbPageLinkOut.loadFromCsvFile(pageLinksOut, overwrite, null) ;
env.dbPageLinkOutNoSentences.loadFromCsvFile(pageLinksOut, overwrite, null) ;
env.dbPageLinkCounts.loadFromCsvFiles(pageLinksIn, pageLinksOut, overwrite, null) ;
env.dbCategoryParents.loadFromCsvFile(categoryParents, overwrite, null) ;
env.dbArticleParents.loadFromCsvFile(articleParents, overwrite, null) ;
env.dbChildCategories.loadFromCsvFile(childCategories, overwrite, null) ;
env.dbChildArticles.loadFromCsvFile(childArticles, overwrite, null) ;
env.dbSentenceSplits.loadFromCsvFile(sentenceSplits, overwrite, null) ;
env.dbTranslations.loadFromCsvFile(translations, overwrite, null) ;
env.dbMarkup.loadFromXmlFile(markup, overwrite, null) ;
env.close();
TextProcessor tp = conf.getDefaultTextProcessor() ;
if (tp != null) {
File tmpDir = new File(conf.getDataDirectory() + File.separator + "tmp" + tp.getName()) ;
tmpDir.mkdir() ;
tmpDir.deleteOnExit() ;
prepareTextProcessor(tp, conf, tmpDir, overwrite, 5) ;
}
}
/**
* Prepares the environment, so it can be searched efficiently for labels using the given text processor.
*
* Note: you can use as many different text processors as you like
*
* @see LabelDatabase#prepare(File, int)
*
* @param tp a text processor
* @param conf a configuration specifying where the databases are to be stored, etc.
* @param tempDirectory a directory for writing temporary files
* @param overwrite true if the preparation should occur even if the environment has been prepared for this processor already
* @param passes the number of the number of passes to break the task into (more = slower, but less memory required)
* @throws IOException if the temporary directory is not writable
*/
public static void prepareTextProcessor(TextProcessor tp, WikipediaConfiguration conf, File tempDirectory, boolean overwrite, int passes) throws IOException {
if (tp == null)
return ;
WEnvironment env = new WEnvironment(conf) ;
if (!overwrite && env.isPreparedFor(tp))
return ;
LabelDatabase db = env.getDbLabel(tp) ;
db.prepare(tempDirectory, passes) ;
env.cleanAndCheckpoint() ;
env.close();
}
protected Environment getEnvironment() {
return env ;
}
private static File getDataFile(File dataDirectory, String fileName) throws IOException {
File file = new File(dataDirectory + File.separator + fileName) ;
if (!file.canRead())
throw new IOException(file + " is not readable") ;
return file ;
}
private static File getMarkupDataFile(File dataDirectory) throws IOException {
File[] files = dataDirectory.listFiles(new FilenameFilter() {
public boolean accept(File dir, String name) {
return name.endsWith("-pages-articles.xml") || name.endsWith("-pages-articles.xml.bz2") ;
}
}) ;
if (files.length == 0)
throw new IOException("Could not locate markup file in " + dataDirectory) ;
if (files.length > 1)
throw new IOException("There are multiple markup files in " + dataDirectory) ;
if (!files[0].canRead())
throw new IOException(files[0] + " is not readable") ;
return files[0] ;
}
}