package org.wikipedia.miner.db;
import gnu.trove.map.hash.THashMap;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import javax.xml.stream.XMLStreamException;
import org.apache.hadoop.record.CsvRecordInput;
import org.apache.log4j.Logger;
import org.wikipedia.miner.db.struct.*;
import org.wikipedia.miner.util.ProgressTracker;
import org.wikipedia.miner.util.WikipediaConfiguration;
import com.sleepycat.bind.EntryBinding;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.DatabaseNotFoundException;
import com.sleepycat.je.LockMode;
import com.sleepycat.je.OperationStatus;
/**
 * A wrapper for {@link Database} that adds the ability to load itself from data files and to selectively cache itself to memory.
 * <p>
 * It is unlikely that you will want to use this class directly.
 *
 * @param <K> the key type
 * @param <V> the value type
 */
public abstract class WDatabase<K,V> {

    /**
     * Database types
     */
    public enum DatabaseType
    {
        /**
         * Associates page ids with the title, type and generality of the page.
         */
        page,
        /**
         * Associates String labels with the statistics about the articles (senses) these labels could refer to
         */
        label,
        /**
         * Associates Integer page ids with the labels used to refer to that page
         */
        pageLabel,
        /**
         * Associates String titles with the id of the page within the article namespace that this refers to
         */
        articlesByTitle,
        /**
         * Associates String titles with the id of the page within the category namespace that this refers to
         */
        categoriesByTitle,
        /**
         * Associates String titles with the id of the page within the template namespace that this refers to
         */
        templatesByTitle,
        /**
         * Associates integer ids with the ids of articles that link to it, and the sentence indexes where these links are found
         */
        pageLinksIn,
        /**
         * Associates integer ids with the ids of articles that link to it
         */
        pageLinksInNoSentences,
        /**
         * Associates integer ids with the ids of articles that it links to, and the sentence indexes where these links are found
         */
        pageLinksOut,
        /**
         * Associates integer ids with the ids of articles that it links to
         */
        pageLinksOutNoSentences,
        /**
         * Associates integer ids with counts of how many pages it links to or that link to it
         */
        pageLinkCounts,
        /**
         * Associates integer ids of categories with the ids of categories it belongs to
         */
        categoryParents,
        /**
         * Associates integer ids of articles with the ids of categories it belongs to
         */
        articleParents,
        /**
         * Associates integer ids of categories with the ids of categories that belong to it
         */
        childCategories,
        /**
         * Associates integer ids of categories with the ids of articles that belong to it
         */
        childArticles,
        /**
         * Associates integer id of redirect with the id of its target
         */
        redirectTargetBySource,
        /**
         * Associates integer id of article with the redirects that target it
         */
        redirectSourcesByTarget,
        /**
         * Associates integer id of page with the character indexes of sentence breaks within it
         */
        sentenceSplits,
        /**
         * Associates integer id of page with a {@link DbTranslations}.
         */
        translations,
        /**
         * Associates integer id of page with its content, in mediawiki markup format
         */
        markup,
        /**
         * Associates integer {@link WEnvironment.StatisticName#ordinal()} with the value relevant to this statistic.
         */
        statistics
    }

    /**
     * Options for caching data to memory
     */
    public enum CachePriority {
        /**
         * Focus on speed, by storing values directly
         */
        speed,
        /**
         * Focus on memory, by compressing values before storing them.
         */
        space
    }

    private final String name;
    private final DatabaseType type;

    protected WEnvironment env;

    // handle to the underlying database; null until first opened and after close()
    private Database database;

    protected EntryBinding<K> keyBinding;
    protected EntryBinding<V> valueBinding;

    private boolean isCached = false;
    private CachePriority cachePriority = CachePriority.space;

    // used when cachePriority is space: values are held in serialised form
    private THashMap<K,byte[]> compactCache = null;
    // used when cachePriority is speed: values are held as objects
    private THashMap<K,V> fastCache = null;

    /**
     * Creates or connects to a database, whose name will match the given {@link WDatabase.DatabaseType}
     *
     * @param env the WEnvironment surrounding this database
     * @param type the type of database
     * @param keyBinding a binding for serialising and deserialising keys
     * @param valueBinding a binding for serialising and deserialising values
     */
    public WDatabase(WEnvironment env, DatabaseType type, EntryBinding<K> keyBinding, EntryBinding<V> valueBinding) {
        this(env, type, type.name(), keyBinding, valueBinding);
    }

    /**
     * Creates or connects to a database with the given name.
     *
     * @param env the WEnvironment surrounding this database
     * @param type the type of database
     * @param name the name of the database
     * @param keyBinding a binding for serialising and deserialising keys
     * @param valueBinding a binding for serialising and deserialising values
     */
    public WDatabase(WEnvironment env, DatabaseType type, String name, EntryBinding<K> keyBinding, EntryBinding<V> valueBinding) {
        this.env = env;
        this.type = type;
        this.name = name;
        this.keyBinding = keyBinding;
        this.valueBinding = valueBinding;
        this.database = null;
    }

    /**
     * Returns the type of this database
     *
     * @return the type of this database
     */
    public DatabaseType getType() {
        return type;
    }

    /**
     * Returns the name of this database
     *
     * @return the name of this database
     */
    public String getName() {
        return name;
    }

    /**
     * Returns the number of entries in the database
     *
     * @return the number of entries in the database
     */
    public long getDatabaseSize() {
        return getDatabase(true).count();
    }

    /**
     * Returns the number of entries that have been cached to memory
     *
     * @return the number of entries that have been cached to memory, or 0 if nothing has been cached
     */
    public long getCacheSize() {
        if (!isCached) {
            return 0;
        }

        // mirrors initializeCache()/addToCache(): speed fills fastCache, space fills compactCache
        if (cachePriority == CachePriority.speed) {
            return fastCache.size();
        } else {
            return compactCache.size();
        }
    }

    /**
     * Returns true if this has been cached to memory, otherwise false
     *
     * @return true if this has been cached to memory, otherwise false
     */
    public boolean isCached() {
        return isCached;
    }

    /**
     * Returns whether this has been cached for speed or memory efficiency
     *
     * @return whether this has been cached for speed or memory efficiency
     */
    public CachePriority getCachePriority() {
        return cachePriority;
    }

    /**
     * true if there is a persistent database underlying this, otherwise false
     *
     * @return true if there is a persistent database underlying this, otherwise false
     */
    public boolean exists() {
        try {
            getDatabase(true);
        } catch (DatabaseNotFoundException e) {
            return false;
        }
        return true;
    }

    /**
     * Retrieves the value associated with the given key, either from the persistent database, or from memory if
     * the database has been cached. This will return null if the key is not found, or has been excluded from the cache.
     *
     * @param key the key to search for
     * @return the value associated with the given key, or null if none exists.
     */
    public V retrieve(K key) {
        if (isCached) {
            return retrieveFromCache(key);
        }

        Database db = getDatabase(true);

        DatabaseEntry dbKey = new DatabaseEntry();
        keyBinding.objectToEntry(key, dbKey);

        DatabaseEntry dbValue = new DatabaseEntry();
        OperationStatus os = db.get(null, dbKey, dbValue, LockMode.READ_COMMITTED);

        if (!os.equals(OperationStatus.SUCCESS)) {
            return null;
        }
        return valueBinding.entryToObject(dbValue);
    }

    /**
     * Deserialises a CSV record.
     *
     * @param record the CSV record to deserialise
     * @return the key,value pair encoded within the record
     * @throws IOException if there is a problem decoding the record
     */
    public abstract WEntry<K,V> deserialiseCsvRecord(CsvRecordInput record) throws IOException;

    /**
     * Decides whether an entry should be cached to memory or not, and optionally alters values before they are cached.
     *
     * @param e the key,value pair to be filtered
     * @param conf a configuration containing options for how the database is to be cached
     * @return the value that should be cached along with the given key, or null if it should be excluded
     */
    public abstract V filterCacheEntry(WEntry<K,V> e, WikipediaConfiguration conf);

    /**
     * Builds the persistent database from a file.
     *
     * @param dataFile the file (typically a CSV file) containing data to be loaded
     * @param overwrite true if the existing database should be overwritten, otherwise false
     * @param tracker an optional progress tracker (may be null)
     * @throws IOException if there is a problem reading or deserialising the given data file.
     */
    public void loadFromCsvFile(File dataFile, boolean overwrite, ProgressTracker tracker) throws IOException {

        // exists() is deliberately evaluated first: it leaves any open handle read-only,
        // so the getDatabase(false) call below always takes the remove-and-recreate path.
        if (exists() && !overwrite) {
            return;
        }

        if (tracker == null) {
            tracker = new ProgressTracker(1, WDatabase.class);
        }
        tracker.startTask(dataFile.length(), "Loading " + name + " database");

        Database db = getDatabase(false);

        BufferedReader input = openCsvReader(dataFile);
        try {
            long bytesRead = 0;
            String line;

            while ((line = input.readLine()) != null) {
                // progress is measured against the file length, so count encoded bytes rather than chars
                byte[] lineBytes = (line + "\n").getBytes("UTF-8");
                bytesRead += lineBytes.length;

                CsvRecordInput cri = new CsvRecordInput(new ByteArrayInputStream(lineBytes));
                WEntry<K,V> entry = deserialiseCsvRecord(cri);

                if (entry != null) {
                    DatabaseEntry k = new DatabaseEntry();
                    keyBinding.objectToEntry(entry.getKey(), k);

                    DatabaseEntry v = new DatabaseEntry();
                    valueBinding.objectToEntry(entry.getValue(), v);

                    db.put(null, k, v);
                }

                tracker.update(bytesRead);
            }
        } finally {
            // release the file handle even if a record fails to deserialise or store
            input.close();
        }

        env.cleanAndCheckpoint();

        // switch the handle back to read-only now that loading is complete
        getDatabase(true);
    }

    /**
     * Selectively caches records from the database to memory, for much faster lookup.
     * <p>
     * Records are read from the file <em>name</em>.csv within the configured database directory if that
     * file is readable, and from the persistent database otherwise.
     *
     * @param conf a configuration specifying how items should be cached.
     * @param tracker an optional progress tracker (may be null)
     * @throws IOException if there is a problem reading or deserialising the csv file
     * @throws DatabaseException if there is a problem accessing the persistent database
     */
    public void cache(WikipediaConfiguration conf, ProgressTracker tracker) throws DatabaseException, IOException {

        Database db = getDatabase(true);

        this.cachePriority = conf.getCachePriority(type);
        initializeCache();

        if (tracker == null) {
            tracker = new ProgressTracker(1, WDatabase.class);
        }
        tracker.startTask(db.count(), "caching " + name + " database");

        //first, try caching from file
        if (conf.getDatabaseDirectory() != null) {
            File dataFile = new File(conf.getDatabaseDirectory() + File.separator + name + ".csv");

            if (dataFile.canRead()) {
                BufferedReader input = openCsvReader(dataFile);
                try {
                    long lineNum = 0;
                    String line;

                    while ((line = input.readLine()) != null) {
                        lineNum++;

                        CsvRecordInput cri = new CsvRecordInput(new ByteArrayInputStream((line + "\n").getBytes("UTF-8")));
                        WEntry<K,V> entry = deserialiseCsvRecord(cri);

                        if (entry != null) {
                            cacheIfAccepted(entry, conf);
                        }

                        tracker.update(lineNum);
                    }
                } finally {
                    input.close();
                }

                finalizeCache();
                return;
            }
        }

        //we haven't managed to cache from file, so let's do it from db
        WIterator<K,V> iter = getIterator();
        try {
            while (iter.hasNext()) {
                cacheIfAccepted(iter.next(), conf);
                tracker.update();
            }
        } finally {
            // close the iterator even if filtering or caching fails part-way
            iter.close();
        }

        finalizeCache();
    }

    /**
     * @return an iterator for the entries in this database, in ascending key order.
     */
    public WIterator<K,V> getIterator() {
        return new WIterator<K,V>(this);
    }

    /**
     * Closes the underlying database and discards any in-memory cache.
     */
    public void close() {
        if (database != null) {
            database.close();
            database = null;
        }

        // the caches are dropped, so stop routing lookups to them
        isCached = false;
        fastCache = null;
        compactCache = null;
    }

    /**
     * Logs a warning if this is garbage collected while the underlying database is still open.
     */
    @Override
    public void finalize() {
        if (database != null) {
            Logger.getLogger(WDatabase.class).warn("Unclosed database '" + name + "'. You may be causing a memory leak.");
        }
    }

    /**
     * Looks up a key in whichever in-memory cache matches the current cache priority.
     *
     * @param key the key to search for
     * @return the cached value, or null if the key is not in the cache
     */
    protected V retrieveFromCache(K key) {
        if (cachePriority == CachePriority.speed) {
            return fastCache.get(key);
        }

        byte[] cachedData = compactCache.get(key);
        if (cachedData == null) {
            return null;
        }

        // values in the compact cache are stored serialised, so decode on the way out
        DatabaseEntry dbValue = new DatabaseEntry(cachedData);
        return valueBinding.entryToObject(dbValue);
    }

    /**
     * Allocates an empty cache of the kind required by the current cache priority.
     */
    protected void initializeCache() {
        if (cachePriority == CachePriority.speed) {
            fastCache = new THashMap<K,V>();
        } else {
            compactCache = new THashMap<K,byte[]>();
        }
    }

    /**
     * Stores an entry in the cache matching the current cache priority; for {@link CachePriority#space}
     * the value is serialised with the value binding before being stored.
     *
     * @param entry the key,value pair to store
     */
    protected void addToCache(WEntry<K,V> entry) {
        if (cachePriority == CachePriority.speed) {
            fastCache.put(entry.getKey(), entry.getValue());
        } else {
            DatabaseEntry cacheValue = new DatabaseEntry();
            valueBinding.objectToEntry(entry.getValue(), cacheValue);
            compactCache.put(entry.getKey(), cacheValue.getData());
        }
    }

    /**
     * Marks the cache as complete, so that {@link #retrieve(Object)} is answered from memory.
     */
    protected void finalizeCache() {
        this.isCached = true;
    }

    /**
     * Returns a handle to the underlying database, opened in the requested mode. An already open handle is
     * reused if its mode matches; otherwise it is closed and the database is re-opened.
     * <p>
     * Requesting a writable handle removes any existing database of this name and creates a fresh, empty one.
     *
     * @param readOnly true for a read-only handle, false for a writable handle on a newly created database
     * @return the open database handle
     * @throws DatabaseException if the database cannot be opened (e.g. {@link DatabaseNotFoundException} when
     * a read-only handle is requested and no database exists)
     */
    protected Database getDatabase(boolean readOnly) throws DatabaseException {

        if (database != null) {
            if (database.getConfig().getReadOnly() == readOnly) {
                //the database is already open as it should be.
                return database;
            }

            //the database needs to be closed and re-opened.
            database.close();
            // forget the closed handle, so a failure below cannot leave it behind for later callers
            database = null;
        }

        if (!readOnly) {
            // the writable handle is opened with exclusive create, so clear out any previous copy first
            try {
                env.getEnvironment().removeDatabase(null, name);
            } catch (DatabaseNotFoundException ignored) {
                // nothing to remove: the database has not been created yet
            }
        }

        DatabaseConfig conf = new DatabaseConfig();
        conf.setReadOnly(readOnly);
        conf.setAllowCreate(!readOnly);
        conf.setExclusiveCreate(!readOnly);

        database = env.getEnvironment().openDatabase(null, name, conf);
        return database;
    }

    /**
     * Opens the given data file for line-by-line reading as UTF-8 text.
     */
    private BufferedReader openCsvReader(File dataFile) throws IOException {
        return new BufferedReader(new InputStreamReader(new FileInputStream(dataFile), "UTF-8"));
    }

    /**
     * Runs the entry through {@link #filterCacheEntry(WEntry, WikipediaConfiguration)} and caches the filtered value unless it is null.
     */
    private void cacheIfAccepted(WEntry<K,V> entry, WikipediaConfiguration conf) {
        V filteredValue = filterCacheEntry(entry, conf);
        if (filteredValue != null) {
            addToCache(new WEntry<K,V>(entry.getKey(), filteredValue));
        }
    }
}