package org.wikipedia.miner.db;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.apache.commons.io.input.CountingInputStream;
import org.apache.hadoop.record.CsvRecordInput;
import org.wikipedia.miner.util.ProgressTracker;
import org.wikipedia.miner.util.WikipediaConfiguration;
import com.sleepycat.bind.tuple.IntegerBinding;
import com.sleepycat.bind.tuple.StringBinding;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseEntry;
import org.apache.tools.bzip2.* ;
/**
 * A {@link WDatabase} for associating page ids with page markup.
 * <p>
 * The database is populated directly from a Wikipedia XML dump via
 * {@link #loadFromXmlFile(File, boolean, ProgressTracker)}; loading from CSV is not supported.
 * <p>
 * This will throw {@link UnsupportedOperationException}s if any attempt is made to cache this database to memory.
 */
public class MarkupDatabase extends WDatabase<Integer, String> {

    /** The dump elements the loader reacts to; any other element name resolves to {@code ignorable}. */
    private enum DumpTag {page, id, text, ignorable}

    /**
     * Creates or connects to a database, whose name and type will be {@link WDatabase.DatabaseType#markup}.
     *
     * @param env the WEnvironment surrounding this database
     */
    public MarkupDatabase(WEnvironment env) {
        super(env, DatabaseType.markup, new IntegerBinding(), new StringBinding());
    }

    /**
     * Not supported: this database cannot be cached to memory.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public String filterCacheEntry(WEntry<Integer, String> e,
            WikipediaConfiguration conf) {
        throw new UnsupportedOperationException();
    }

    /**
     * Not supported: markup is read from an XML dump, not from CSV records.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public WEntry<Integer, String> deserialiseCsvRecord(CsvRecordInput record) throws IOException {
        throw new UnsupportedOperationException();
    }

    /**
     * Not supported: use {@link #loadFromXmlFile(File, boolean, ProgressTracker)} instead.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void loadFromCsvFile(File dataFile, boolean overwrite, ProgressTracker tracker) throws IOException {
        throw new UnsupportedOperationException();
    }

    /**
     * Builds the persistent markup database from an XML dump.
     * <p>
     * For every {@code page} element, the content of the first {@code id} element and of the
     * {@code text} element (both trimmed) are stored as a key/value pair. Pages for which no id
     * or no text element was seen are skipped. Files whose name ends in {@code .bz2} are
     * decompressed on the fly. Nothing is done if the database already exists and
     * {@code overwrite} is false.
     *
     * @param dataFile the XML file containing a wikipedia dump
     * @param overwrite true if the existing database should be overwritten, otherwise false
     * @param tracker an optional progress tracker (may be null)
     * @throws IOException if there is a problem reading or deserialising the given data file.
     * @throws XMLStreamException if the XML within the data file cannot be parsed.
     */
    public void loadFromXmlFile(File dataFile, boolean overwrite, ProgressTracker tracker) throws IOException, XMLStreamException {

        if (exists() && !overwrite)
            return;

        if (tracker == null)
            tracker = new ProgressTracker(1, MarkupDatabase.class);

        tracker.startTask(dataFile.length(), "Loading " + getName() + " database");

        Database db = getDatabase(false);

        InputStream fileStream = new FileInputStream(dataFile);
        try {
            // Count bytes taken from the file itself (not the decompressed stream), so that
            // progress stays on the same scale as dataFile.length() for compressed dumps too.
            CountingInputStream countingReader = new CountingInputStream(fileStream);

            InputStream reader = countingReader;
            if (dataFile.getName().endsWith(".bz2")) {
                // CBZip2InputStream expects the stream to be positioned just after the
                // two-byte "BZ" signature, so it has to be consumed here.
                int magic0 = countingReader.read();
                int magic1 = countingReader.read();
                if (magic0 != 'B' || magic1 != 'Z')
                    throw new IOException("'" + dataFile + "' is not a bzip2 file (missing 'BZ' signature)");

                reader = new CBZip2InputStream(countingReader);
            }

            XMLStreamReader xmlStreamReader = XMLInputFactory.newInstance().createXMLStreamReader(reader, "UTF-8");
            try {
                Integer currId = null;
                String currMarkup = null;

                // text seen since the last closing tag
                StringBuilder characters = new StringBuilder();

                while (xmlStreamReader.hasNext()) {

                    switch (xmlStreamReader.next()) {

                    case XMLStreamReader.END_ELEMENT:
                        switch (resolveDumpTag(xmlStreamReader.getLocalName())) {
                        case id:
                            //only take the first id (there is a 2nd one for the revision)
                            if (currId == null)
                                currId = Integer.parseInt(characters.toString().trim());
                            break;
                        case text:
                            currMarkup = characters.toString().trim();
                            break;
                        case page:
                            // an incomplete page cannot be stored; skip it rather than abort the whole load
                            if (currId != null && currMarkup != null)
                                storePage(db, currId, currMarkup);

                            currId = null;
                            currMarkup = null;

                            tracker.update(countingReader.getByteCount());
                            break;
                        default:
                            break;
                        }

                        characters.setLength(0);
                        break;

                    case XMLStreamReader.CHARACTERS:
                        characters.append(xmlStreamReader.getText());
                        break;

                    default:
                        break;
                    }
                }
            } finally {
                xmlStreamReader.close();
            }
        } finally {
            // closing the XMLStreamReader does not close the underlying stream
            fileStream.close();
        }

        env.cleanAndCheckpoint();
        getDatabase(true);
    }

    /**
     * Serialises the given id and markup with this database's bindings and writes them to the given database.
     */
    private void storePage(Database db, Integer id, String markup) {

        DatabaseEntry key = new DatabaseEntry();
        keyBinding.objectToEntry(id, key);

        DatabaseEntry value = new DatabaseEntry();
        valueBinding.objectToEntry(markup, value);

        db.put(null, key, value);
    }

    /**
     * Maps an element name to the corresponding {@link DumpTag}, or to
     * {@link DumpTag#ignorable} if the name does not match any tag.
     */
    private DumpTag resolveDumpTag(String tagName) {

        try {
            return DumpTag.valueOf(tagName);
        } catch (IllegalArgumentException e) {
            return DumpTag.ignorable;
        }
    }
}