package org.wikipedia.miner.db;

import gnu.trove.set.hash.TIntHashSet;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Map;
import java.util.TreeMap;

import org.apache.hadoop.record.CsvRecordInput;
import org.wikipedia.miner.db.WDatabase.DatabaseType;
import org.wikipedia.miner.db.struct.DbPage;
import org.wikipedia.miner.model.Page.PageType;
import org.wikipedia.miner.util.ProgressTracker;
import org.wikipedia.miner.util.WikipediaConfiguration;

import com.sleepycat.bind.EntryBinding;
import com.sleepycat.bind.tuple.IntegerBinding;
import com.sleepycat.bind.tuple.StringBinding;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseEntry;

/**
 * A {@link WDatabase} that maps page titles (String keys) to page ids (Integer values).
 * <p>
 * An instance is bound to exactly one of {@code DatabaseType.articlesByTitle},
 * {@code DatabaseType.categoriesByTitle} or {@code DatabaseType.templatesByTitle};
 * the type decides which page records are accepted when loading from CSV.
 */
public class TitleDatabase extends WDatabase<String,Integer> {

    /**
     * Creates a title database of the given type.
     *
     * @param env  the environment handed to the {@link WDatabase} constructor
     * @param type one of {@code articlesByTitle}, {@code categoriesByTitle} or {@code templatesByTitle}
     * @throws IllegalArgumentException if {@code type} is any other database type
     */
    public TitleDatabase(WEnvironment env, DatabaseType type) {

        super(env, type, new StringBinding(), new IntegerBinding());

        if (type != DatabaseType.articlesByTitle
                && type != DatabaseType.categoriesByTitle
                && type != DatabaseType.templatesByTitle) {
            throw new IllegalArgumentException("type must be either DatabaseType.articlesByTitle, DatabaseType.categoriesByTitle or DatabaseType.templatesByTitle") ;
        }
    }

    /**
     * Reads one page record (an int id followed by a serialized {@link DbPage}) and turns it
     * into a title-to-id entry.
     *
     * @param record the CSV record to read from
     * @return an entry mapping the page title to its id, or {@code null} if the page's type
     *         does not belong in this database (articles, disambiguations and redirects for
     *         {@code articlesByTitle}; categories for {@code categoriesByTitle}; templates for
     *         {@code templatesByTitle})
     * @throws IOException if the record cannot be read
     */
    @Override
    public WEntry<String, Integer> deserialiseCsvRecord(CsvRecordInput record) throws IOException {

        Integer id = record.readInt(null);

        DbPage p = new DbPage();
        p.deserialize(record);

        // The stored type is used as an index into PageType.values().
        PageType pageType = PageType.values()[p.getType()];
        DatabaseType dbType = getType();

        if (dbType == DatabaseType.articlesByTitle
                && (pageType != PageType.article
                        && pageType != PageType.disambiguation
                        && pageType != PageType.redirect)) {
            return null;
        }

        if (dbType == DatabaseType.categoriesByTitle && pageType != PageType.category) {
            return null;
        }

        if (dbType == DatabaseType.templatesByTitle && pageType != PageType.template) {
            return null;
        }

        return new WEntry<String,Integer>(p.getTitle(), id);
    }

    /**
     * Decides whether an entry's value should be kept when filtering for the cache.
     * <p>
     * Only {@code articlesByTitle} databases filter: when the configuration supplies a
     * non-null set of articles of interest, ids outside that set are rejected.
     *
     * @param e    the entry being considered
     * @param conf the configuration supplying the articles of interest
     * @return the entry's value, or {@code null} if the entry is filtered out
     */
    @Override
    public Integer filterCacheEntry(WEntry<String, Integer> e, WikipediaConfiguration conf) {

        TIntHashSet validIds = conf.getArticlesOfInterest();

        if (getType() == DatabaseType.articlesByTitle) {
            if (validIds != null && !validIds.contains(e.getValue())) {
                return null;
            }
        }

        return e.getValue();
    }

    /**
     * Loads this database from a UTF-8 CSV file with one record per line.
     * <p>
     * Does nothing if the database already exists and {@code overwrite} is false. Accepted
     * records are first collected in a {@link TreeMap}, so they are written to the database
     * in sorted key order and a title that occurs more than once keeps only its last id.
     *
     * @param dataFile  the CSV file to read
     * @param overwrite whether to reload when the database already exists
     * @param tracker   progress tracker to report to; a new one is created if {@code null}
     * @throws IOException if the file cannot be read or a record cannot be parsed
     */
    @Override
    public void loadFromCsvFile(File dataFile, boolean overwrite, ProgressTracker tracker) throws IOException {

        if (exists() && !overwrite) {
            return;
        }

        if (tracker == null) {
            tracker = new ProgressTracker(1, WDatabase.class);
        }
        tracker.startTask(dataFile.length(), "Loading " + getName());

        TreeMap<String, Integer> tmp = new TreeMap<String, Integer>();

        BufferedReader input = new BufferedReader(new InputStreamReader(new FileInputStream(dataFile), "UTF-8"));
        try {
            long bytesRead = 0;

            String line;
            while ((line = input.readLine()) != null) {

                // Progress is measured against the file size in bytes, so count the encoded
                // bytes of the line (plus its newline) rather than its length in chars.
                byte[] lineBytes = (line + "\n").getBytes("UTF-8");
                bytesRead = bytesRead + lineBytes.length;

                CsvRecordInput cri = new CsvRecordInput(new ByteArrayInputStream(lineBytes));
                WEntry<String,Integer> entry = deserialiseCsvRecord(cri);

                if (entry != null) {
                    tmp.put(entry.getKey(), entry.getValue());
                    tracker.update(bytesRead);
                }
            }
        } finally {
            // Release the file handle even when reading or parsing fails.
            input.close();
        }

        Database db = getDatabase(false);

        for (Map.Entry<String, Integer> entry : tmp.entrySet()) {

            DatabaseEntry k = new DatabaseEntry();
            keyBinding.objectToEntry(entry.getKey(), k);

            DatabaseEntry v = new DatabaseEntry();
            valueBinding.objectToEntry(entry.getValue(), v);

            db.put(null, k, v);

            //TODO: progress update
        }

        env.cleanAndCheckpoint();
        getDatabase(true);
    }
}