package org.wikipedia.miner.db;

import gnu.trove.set.hash.TIntHashSet;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;

import org.apache.hadoop.record.CsvRecordInput;
import org.wikipedia.miner.db.struct.DbLinkLocation;
import org.wikipedia.miner.db.struct.DbLinkLocationList;
import org.wikipedia.miner.db.struct.DbPageLinkCounts;
import org.wikipedia.miner.util.ProgressTracker;
import org.wikipedia.miner.util.WikipediaConfiguration;

import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseEntry;

/**
 * A {@link WDatabase} associating page ids with counts of the distinct and total
 * links made to and from each page.
 */
public class PageLinkCountDatabase extends IntObjectDatabase<DbPageLinkCounts> {

    public PageLinkCountDatabase(WEnvironment env) {
        super(
            env,
            DatabaseType.pageLinkCounts,
            new RecordBinding<DbPageLinkCounts>() {
                @Override
                public DbPageLinkCounts createRecordInstance() {
                    return new DbPageLinkCounts() ;
                }
            }
        ) ;
    }

    @Override
    public WEntry<Integer, DbPageLinkCounts> deserialiseCsvRecord(CsvRecordInput record) throws IOException {
        // Link counts are derived from two csv files at once, so a single-record
        // deserialiser makes no sense here; see loadFromCsvFiles below.
        throw new UnsupportedOperationException() ;
    }

    private WEntry<Integer, DbLinkLocationList> deserializePageLinkCsvRecord(CsvRecordInput record) throws IOException {
        Integer id = record.readInt(null) ;

        DbLinkLocationList l = new DbLinkLocationList() ;
        l.deserialize(record) ;

        return new WEntry<Integer, DbLinkLocationList>(id, l) ;
    }

    private WEntry<Integer, DbPageLinkCounts> buildLinkSummaryEntry(
            WEntry<Integer, DbLinkLocationList> inLinkEntry,
            WEntry<Integer, DbLinkLocationList> outLinkEntry
    ) throws IOException {

        if (inLinkEntry == null && outLinkEntry == null)
            throw new IOException("both inlink and outlink entries are null") ;

        if (inLinkEntry != null && outLinkEntry != null && !inLinkEntry.getKey().equals(outLinkEntry.getKey()))
            throw new IOException("inlink and outlink records are not for the same page") ;

        Integer id = null ;
        DbPageLinkCounts linkCounts = new DbPageLinkCounts(0, 0, 0, 0) ;

        if (inLinkEntry != null) {
            id = inLinkEntry.getKey() ;

            // Each DbLinkLocation is one distinct linking page; each of its
            // sentence indexes is one occurrence of the link.
            int total = 0 ;
            int distinct = 0 ;
            for (DbLinkLocation ll : inLinkEntry.getValue().getLinkLocations()) {
                distinct++ ;
                total += ll.getSentenceIndexes().size() ;
            }

            linkCounts.setTotalLinksIn(total) ;
            linkCounts.setDistinctLinksIn(distinct) ;
        }

        if (outLinkEntry != null) {
            id = outLinkEntry.getKey() ;

            int total = 0 ;
            int distinct = 0 ;
            for (DbLinkLocation ll : outLinkEntry.getValue().getLinkLocations()) {
                distinct++ ;
                total += ll.getSentenceIndexes().size() ;
            }

            linkCounts.setTotalLinksOut(total) ;
            linkCounts.setDistinctLinksOut(distinct) ;
        }

        return new WEntry<Integer, DbPageLinkCounts>(id, linkCounts) ;
    }

    @Override
    public DbPageLinkCounts filterCacheEntry(WEntry<Integer, DbPageLinkCounts> e, WikipediaConfiguration conf) {
        TIntHashSet validIds = conf.getArticlesOfInterest() ;

        if (validIds != null && !validIds.contains(e.getKey()))
            return null ;

        return e.getValue() ;
    }

    @Override
    public void loadFromCsvFile(File dataFile, boolean overwrite, ProgressTracker tracker) throws IOException {
        // This database is built from a pair of csv files; use loadFromCsvFiles instead.
        throw new UnsupportedOperationException() ;
    }
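    /**
     * Builds the database from two csv files, both sorted by page id: one listing
     * the links made to each page, the other the links made from each page. The
     * two streams are advanced together in a single pass (a sorted merge-join), so
     * each page's in and out link counts can be summarised as soon as both sides
     * have been seen.
     *
     * @param linksInFile the csv file of in links, sorted by page id
     * @param linksOutFile the csv file of out links, sorted by page id
     * @param overwrite true if an existing database should be overwritten, otherwise false
     * @param tracker an optional progress tracker (may be null)
     * @throws IOException if there is a problem reading the csv files or writing the database
     */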
    public void loadFromCsvFiles(File linksInFile, File linksOutFile, boolean overwrite, ProgressTracker tracker) throws IOException {

        if (exists() && !overwrite)
            return ;

        if (tracker == null)
            tracker = new ProgressTracker(1, WDatabase.class) ;
        tracker.startTask(linksInFile.length() + linksOutFile.length(), "Loading " + getName() + " database") ;

        Database db = getDatabase(false) ;

        BufferedReader linksInInput = new BufferedReader(new InputStreamReader(new FileInputStream(linksInFile), "UTF-8")) ;
        BufferedReader linksOutInput = new BufferedReader(new InputStreamReader(new FileInputStream(linksOutFile), "UTF-8")) ;

        long bytesRead = 0 ;

        // Prime the merge with the first record from each file (both files are
        // assumed to contain at least one record).
        String inLinkLine = linksInInput.readLine() ;
        bytesRead += (inLinkLine.length() + 1) ;
        CsvRecordInput linksInRecord = new CsvRecordInput(new ByteArrayInputStream((inLinkLine + "\n").getBytes("UTF-8"))) ;
        WEntry<Integer, DbLinkLocationList> inLinkEntry = deserializePageLinkCsvRecord(linksInRecord) ;

        String outLinkLine = linksOutInput.readLine() ;
        bytesRead += (outLinkLine.length() + 1) ;
        CsvRecordInput linksOutRecord = new CsvRecordInput(new ByteArrayInputStream((outLinkLine + "\n").getBytes("UTF-8"))) ;
        WEntry<Integer, DbLinkLocationList> outLinkEntry = deserializePageLinkCsvRecord(linksOutRecord) ;

        // Merge the two id-sorted streams. The loop runs while *either* stream has
        // records left; the original `&&` condition silently dropped the tail of
        // whichever file ran longer, despite the null checks below clearly being
        // meant to handle exactly that case.
        while (inLinkEntry != null || outLinkEntry != null) {

            WEntry<Integer, DbPageLinkCounts> linkCountEntry ;

            boolean advanceInLinks = false ;
            boolean advanceOutLinks = false ;

            if (inLinkEntry == null || (outLinkEntry != null && outLinkEntry.getKey() < inLinkEntry.getKey())) {
                // this page has out links only
                linkCountEntry = buildLinkSummaryEntry(null, outLinkEntry) ;
                advanceOutLinks = true ;
            } else if (outLinkEntry == null || inLinkEntry.getKey() < outLinkEntry.getKey()) {
                // this page has in links only
                linkCountEntry = buildLinkSummaryEntry(inLinkEntry, null) ;
                advanceInLinks = true ;
            } else {
                // both files hold a record for this page
                linkCountEntry = buildLinkSummaryEntry(inLinkEntry, outLinkEntry) ;
                advanceInLinks = true ;
                advanceOutLinks = true ;
            }

            DatabaseEntry k = new DatabaseEntry() ;
            keyBinding.objectToEntry(linkCountEntry.getKey(), k) ;

            DatabaseEntry v = new DatabaseEntry() ;
            valueBinding.objectToEntry(linkCountEntry.getValue(), v) ;

            db.put(null, k, v) ;

            if (advanceInLinks) {
                inLinkLine = linksInInput.readLine() ;
                if (inLinkLine != null) {
                    bytesRead += (inLinkLine.length() + 1) ;
                    linksInRecord = new CsvRecordInput(new ByteArrayInputStream((inLinkLine + "\n").getBytes("UTF-8"))) ;
                    inLinkEntry = deserializePageLinkCsvRecord(linksInRecord) ;
                } else {
                    inLinkEntry = null ;
                }
            }

            if (advanceOutLinks) {
                outLinkLine = linksOutInput.readLine() ;
                if (outLinkLine != null) {
                    bytesRead += (outLinkLine.length() + 1) ;
                    linksOutRecord = new CsvRecordInput(new ByteArrayInputStream((outLinkLine + "\n").getBytes("UTF-8"))) ;
                    outLinkEntry = deserializePageLinkCsvRecord(linksOutRecord) ;
                } else {
                    outLinkEntry = null ;
                }
            }

            tracker.update(bytesRead) ;
        }

        linksInInput.close() ;
        linksOutInput.close() ;

        env.cleanAndCheckpoint() ;
        getDatabase(true) ;
    }
}
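/*
 * Usage sketch (not part of the class above): building the link count database
 * from the two id-sorted csv files produced by the extraction step. The file
 * names are assumptions for illustration.
 *
 *   WEnvironment env = ... ;                        // an open, configured environment
 *   PageLinkCountDatabase db = new PageLinkCountDatabase(env) ;
 *   db.loadFromCsvFiles(
 *       new File("pageLinkIn.csv"),                 // in links, sorted by page id
 *       new File("pageLinkOut.csv"),                // out links, sorted by page id
 *       false,                                      // keep an existing database if present
 *       null) ;                                     // method creates its own ProgressTracker
 */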