package org.wikipedia.miner.db; import gnu.trove.set.hash.TIntHashSet; import java.io.BufferedReader; import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.Map; import java.util.TreeMap; import org.apache.hadoop.record.CsvRecordInput; import org.apache.log4j.Logger; import org.wikipedia.miner.db.WDatabase.DatabaseType; import org.wikipedia.miner.db.WEnvironment.StatisticName; import org.wikipedia.miner.db.struct.*; import org.wikipedia.miner.model.Page.PageType; import org.wikipedia.miner.util.ProgressTracker; import org.wikipedia.miner.util.WikipediaConfiguration; import org.wikipedia.miner.util.text.TextProcessor; import com.sleepycat.bind.tuple.IntegerBinding; import com.sleepycat.bind.tuple.LongBinding; import com.sleepycat.je.Database; import com.sleepycat.je.DatabaseEntry; /** * A factory for creating WDatabases of various types */ public class WDatabaseFactory { WEnvironment env ; /** * Creates a new WDatabaseFactory for the given WEnvironment * * @param env a WEnvironment */ public WDatabaseFactory(WEnvironment env) { this.env = env ; } /** * Returns a database associating page ids with the title, type and generality of the page. * * @return a database associating page ids with the title, type and generality of the page. */ public WDatabase<Integer, DbPage> buildPageDatabase() { RecordBinding<DbPage> keyBinding = new RecordBinding<DbPage>() { public DbPage createRecordInstance() { return new DbPage() ; } } ; return new IntObjectDatabase<DbPage>( env, DatabaseType.page, keyBinding ) { @Override public WEntry<Integer,DbPage> deserialiseCsvRecord(CsvRecordInput record) throws IOException { Integer id = record.readInt(null) ; DbPage p = new DbPage() ; p.deserialize(record) ; return new WEntry<Integer,DbPage>(id, p) ; } @Override public DbPage filterCacheEntry( WEntry<Integer, DbPage> e, WikipediaConfiguration conf ) { PageType pageType = PageType.values()[e.getValue().getType()] ; TIntHashSet validIds = conf.getArticlesOfInterest() ; if (validIds == null || validIds.contains(e.getKey()) || pageType == PageType.category || pageType==PageType.redirect) return e.getValue() ; else return null ; } }; } /** * Returns a database associating article, category or template titles with their ids. * * @param {@link DatabaseType#articlesByTitle}, {@link DatabaseType#templatesByTitle} or {@link DatabaseType#categoriesByTitle}. * @return a database associating article, category or template titles with their ids. */ public WDatabase<String,Integer> buildTitleDatabase(DatabaseType type) { return new TitleDatabase(env, type) ; } /** * Returns a database associating String labels with the statistics about the articles (senses) these labels could refer to. * * @return a database associating String labels with the statistics about the articles (senses) these labels could refer to. */ public LabelDatabase buildLabelDatabase() { return new LabelDatabase(env) ; } /** * Returns a database associating String labels with the statistics about the articles (senses) these labels could refer to. * * @param tp a text processor that should be applied to string labels before indexing and searching * @return a database associating String labels with the statistics about the articles (senses) these labels could refer to */ public LabelDatabase buildLabelDatabase(TextProcessor tp) { if (tp == null) throw new IllegalArgumentException("text processor must not be null") ; return new LabelDatabase(env, tp) ; } /** * Returns a database associating Integer page ids with the labels used to refer to that page * * @return a database associating Integer page ids with the labels used to refer to that page */ public WDatabase<Integer,DbLabelForPageList> buildPageLabelDatabase() { RecordBinding<DbLabelForPageList> keyBinding = new RecordBinding<DbLabelForPageList>() { public DbLabelForPageList createRecordInstance() { return new DbLabelForPageList() ; } } ; return new IntObjectDatabase<DbLabelForPageList>( env, DatabaseType.pageLabel, keyBinding ) { @Override public WEntry<Integer,DbLabelForPageList> deserialiseCsvRecord(CsvRecordInput record) throws IOException { Integer id = record.readInt(null) ; DbLabelForPageList labels = new DbLabelForPageList() ; labels.deserialize(record) ; return new WEntry<Integer,DbLabelForPageList>(id, labels) ; } @Override public DbLabelForPageList filterCacheEntry(WEntry<Integer,DbLabelForPageList> e, WikipediaConfiguration conf) { TIntHashSet validIds = conf.getArticlesOfInterest() ; if (validIds != null && !validIds.contains(e.getKey())) return null ; return e.getValue(); } } ; } /** * Returns a database associating Integer ids with the ids of articles it links to or that link to it, and the sentence indexes where these links are found. * * @param type either {@link DatabaseType#pageLinksIn} or {@link DatabaseType#pageLinksOut}. * @return a database associating Integer ids with the ids of articles it links to or that link to it, and the sentence indexes where these links are found */ public WDatabase<Integer, DbLinkLocationList> buildPageLinkDatabase(DatabaseType type) { if (type != DatabaseType.pageLinksIn && type != DatabaseType.pageLinksOut) throw new IllegalArgumentException("type must be either DatabaseType.pageLinksIn or DatabaseType.pageLinksOut") ; RecordBinding<DbLinkLocationList> keyBinding = new RecordBinding<DbLinkLocationList>() { public DbLinkLocationList createRecordInstance() { return new DbLinkLocationList() ; } } ; return new IntObjectDatabase<DbLinkLocationList>( env, type, keyBinding ) { @Override public WEntry<Integer, DbLinkLocationList> deserialiseCsvRecord(CsvRecordInput record) throws IOException { Integer id = record.readInt(null) ; DbLinkLocationList l = new DbLinkLocationList() ; l.deserialize(record) ; return new WEntry<Integer, DbLinkLocationList>(id, l) ; } @Override public DbLinkLocationList filterCacheEntry( WEntry<Integer, DbLinkLocationList> e, WikipediaConfiguration conf) { int id = e.getKey() ; DbLinkLocationList links = e.getValue() ; TIntHashSet validIds = conf.getArticlesOfInterest() ; if (validIds != null && !validIds.contains(id)) return null ; ArrayList<DbLinkLocation> newLinks = new ArrayList<DbLinkLocation>() ; for (DbLinkLocation ll:links.getLinkLocations()) { if (validIds != null && !validIds.contains(ll.getLinkId())) continue ; newLinks.add(ll) ; } if (newLinks.size() == 0) return null ; links.setLinkLocations(newLinks) ; return links ; } } ; } /** * Returns a database associating Integer ids with the ids of articles it links to or that link to it. * * @param type either {@link DatabaseType#pageLinksIn} or {@link DatabaseType#pageLinksOut}. * @return a database associating Integer ids with the ids of articles it links to or that link to it. */ public WDatabase<Integer, DbIntList> buildPageLinkNoSentencesDatabase(DatabaseType type) { if (type != DatabaseType.pageLinksInNoSentences && type != DatabaseType.pageLinksOutNoSentences) throw new IllegalArgumentException("type must be either DatabaseType.pageLinksInNoSentences or DatabaseType.pageLinksOutNoSentences") ; RecordBinding<DbIntList> keyBinding = new RecordBinding<DbIntList>() { public DbIntList createRecordInstance() { return new DbIntList() ; } } ; return new IntObjectDatabase<DbIntList>( env, type, keyBinding ) { @Override public WEntry<Integer, DbIntList> deserialiseCsvRecord(CsvRecordInput record) throws IOException { // this has to read from pagelinks file (with sentences Integer id = record.readInt(null) ; DbLinkLocationList l = new DbLinkLocationList() ; l.deserialize(record) ; ArrayList<Integer> linkIds = new ArrayList<Integer>() ; for (DbLinkLocation ll:l.getLinkLocations()) linkIds.add(ll.getLinkId()) ; return new WEntry<Integer, DbIntList>(id, new DbIntList(linkIds)) ; } @Override public DbIntList filterCacheEntry( WEntry<Integer, DbIntList> e, WikipediaConfiguration conf) { int id = e.getKey() ; DbIntList links = e.getValue() ; TIntHashSet validIds = conf.getArticlesOfInterest() ; if (validIds != null && !validIds.contains(id)) return null ; ArrayList<Integer> newLinks = new ArrayList<Integer>() ; for (Integer link:links.getValues()) { if (validIds != null && !validIds.contains(link)) continue ; newLinks.add(link) ; } if (newLinks.size() == 0) return null ; links.setValues(newLinks) ; return links ; } @Override public void loadFromCsvFile(File dataFile, boolean overwrite, ProgressTracker tracker) throws IOException { if (exists() && !overwrite) return ; if (tracker == null) tracker = new ProgressTracker(1, WDatabase.class) ; tracker.startTask(dataFile.length(), "Loading " + getName()) ; BufferedReader input = new BufferedReader(new InputStreamReader(new FileInputStream(dataFile), "UTF-8")) ; long bytesRead = 0 ; int lineNum = 0 ; Database db = getDatabase(false) ; String line ; while ((line=input.readLine()) != null) { bytesRead = bytesRead + line.length() + 1 ; lineNum++ ; CsvRecordInput cri = new CsvRecordInput(new ByteArrayInputStream((line + "\n").getBytes("UTF-8"))) ; WEntry<Integer,DbIntList> entry = deserialiseCsvRecord(cri) ; if (entry != null) { DatabaseEntry k = new DatabaseEntry() ; keyBinding.objectToEntry(entry.getKey(), k) ; DatabaseEntry v = new DatabaseEntry() ; valueBinding.objectToEntry(entry.getValue(), v) ; db.put(null, k, v) ; } tracker.update(bytesRead) ; } input.close(); env.cleanAndCheckpoint() ; getDatabase(true) ; } } ; } /** * Returns a database appropriate for the given {@link DatabaseType} * * @param type {@link DatabaseType#categoryParents}, {@link DatabaseType#articleParents}, {@link DatabaseType#childCategories},{@link DatabaseType#childArticles}, {@link DatabaseType#redirectSourcesByTarget}, {@link DatabaseType#sentenceSplits} * @return see the description of the appropriate DatabaseType */ public WDatabase<Integer,DbIntList> buildIntIntListDatabase(final DatabaseType type) { switch (type) { case categoryParents: case articleParents: case childCategories: case childArticles: case redirectSourcesByTarget: case sentenceSplits: break ; default: throw new IllegalArgumentException(type.name() + " is not a valid DatabaseType for IntIntListDatabase") ; } RecordBinding<DbIntList> keyBinding = new RecordBinding<DbIntList>() { public DbIntList createRecordInstance() { return new DbIntList() ; } } ; return new IntObjectDatabase<DbIntList>( env, type, keyBinding ) { @Override public WEntry<Integer, DbIntList> deserialiseCsvRecord(CsvRecordInput record) throws IOException { Integer k = record.readInt(null) ; DbIntList v = new DbIntList() ; v.deserialize(record) ; return new WEntry<Integer, DbIntList>(k,v) ; } @Override public DbIntList filterCacheEntry(WEntry<Integer,DbIntList> e, WikipediaConfiguration conf) { int key = e.getKey() ; ArrayList<Integer> values = e.getValue().getValues() ; TIntHashSet validIds = conf.getArticlesOfInterest() ; ArrayList<Integer> newValues = null ; switch (type) { case articleParents : case sentenceSplits : case redirectSourcesByTarget : //only cache if key is valid article if (validIds == null || validIds.contains(key)) newValues = values ; break ; case childArticles : //only cache values that are valid articles newValues = new ArrayList<Integer>() ; for (int v:values) { if (validIds == null || validIds.contains(v)) newValues.add(v) ; } default : //cache everything newValues = values ; } if (newValues == null || newValues.size() == 0) return null ; return new DbIntList(newValues) ; } } ; } /** * Returns a database associating integer id of redirect with the id of its target * * @return a database associating integer id of redirect with the id of its target */ public WDatabase<Integer,Integer> buildRedirectTargetBySourceDatabase() { return new IntObjectDatabase<Integer>( env, DatabaseType.redirectTargetBySource, new IntegerBinding() ) { @Override public WEntry<Integer, Integer> deserialiseCsvRecord( CsvRecordInput record) throws IOException { int k = record.readInt(null) ; int v = record.readInt(null) ; return new WEntry<Integer, Integer>(k,v) ; } @Override public Integer filterCacheEntry( WEntry<Integer, Integer> e, WikipediaConfiguration conf ) { TIntHashSet validIds = conf.getArticlesOfInterest() ; if (validIds != null && !validIds.contains(e.getValue())) return null ; return e.getValue(); } } ; } /** * Returns a database associating integer {@link WEnvironment.StatisticName#ordinal()} with the value relevant to this statistic. * * @return a database associating integer {@link WEnvironment.StatisticName#ordinal()} with the value relevant to this statistic. */ public IntObjectDatabase<Long> buildStatisticsDatabase() { return new IntObjectDatabase<Long>( env, DatabaseType.statistics, new LongBinding() ) { @Override public WEntry<Integer, Long> deserialiseCsvRecord( CsvRecordInput record) throws IOException { String statName = record.readString(null) ; Long v = record.readLong(null) ; Integer k = null; try { k = StatisticName.valueOf(statName).ordinal() ; } catch (Exception e) { Logger.getLogger(WDatabaseFactory.class).warn("Ignoring unknown statistic: " + statName) ; return null ; } return new WEntry<Integer, Long>(k,v) ; } @Override public Long filterCacheEntry( WEntry<Integer, Long> e, WikipediaConfiguration conf ) { return e.getValue() ; } } ; } /** * Returns a database associating integer id of page with DbTranslations (language links) * * @return a database associating integer id of page with DbTranslations (language links) */ public WDatabase<Integer,DbTranslations> buildTranslationsDatabase() { return new IntObjectDatabase<DbTranslations>( env, DatabaseType.translations, new RecordBinding<DbTranslations>() { @Override public DbTranslations createRecordInstance() { return new DbTranslations() ; } } ) { @Override public WEntry<Integer, DbTranslations> deserialiseCsvRecord( CsvRecordInput record) throws IOException { int k = record.readInt(null) ; DbTranslations v = new DbTranslations() ; v.deserialize(record) ; return new WEntry<Integer, DbTranslations>(k,v) ; } @Override public DbTranslations filterCacheEntry( WEntry<Integer, DbTranslations> e, WikipediaConfiguration conf ) { TIntHashSet validIds = conf.getArticlesOfInterest() ; if (validIds != null && !validIds.contains(e.getKey())) return null ; return e.getValue(); } } ; } /** * Returns a database associating integer ids with counts of how many pages it links to or that link to it * * @return a database associating integer ids with counts of how many pages it links to or that link to it */ public PageLinkCountDatabase buildPageLinkCountDatabase() { return new PageLinkCountDatabase(env) ; } /** * Returns a database associating integer id of page with its content, in mediawiki markup format * * @return a database associating integer id of page with its content, in mediawiki markup format */ public WDatabase<Integer,String> buildMarkupDatabase() { return new MarkupDatabase(env) ; } }