package org.wikibrain.loader;
import gnu.trove.impl.Constants;
import gnu.trove.map.hash.TIntIntHashMap;
import org.apache.commons.cli.*;
import org.wikibrain.conf.ConfigurationException;
import org.wikibrain.conf.Configurator;
import org.wikibrain.conf.DefaultOptionBuilder;
import org.wikibrain.core.cmd.Env;
import org.wikibrain.core.cmd.EnvBuilder;
import org.wikibrain.core.dao.*;
import org.wikibrain.core.dao.sql.WpDataSource;
import org.wikibrain.core.lang.Language;
import org.wikibrain.core.lang.LanguageInfo;
import org.wikibrain.core.model.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Loads redirects using the following flow of parsing:
 * - First load every redirect page id -> page id mapping into memory (TIntIntHashMap).
 * - Resolve chained redirects in memory.
 * - Then save the resolved mappings.
 * - The idea behind this flow is that RedirectSqlDao.update goes away.
 */
public class RedirectLoader {
private static final Logger LOG = LoggerFactory.getLogger(RedirectLoader.class);
private final MetaInfoDao metaDao;
private TIntIntHashMap redirectIdsToPageIds;
private final RawPageDao rawPages;
private final LocalPageDao localPages;
private final RedirectDao redirects;
public RedirectLoader(RawPageDao rpdao, LocalPageDao lpdao, RedirectDao rdao, MetaInfoDao metaDao) throws DaoException{
this.rawPages = rpdao;
this.localPages = lpdao;
lpdao.setFollowRedirects(false);
this.redirects = rdao;
this.metaDao = metaDao;
}
public RedirectDao getDao() {
return redirects;
}
private void loadRedirectIdsIntoMemory(Language language) throws DaoException{
redirectIdsToPageIds = new TIntIntHashMap(Constants.DEFAULT_CAPACITY, Constants.DEFAULT_LOAD_FACTOR, -1, -1);
int i = 0;
LOG.info("Begin loading redirects into memory: ");
for (RawPage p : rawPages.get(new DaoFilter().setLanguages(language).setRedirect(true))) {
Title pTitle = new Title(p.getRedirectTitle(), LanguageInfo.getByLanguage(language));
redirectIdsToPageIds.put(p.getLocalId(),
localPages.getIdByTitle(pTitle.getCanonicalTitle(), language, pTitle.getNamespace()));
if(i%100000==0)
LOG.info("loading redirect # " + i);
i++;
}
LOG.info("End loading redirects into memory.");
}
private int resolveRedirect(int src){
int dest = redirectIdsToPageIds.get(src);
for(int i = 0; i<4; i++){
if (redirectIdsToPageIds.get(dest) == -1)
return dest;
dest = redirectIdsToPageIds.get(dest);
}
return -1;
}
private void resolveRedirectsInMemory(){
int i = 0;
for (int src : redirectIdsToPageIds.keys()) {
redirectIdsToPageIds.put(src, resolveRedirect(src));
if(i%10000==0)
LOG.info("resolving redirect # " + i);
i++;
}
}
private void loadRedirectsIntoDatabase(Language language) throws DaoException{
int i = 0;
LOG.info("Begin loading redirects into database: ");
for(int src : redirectIdsToPageIds.keys()){
if(i%10000==0)
LOG.info("loaded " + i + " into database.");
redirects.save(language, src, redirectIdsToPageIds.get(src));
metaDao.incrementRecords(Redirect.class, language);
i++;
}
LOG.info("End loading redirects into database.");
}
public static void main(String args[]) throws ConfigurationException, DaoException {
Options options = new Options();
options.addOption(
new DefaultOptionBuilder()
.withLongOpt("drop-tables")
.withDescription("drop and recreate all tables")
.create("d"));
EnvBuilder.addStandardOptions(options);
CommandLineParser parser = new PosixParser();
CommandLine cmd;
try {
cmd = parser.parse(options, args);
} catch (ParseException e) {
System.err.println( "Invalid option usage: " + e.getMessage());
new HelpFormatter().printHelp("DumpLoader", options);
return;
}
Env env = new EnvBuilder(cmd).build();
Configurator conf = env.getConfigurator();
MetaInfoDao metaDao = conf.get(MetaInfoDao.class);
LocalPageDao pageDao = conf.get(LocalPageDao.class);
RedirectLoader redirectLoader = new RedirectLoader(
conf.get(RawPageDao.class),
pageDao,
conf.get(RedirectDao.class),
metaDao
);
if (cmd.hasOption("d")){
LOG.info("Clearing data provider: ");
redirectLoader.getDao().clear();
metaDao.clear(Redirect.class);
}
LOG.info("Begin Load: ");
redirectLoader.getDao().beginLoad();
metaDao.beginLoad();
for(Language l : env.getLanguages()){
LOG.info("LOADING REDIRECTS FOR " + l);
redirectLoader.loadRedirectIdsIntoMemory(l);
redirectLoader.resolveRedirectsInMemory();
redirectLoader.loadRedirectsIntoDatabase(l);
}
redirectLoader.getDao().endLoad();
metaDao.endLoad();
LOG.info("triggering page title cache creation...");
pageDao.setFollowRedirects(true);
LocalPage page = pageDao.getByTitle(env.getDefaultLanguage(), NameSpace.ARTICLE, "FooBar");
}
}