package org.wikibrain.loader; import org.apache.commons.cli.*; import org.apache.commons.cli.ParseException; import org.apache.commons.io.FileUtils; import org.wikibrain.conf.Configuration; import org.wikibrain.conf.ConfigurationException; import org.wikibrain.conf.Configurator; import org.wikibrain.conf.DefaultOptionBuilder; import org.wikibrain.core.WikiBrainException; import org.wikibrain.core.cmd.Env; import org.wikibrain.core.cmd.EnvBuilder; import org.wikibrain.core.cmd.FileMatcher; import org.wikibrain.core.dao.DaoException; import org.wikibrain.core.dao.MetaInfoDao; import org.wikibrain.core.dao.UniversalPageDao; import org.wikibrain.core.dao.sql.WpDataSource; import org.wikibrain.core.lang.Language; import org.wikibrain.core.lang.LanguageSet; import org.wikibrain.core.model.UniversalPage; import org.wikibrain.download.DumpFileDownloader; import org.wikibrain.download.RequestedLinkGetter; import org.wikibrain.mapper.ConceptMapper; import java.io.File; import java.io.IOException; import java.sql.SQLException; import java.util.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * * Loads an Iterable of mapped concepts (Universal Pages) into a database. * * @author Ari Weiland * */ public class ConceptLoader { private static final Logger LOG = LoggerFactory.getLogger(ConceptLoader.class); private final LanguageSet languageSet; private final UniversalPageDao dao; private final MetaInfoDao metaDao; public ConceptLoader(LanguageSet languageSet, UniversalPageDao dao, MetaInfoDao metaDao) { this.languageSet = languageSet; this.dao = dao; this.metaDao = metaDao; } public UniversalPageDao getDao() { return dao; } public void load(ConceptMapper mapper) throws ConfigurationException, WikiBrainException { try { LOG.info("Loading Concepts"); Iterator<UniversalPage> pages = mapper.getConceptMap(languageSet); int i = 0; while (pages.hasNext()) { dao.save(pages.next()); i++; if (i%10000 == 0) LOG.info("UniversalPages loaded: " + i); metaDao.incrementRecords(UniversalPage.class); } LOG.info("All UniversalPages loaded: " + i); } catch (DaoException e) { metaDao.incrementErrorsQuietly(UniversalPage.class); throw new WikiBrainException(e); } } public static void downloadWikidataLinks(Configuration conf) throws IOException, WikiBrainException, java.text.ParseException, InterruptedException { List<File> paths = Env.getFiles(Language.WIKIDATA, FileMatcher.WIKIDATA_ITEMS, conf); if (paths.isEmpty()) { File dumpFile = File.createTempFile("wikiapidia", "items"); dumpFile.deleteOnExit(); LOG.info("downloading wikidata items file"); RequestedLinkGetter getter = new RequestedLinkGetter( Language.WIKIDATA, Arrays.asList(FileMatcher.WIKIDATA_ITEMS), new Date() ); FileUtils.writeLines(dumpFile, getter.getLangLinks()); // Fetch the file (if necessary) to the standard path String filePath = conf.get().getString("download.path"); DumpFileDownloader downloader = new DumpFileDownloader(new File(filePath)); downloader.downloadFrom(dumpFile); } } public static void main(String args[]) throws ClassNotFoundException, SQLException, IOException, ConfigurationException, WikiBrainException, DaoException, java.text.ParseException, InterruptedException { Options options = new Options(); options.addOption( new DefaultOptionBuilder() .withLongOpt("drop-tables") .withDescription("drop and recreate all tables") .create("d")); options.addOption( new DefaultOptionBuilder() .hasArg() .withLongOpt("algorithm") .withDescription("algorithm") .create("n")); EnvBuilder.addStandardOptions(options); CommandLineParser parser = new PosixParser(); CommandLine cmd; try { cmd = parser.parse(options, args); } catch (ParseException e) { System.err.println( "Invalid option usage: " + e.getMessage()); new HelpFormatter().printHelp("ConceptLoader", options); return; } Env env = new EnvBuilder(cmd).build(); Configurator conf = env.getConfigurator(); String algorithm = cmd.getOptionValue("n", null); UniversalPageDao dao = conf.get(UniversalPageDao.class, algorithm); MetaInfoDao metaDao = conf.get(MetaInfoDao.class); // TODO: handle checking of purewikidata more robustly if (algorithm == null || algorithm.equals("purewikidata")) { downloadWikidataLinks(env.getConfiguration()); } ConceptMapper mapper = conf.get(ConceptMapper.class, algorithm); final ConceptLoader loader = new ConceptLoader(env.getLanguages(), dao, metaDao); if (cmd.hasOption("d")) { LOG.info("Clearing data"); dao.clear(); } LOG.info("Begin Load"); dao.beginLoad(); loader.load(mapper); LOG.info("End Load"); dao.endLoad(); LOG.info("DONE"); } }