package org.wikibrain.loader;

import org.apache.commons.cli.*;
import org.apache.commons.io.IOUtils;
import org.wikibrain.conf.ConfigurationException;
import org.wikibrain.conf.Configurator;
import org.wikibrain.conf.DefaultOptionBuilder;
import org.wikibrain.core.WikiBrainException;
import org.wikibrain.core.cmd.Env;
import org.wikibrain.core.cmd.EnvBuilder;
import org.wikibrain.core.dao.DaoException;
import org.wikibrain.core.dao.DaoFilter;
import org.wikibrain.core.dao.MetaInfoDao;
import org.wikibrain.core.dao.RawPageDao;
import org.wikibrain.core.lang.Language;
import org.wikibrain.core.lang.LanguageSet;
import org.wikibrain.core.model.NameSpace;
import org.wikibrain.core.model.RawPage;
import org.wikibrain.lucene.LuceneIndexer;
import org.wikibrain.lucene.LuceneOptions;
import org.wikibrain.lucene.LuceneSearcher;
import org.wikibrain.utils.WpThreadUtils;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.TimeUnit;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * This loader indexes raw pages into the Lucene index.
 * It should not be run before the WikiTextLoader; where exactly it belongs
 * after that point in the loading pipeline is not fixed.
 *
 * @author Ari Weiland
 */
public class LuceneLoader {
    private static final Logger LOG = LoggerFactory.getLogger(LuceneLoader.class);

    // Sentinel page used to tell worker threads that the queue is exhausted.
    private static final RawPage POISON_PILL = new RawPage(0, 0, "", null, null, Language.getByLangCode("en"), null);

    // maximum number of raw pages in the parsing buffer
    public static final int MAX_QUEUE = 1000;

    private final RawPageDao rawPageDao;
    private final Collection<NameSpace> namespaces;
    private final BlockingQueue<RawPage> queue = new ArrayBlockingQueue<RawPage>(MAX_QUEUE);
    private final List<Thread> workers = new ArrayList<Thread>();
    private final MetaInfoDao metaDao;
    private final LuceneOptions[] luceneOptions;
    private LuceneIndexer luceneIndexer;

    public LuceneLoader(RawPageDao rawPageDao, MetaInfoDao metaDao, LuceneOptions[] luceneOptions, Collection<NameSpace> namespaces) {
        this.rawPageDao = rawPageDao;
        this.metaDao = metaDao;
        this.luceneOptions = luceneOptions;
        this.namespaces = namespaces;
    }
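    /*
     * Threading model: load() acts as the producer, streaming RawPages from
     * the dao into the bounded queue, while a pool of Worker threads (one per
     * available processor, per WpThreadUtils.getMaxThreads()) drains the
     * queue and hands each page to the LuceneIndexer. The POISON_PILL
     * sentinel placed on the queue signals the workers that the scan is done.
     */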
    /**
     * Indexes all raw pages of the given language in the configured namespaces.
     * NOTE: only one language can be loaded at a time.
     *
     * @param language the language to index
     * @throws WikiBrainException
     * @throws ConfigurationException
     */
    public synchronized void load(Language language) throws WikiBrainException, ConfigurationException {
        try {
            createWorkers();
            DaoFilter filter = new DaoFilter()
                    .setLanguages(language)
                    .setNameSpaces(namespaces)
                    .setRedirect(false);
            int n = rawPageDao.getCount(filter);
            int i = 0;
            luceneIndexer = new LuceneIndexer(language, luceneOptions);
            for (RawPage rawPage : rawPageDao.get(filter)) {
                queue.put(rawPage);
                if (++i % 1000 == 0) {
                    LOG.info("RawPages indexed " + language + ": " + i + " of " + n);
                }
            }
            queue.put(POISON_PILL);
        } catch (DaoException e) {
            throw new WikiBrainException(e);
        } catch (InterruptedException e) {
            throw new WikiBrainException(e);
        } finally {
            cleanupWorkers();
            queue.clear();
            if (luceneIndexer != null) {
                IOUtils.closeQuietly(luceneIndexer);
                luceneIndexer = null;
            }
        }
    }

    public void endLoad() {
        if (luceneIndexer != null) {
            luceneIndexer.close();
        }
    }

    private void createWorkers() {
        workers.clear();
        for (int i = 0; i < WpThreadUtils.getMaxThreads(); i++) {
            Thread t = new Thread(new Worker());
            t.start();
            workers.add(t);
        }
    }

    private void cleanupWorkers() {
        // Give workers up to two minutes to finish, then interrupt any stragglers.
        long maxMillis = System.currentTimeMillis() + 2 * 60 * 1000;
        for (Thread w : workers) {
            try {
                w.join(Math.max(0, maxMillis - System.currentTimeMillis()));
            } catch (InterruptedException e) {
                LOG.info("ignoring interrupted exception on thread join", e);
            }
        }
        for (Thread w : workers) {
            w.interrupt();
        }
        workers.clear();
    }

    private class Worker implements Runnable {
        public Worker() { }

        @Override
        public void run() {
            boolean finished = false;
            while (!finished) {
                RawPage rp = null;
                Language lang = null;
                try {
                    rp = queue.poll(100, TimeUnit.MILLISECONDS);
                    if (rp == POISON_PILL) {
                        // Re-enqueue the sentinel so the remaining workers also shut down.
                        queue.put(rp);
                        finished = true;
                    } else if (rp != null) {
                        lang = rp.getLanguage();
                        luceneIndexer.indexPage(rp);
                        metaDao.incrementRecords(LuceneSearcher.class, lang);
                    }
                } catch (InterruptedException e) {
                    LOG.warn("LuceneLoader.Worker received interrupt.");
                    return;
                } catch (Exception e) {
                    metaDao.incrementErrorsQuietly(LuceneSearcher.class, lang);
                    String title = "unknown";
                    if (rp != null) {
                        title = rp.getTitle().toString();
                    }
                    LOG.warn("exception while parsing " + title, e);
                }
            }
        }
    }
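    /*
     * Command-line entry point. A hypothetical invocation (the language and
     * configuration flags come from EnvBuilder.addStandardOptions, so the
     * exact flag names and namespace spelling below are assumptions):
     *
     *   org.wikibrain.loader.LuceneLoader -l en -i plaintext,esa -p ARTICLE
     *
     * With no -i option, the "plaintext" and "esa" indexes are built; with no
     * -p option, the namespaces default to those of the first LuceneOptions.
     */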
    public static void main(String[] args) throws ConfigurationException, WikiBrainException, IOException, DaoException {
        Options options = new Options();
        options.addOption(
                new DefaultOptionBuilder()
                        .withLongOpt("drop-indexes")
                        .withDescription("drop and recreate all indexes")
                        .create("d"));
        options.addOption(
                new DefaultOptionBuilder()
                        .hasArgs()
                        .withValueSeparator(',')
                        .withLongOpt("namespaces")
                        .withDescription("the set of namespaces to index, separated by commas")
                        .create("p"));
        options.addOption(
                new DefaultOptionBuilder()
                        .hasArgs()
                        .withValueSeparator(',')
                        .withLongOpt("indexes")
                        .withDescription("the types of indexes to store, separated by commas")
                        .create("i"));
        EnvBuilder.addStandardOptions(options);

        CommandLineParser parser = new PosixParser();
        CommandLine cmd;
        try {
            cmd = parser.parse(options, args);
        } catch (ParseException e) {
            System.err.println("Invalid option usage: " + e.getMessage());
            new HelpFormatter().printHelp("LuceneLoader", options);
            return;
        }

        Env env = new EnvBuilder(cmd).build();
        Configurator conf = env.getConfigurator();

        LuceneOptions[] luceneOptions;
        if (cmd.hasOption("i")) {
            String[] optionType = cmd.getOptionValues("i");
            luceneOptions = new LuceneOptions[optionType.length];
            for (int i = 0; i < optionType.length; i++) {
                luceneOptions[i] = conf.get(LuceneOptions.class, optionType[i]);
            }
        } else {
            luceneOptions = new LuceneOptions[] {
                    conf.get(LuceneOptions.class, "plaintext"),
                    conf.get(LuceneOptions.class, "esa")
            };
        }

        LanguageSet languages = env.getLanguages();

        Collection<NameSpace> namespaces = new ArrayList<NameSpace>();
        if (cmd.hasOption("p")) {
            String[] nsStrings = cmd.getOptionValues("p");
            for (String s : nsStrings) {
                namespaces.add(NameSpace.getNameSpaceByName(s));
            }
        } else {
            namespaces = luceneOptions[0].namespaces;
        }

        RawPageDao rawPageDao = conf.get(RawPageDao.class);
        MetaInfoDao metaDao = conf.get(MetaInfoDao.class);
        metaDao.beginLoad();
        for (Language lang : languages) {
            metaDao.clear(LuceneSearcher.class, lang);
        }

        final LuceneLoader loader = new LuceneLoader(rawPageDao, metaDao, luceneOptions, namespaces);

        LOG.info("Begin indexing");
        for (Language lang : languages) {
            loader.load(lang);
        }
        loader.endLoad();
        metaDao.endLoad();
        LOG.info("Done indexing");
    }
}