package org.wikibrain.sr.wikify;
import org.apache.commons.cli.*;
import org.wikibrain.conf.ConfigurationException;
import org.wikibrain.conf.DefaultOptionBuilder;
import org.wikibrain.core.cmd.Env;
import org.wikibrain.core.cmd.EnvBuilder;
import org.wikibrain.core.dao.DaoException;
import org.wikibrain.core.dao.DaoFilter;
import org.wikibrain.core.dao.LocalPageDao;
import org.wikibrain.core.dao.RawPageDao;
import org.wikibrain.core.lang.Language;
import org.wikibrain.core.model.RawPage;
import org.wikibrain.phrases.LinkProbabilityDao;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.NoSuchElementException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Creates a plain-text corpus from the raw wikitext of articles in a single
* language edition. Articles are read through a {@link RawPageDao}, their plain
* text is extracted, and {@link BaseCorpusCreator} writes the resulting corpus.
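*
* <p>Programmatic usage (a minimal sketch; it assumes an already-configured
* WikiBrain environment and mirrors the component lookups in {@link #main}):
* <pre>{@code
* Env env = new EnvBuilder().build();
* Language lang = env.getLanguages().getDefaultLanguage();
* WikiTextCorpusCreator creator = new WikiTextCorpusCreator(
*         lang,
*         env.getComponent(Wikifier.class, lang),
*         env.getConfigurator().get(RawPageDao.class),
*         env.getConfigurator().get(LocalPageDao.class),
*         env.getComponent(LinkProbabilityDao.class, lang));
* creator.write(new File("corpus"));
* }</pre>
*
* @author Shilad Sen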
*/
public class WikiTextCorpusCreator extends BaseCorpusCreator {
private static final Logger LOG = LoggerFactory.getLogger(WikiTextCorpusCreator.class);
private final Language language;
private final RawPageDao dao;
private int maxPages = Integer.MAX_VALUE;
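/**
* @param language language edition whose articles form the corpus
* @param wikifier wikifier passed through to {@link BaseCorpusCreator}
* @param dao source of the raw article markup
* @param lpd local page dao passed through to {@link BaseCorpusCreator}
* @param probabilityDao link probability dao passed through to {@link BaseCorpusCreator}
*/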
public WikiTextCorpusCreator(Language language, Wikifier wikifier, RawPageDao dao, LocalPageDao lpd, LinkProbabilityDao probabilityDao) {
super(language, lpd, wikifier, probabilityDao);
this.language = language;
this.dao = dao;
}
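/**
* Caps the number of articles requested from the dao; defaults to all articles.
*/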
public void setMaxPages(int maxPages) {
this.maxPages = maxPages;
}
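/**
* Returns an iterator over the plain text of up to {@code maxPages} articles in
* the configured language, skipping redirects and disambiguation pages.
*/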
@Override
public Iterator<IdAndText> getCorpus() throws DaoException {
DaoFilter filter = new DaoFilter()
.setRedirect(false)
.setDisambig(false)
.setLanguages(language)
.setLimit(maxPages);
Iterator<RawPage> iter = dao.get(filter).iterator();
return new RawPageTextIterator(iter);
}
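/**
* Adapts an iterator over {@link RawPage}s into an iterator over {@link IdAndText}.
* Plain text is extracted from each page, and pages whose text is empty or cannot
* be extracted are skipped. A one-element lookahead buffer lets {@code hasNext()}
* report whether a usable page remains.
*/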
public static class RawPageTextIterator implements Iterator<IdAndText> {
private final Iterator<RawPage> iter;
private IdAndText buffer = null;
public RawPageTextIterator(Iterator<RawPage> iter) {
this.iter = iter;
this.fillBuffer();
}
@Override
public boolean hasNext() {
return (buffer != null);
}
@Override
public IdAndText next() {
if (buffer == null) {
throw new NoSuchElementException();
}
IdAndText result = buffer;
buffer = null;
fillBuffer();
return result;
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
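// Advance the underlying iterator until a page with non-empty plain text is
// found and stored in the buffer, or the iterator is exhausted.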
private void fillBuffer() {
while (buffer == null && iter.hasNext()) {
RawPage rp = iter.next();
if (rp != null) {
try {
String text = rp.getPlainText(false);
if (text != null && text.trim().length() > 0) {
buffer = new IdAndText(rp.getLocalId(), text.trim());
}
} catch (Exception e) {
LOG.warn("Error when extracting text from: " + rp.getTitle(), e);
}
}
}
}
}
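/**
* Command-line entry point. Requires -o/--output for the corpus output directory,
* accepts -x/--max-articles to cap the number of articles processed, and honors
* the standard WikiBrain options added by {@link EnvBuilder#addStandardOptions}.
*/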
public static void main(String[] args) throws ConfigurationException, IOException, DaoException {
Options options = new Options();
options.addOption(
new DefaultOptionBuilder()
.hasArg()
.isRequired()
.withLongOpt("output")
.withDescription("corpus output directory (existing data will be lost)")
.create("o"));
options.addOption(
new DefaultOptionBuilder()
.hasArg()
.withLongOpt("max-articles")
.withDescription("Maximum number of articles to process")
.create("x"));
EnvBuilder.addStandardOptions(options);
CommandLineParser parser = new PosixParser();
CommandLine cmd;
try {
cmd = parser.parse(options, args);
} catch (ParseException e) {
System.err.println("Invalid option usage: " + e.getMessage());
new HelpFormatter().printHelp("WikiTextCorpusCreator", options);
return;
}
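// Build the WikiBrain environment from the command line and resolve the
// components the corpus creator needs for the default language.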
Env env = new EnvBuilder(cmd).build();
RawPageDao rpd = env.getConfigurator().get(RawPageDao.class);
LocalPageDao lpd = env.getConfigurator().get(LocalPageDao.class);
Language lang = env.getLanguages().getDefaultLanguage();
LinkProbabilityDao linkProbabilityDao = env.getComponent(LinkProbabilityDao.class, lang);
Wikifier wikifier = env.getComponent(Wikifier.class, lang);
WikiTextCorpusCreator creator = new WikiTextCorpusCreator(lang, wikifier, rpd, lpd, linkProbabilityDao);
if (cmd.hasOption("x")) {
creator.setMaxPages(Integer.valueOf(cmd.getOptionValue("x")));
}
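// Write the corpus to the output directory; any existing data there is lost.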
File output = new File(cmd.getOptionValue("o"));
creator.write(output);
}
}