package org.wikibrain.sr.wikify;

import org.apache.commons.cli.*;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.LineIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.wikibrain.conf.ConfigurationException;
import org.wikibrain.conf.DefaultOptionBuilder;
import org.wikibrain.core.cmd.Env;
import org.wikibrain.core.cmd.EnvBuilder;
import org.wikibrain.core.dao.DaoException;
import org.wikibrain.core.dao.DaoFilter;
import org.wikibrain.core.dao.LocalPageDao;
import org.wikibrain.core.dao.RawPageDao;
import org.wikibrain.core.lang.Language;
import org.wikibrain.core.model.RawPage;
import org.wikibrain.phrases.LinkProbabilityDao;
import org.wikibrain.utils.WpIOUtils;

import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.NoSuchElementException;

/**
 * Corpus creator whose documents come from a plain text file: every line of
 * the input file is handed to the base class as one {@code IdAndText} with an
 * id of {@code -1}.
 *
 * @author Shilad Sen
 */
public class PlainTextCorpusCreator extends BaseCorpusCreator {
    private static final Logger LOG = LoggerFactory.getLogger(PlainTextCorpusCreator.class);

    /** Plain text input file; verified to be a regular file at construction time. */
    private final File file;

    /** Upper bound on the number of lines returned by {@link #getCorpus()}. */
    private int maxPages = Integer.MAX_VALUE;

    /**
     * Creates a corpus creator that reads from {@code inputFile}.
     *
     * @param language       language passed through to the base corpus creator
     * @param wikifier       wikifier passed through to the base corpus creator
     * @param lpd            local page dao passed through to the base corpus creator
     * @param probabilityDao link probability dao passed through to the base corpus creator
     * @param inputFile      plain text file to read, one document per line
     * @throws IllegalArgumentException if {@code inputFile} is not an existing regular file
     */
    public PlainTextCorpusCreator(Language language, Wikifier wikifier, LocalPageDao lpd,
                                  LinkProbabilityDao probabilityDao, File inputFile) {
        super(language, lpd, wikifier, probabilityDao);
        this.file = inputFile;
        if (!file.isFile()) {
            throw new IllegalArgumentException("Plaintext corpus " + file + " does not exist");
        }
    }

    /**
     * Limits how many lines of the input file {@link #getCorpus()} will return.
     * Each line counts as one page. Values less than or equal to zero produce an
     * empty corpus. The default is {@link Integer#MAX_VALUE} (no practical limit).
     *
     * @param maxPages maximum number of lines to return
     */
    public void setMaxPages(int maxPages) {
        this.maxPages = maxPages;
    }

    /**
     * Opens the input file and returns an iterator with one entry per line,
     * capped at the limit configured through {@link #setMaxPages(int)}.
     * The underlying reader is closed once the iterator reports that it has no
     * more elements; an iterator that is abandoned early is not closed.
     *
     * @return iterator over the lines of the input file
     * @throws DaoException if the input file cannot be opened
     */
    @Override
    public Iterator<IdAndText> getCorpus() throws DaoException {
        try {
            return new ClosingLineIterator(
                    IOUtils.lineIterator(WpIOUtils.openReader(file)),
                    maxPages);
        } catch (IOException e) {
            throw new DaoException(e);
        }
    }

    /**
     * Adapts a {@link LineIterator} to an iterator of {@code IdAndText} that
     * closes the underlying line iterator as soon as it is exhausted or an
     * optional line limit has been reached.
     */
    public static class ClosingLineIterator implements Iterator<IdAndText> {
        /** Underlying line source; set to {@code null} once it has been closed. */
        private LineIterator iter;

        /** Maximum number of lines this iterator will return. */
        private final int maxLines;

        /** Number of lines returned so far. */
        private int returned = 0;

        /**
         * Wraps {@code iter} without any limit on the number of lines.
         *
         * @param iter line iterator to wrap; closed by this object when exhausted
         */
        public ClosingLineIterator(LineIterator iter) {
            this(iter, Integer.MAX_VALUE);
        }

        /**
         * Wraps {@code iter}, returning at most {@code maxLines} lines.
         *
         * @param iter     line iterator to wrap; closed by this object when exhausted
         *                 or when the limit is reached
         * @param maxLines maximum number of lines to return; values less than or
         *                 equal to zero make the iterator empty
         */
        public ClosingLineIterator(LineIterator iter, int maxLines) {
            this.iter = iter;
            this.maxLines = maxLines;
        }

        /**
         * Reports whether another line is available. The first time this would
         * return {@code false} the underlying line iterator is closed.
         */
        @Override
        public boolean hasNext() {
            LineIterator i = iter;
            if (i == null) {
                return false;
            }
            if (returned < maxLines && i.hasNext()) {
                return true;
            }
            // Exhausted or limit reached: release the reader exactly once.
            i.close();
            iter = null;
            return false;
        }

        /**
         * Returns the next line wrapped in an {@code IdAndText} with id {@code -1}.
         *
         * @throws NoSuchElementException if there are no more lines to return
         */
        @Override
        public IdAndText next() {
            // Guarding with hasNext() avoids dereferencing the nulled-out iterator
            // and also enforces the line limit for callers that skip hasNext().
            if (!hasNext()) {
                throw new NoSuchElementException("No more lines");
            }
            returned++;
            return new IdAndText(-1, iter.next());
        }

        /** Removal is not supported. */
        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }
    }

    /**
     * Command line entry point. Requires {@code -i} (input file) and {@code -o}
     * (output directory); {@code -x} optionally caps the number of input lines
     * processed. Standard environment options are added by {@link EnvBuilder}.
     *
     * @param args command line arguments
     * @throws ConfigurationException if the environment or a component cannot be configured
     * @throws IOException            declared for I/O failures while writing the corpus
     * @throws DaoException           if the corpus cannot be read or written
     */
    public static void main(String[] args) throws ConfigurationException, IOException, DaoException {
        Options options = new Options();
        options.addOption(
                new DefaultOptionBuilder()
                        .hasArg()
                        .isRequired()
                        .withLongOpt("input")
                        .withDescription("input plain text corpus file (one document per line)")
                        .create("i"));
        options.addOption(
                new DefaultOptionBuilder()
                        .hasArg()
                        .isRequired()
                        .withLongOpt("output")
                        .withDescription("corpus output directory (existing data will be lost)")
                        .create("o"));
        options.addOption(
                new DefaultOptionBuilder()
                        .hasArg()
                        .withLongOpt("max-articles")
                        .withDescription("Maximum number of articles to process")
                        .create("x"));

        EnvBuilder.addStandardOptions(options);

        CommandLineParser parser = new PosixParser();
        CommandLine cmd;
        try {
            cmd = parser.parse(options, args);
        } catch (ParseException e) {
            System.err.println("Invalid option usage: " + e.getMessage());
            new HelpFormatter().printHelp("PlainTextCorpusCreator", options);
            return;
        }

        // Validate the numeric option before building the environment so that a
        // typo is reported as a usage error rather than an uncaught exception.
        int maxPages = Integer.MAX_VALUE;
        if (cmd.hasOption("x")) {
            try {
                maxPages = Integer.parseInt(cmd.getOptionValue("x"));
            } catch (NumberFormatException e) {
                System.err.println(
                        "Invalid option usage: max-articles must be an integer, got '"
                                + cmd.getOptionValue("x") + "'");
                new HelpFormatter().printHelp("PlainTextCorpusCreator", options);
                return;
            }
        }

        Env env = new EnvBuilder(cmd).build();
        LocalPageDao lpd = env.getConfigurator().get(LocalPageDao.class);
        Language lang = env.getLanguages().getDefaultLanguage();
        Wikifier wikifier = env.getComponent(Wikifier.class, lang);
        LinkProbabilityDao linkProbabilityDao = env.getComponent(LinkProbabilityDao.class, lang);

        PlainTextCorpusCreator creator = new PlainTextCorpusCreator(
                lang, wikifier, lpd, linkProbabilityDao, new File(cmd.getOptionValue("i")));
        if (cmd.hasOption("x")) {
            creator.setMaxPages(maxPages);
        }

        File output = new File(cmd.getOptionValue("o"));
        creator.write(output);
    }
}