package org.wikibrain.sr.word2vec; import gnu.trove.map.TIntIntMap; import gnu.trove.map.TIntObjectMap; import gnu.trove.map.hash.TIntIntHashMap; import gnu.trove.map.hash.TIntObjectHashMap; import org.apache.commons.cli.*; import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.wikibrain.conf.ConfigurationException; import org.wikibrain.conf.DefaultOptionBuilder; import org.wikibrain.core.WikiBrainException; import org.wikibrain.core.cmd.Env; import org.wikibrain.core.cmd.EnvBuilder; import org.wikibrain.core.dao.DaoException; import org.wikibrain.core.dao.LocalPageDao; import org.wikibrain.core.dao.RawPageDao; import org.wikibrain.core.dao.UniversalPageDao; import org.wikibrain.core.lang.Language; import org.wikibrain.core.lang.LanguageSet; import org.wikibrain.core.model.LocalPage; import org.wikibrain.core.nlp.Dictionary; import org.wikibrain.download.FileDownloader; import org.wikibrain.phrases.LinkProbabilityDao; import org.wikibrain.sr.SRBuilder; import org.wikibrain.sr.wikify.*; import org.wikibrain.utils.WpIOUtils; import java.io.BufferedWriter; import java.io.Closeable; import java.io.File; import java.io.IOException; import java.net.URL; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Map; import java.util.regex.Matcher; /** * @author Shilad Sen */ public class CorpusCreatorMain { private static final Logger LOG = LoggerFactory.getLogger(CorpusCreatorMain.class); private static final int OPTIMAL_FILE_SIZE = 50 * 1024 * 1024; private final Language lang; private final Env env; private final LocalPageDao pageDao; private final TIntObjectMap<String> shortUrls = new TIntObjectHashMap<String>(); private static final String[][] CORPORA = { { "simple", "http://shilad.com/news.2007.en.shuffled.gz"}, // A Smallish file for testing. { "cs", "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.cs.shuffled.gz"}, { "de", "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.de.shuffled.gz"}, { "en", "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz"}, { "es", "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.es.shuffled.gz"}, { "fr", "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.fr.shuffled.gz"}, { "hi", "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.hi.shuffled.gz"}, { "ru", "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.ru.shuffled.gz"}, { "cs", "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.cs.shuffled.gz"}, { "de", "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.de.shuffled.gz"}, { "en", "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.en.shuffled.gz"}, { "es", "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.es.shuffled.gz"}, { "fr", "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.fr.shuffled.gz"}, { "hi", "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.hi.shuffled.gz"}, { "ru", "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.ru.shuffled.gz"}, }; public CorpusCreatorMain(Env env, Language lang) throws ConfigurationException, DaoException { this.env = env; this.lang = lang; this.pageDao = env.getComponent(LocalPageDao.class); } public void create(String path) throws ConfigurationException, DaoException, WikiBrainException, IOException, InterruptedException { SRBuilder builder = new SRBuilder(env, "prebuiltword2vec", lang); builder.setSkipBuiltMetrics(true); builder.setCreateFakeGoldStandard(true); builder.build(); FileUtils.deleteQuietly(new File(path)); FileUtils.forceMkdir(new File(path)); Corpus c = env.getConfigurator().get(Corpus.class, "wikified", "language", lang.getLangCode()); if (c == null) throw new IllegalStateException("Couldn't find wikified corpus for language " + lang); if (!c.exists()) { c.create(); } RotatingWriter writer = new RotatingWriter( path + "/corpus." + lang.getLangCode() + ".", ".txt", OPTIMAL_FILE_SIZE); RawPageDao rawDao = env.getConfigurator().get(RawPageDao.class); LocalPageDao pageDao = env.getConfigurator().get(LocalPageDao.class); Wikifier wikifier = env.getComponent(Wikifier.class, "websail-final", lang); ((WebSailWikifier)wikifier).setMinFinalScore(0.00001); ((WebSailWikifier)wikifier).setDesiredLinkRecall(0.99); LinkProbabilityDao linkDao = env.getComponent(LinkProbabilityDao.class, lang); // Process the wikipedia corpus File tmp = WpIOUtils.createTempDirectory(lang.getLangCode() + "corpora"); File in = new File(tmp, "wikipedia"); WikiTextCorpusCreator wtc = new WikiTextCorpusCreator(lang, wikifier, rawDao, pageDao, linkDao); wtc.write(in); FileUtils.forceDeleteOnExit(in); WbCorpusLineReader cr = new WbCorpusLineReader(new File(in, "corpus.txt")); for (WbCorpusLineReader.Line line : cr) { processLine(writer, line.getLine(), line.getDocId(), line.getLineNumber(), line.getCharNumber()); } FileUtils.deleteQuietly(in); // Process the online corpora for (String [] info : CORPORA) { if (info[0].equals(lang.getLangCode())) { URL url = new URL(info[1]); String name = new File(url.getFile()).getName(); FileDownloader downloader = new FileDownloader(); in = downloader.download(url, new File(tmp, name)); in.deleteOnExit(); File out = new File(in.toString().replace(".gz", "") + ".wikified"); PlainTextCorpusCreator ptc = new PlainTextCorpusCreator(lang, wikifier, pageDao, linkDao, in); ptc.write(out); WbCorpusLineReader r = new WbCorpusLineReader(new File(out, "corpus.txt")); for (WbCorpusLineReader.Line line : r) { processLine(writer, line.getLine(), -1, -1, -1); } } } FileUtils.deleteQuietly(tmp); writer.close(); } protected String getShortUrl(int wpId) throws IOException { if (wpId < 0) return null; synchronized (shortUrls) { if (shortUrls.containsKey(wpId)) { String url = shortUrls.get(wpId); return url.isEmpty() ? null: url; } } LocalPage page = null; try { page = pageDao.getById(lang, wpId); } catch (DaoException e) { throw new IOException(e); } synchronized (shortUrls) { if (page == null) { shortUrls.put(wpId, ""); return null; } else { String url = page.getCompactUrl(); shortUrls.put(wpId, url); return url; } } } private void processLine(RotatingWriter writer, String line, int pageId, int lineNum, int charNum) throws IOException { List<String> words = new ArrayList<String>(); for (String word : line.split(" +")) { int mentionStart = word.indexOf(":/w/"); if (mentionStart >= 0) { Matcher m = Dictionary.PATTERN_MENTION.matcher(word.substring(mentionStart)); if (m.matches()) { List<String> parts = new ArrayList<String>(); parts.add(makeWordToken(word.substring(0, mentionStart))); int wpId2 = Integer.valueOf(m.group(3)); if (wpId2 >= 0) { parts.add(word.substring(mentionStart + 1)); } words.addAll(parts); } else { words.add(makeWordToken(word)); } } else { words.add(makeWordToken(word)); } } List<String> labels = new ArrayList<String>(); String url = getShortUrl(pageId); if (url != null) labels.add(url); writer.write(lineNum + "\t" + charNum + "\t"); writer.write(StringUtils.join(labels, " ") + "\t"); writer.write(StringUtils.join(words, " ") + "\n"); } private String makeWordToken(String word) { return word; } static class RotatingWriter implements Closeable { private final String prefix; private final String suffix; private final int maxBytes; private int fileNum = 0; private int numBytes = 0; private BufferedWriter writer = null; RotatingWriter(String prefix, String suffix, int maxBytes) { this.prefix = prefix; this.suffix = suffix; this.maxBytes = maxBytes; } void write(String text) throws IOException { possiblyRotateWriter(); numBytes += text.getBytes("UTF-8").length; writer.write(text); } private void possiblyRotateWriter() throws IOException { if (writer == null || numBytes >= maxBytes) { if (writer != null) { close(); fileNum++; numBytes = 0; } writer = WpIOUtils.openWriter(String.format("%s%05d%s", prefix, fileNum, suffix)); } } @Override public void close() throws IOException { if (writer != null) { IOUtils.closeQuietly(writer); writer = null; } } } public static void main(String args[]) throws ConfigurationException, DaoException, IOException, WikiBrainException, InterruptedException { Options options = new Options(); options.addOption( new DefaultOptionBuilder() .hasArg() .isRequired() .withLongOpt("output") .withDescription("corpus output directory (existing data will be lost)") .create("o")); EnvBuilder.addStandardOptions(options); CommandLineParser parser = new PosixParser(); CommandLine cmd; try { cmd = parser.parse(options, args); } catch (ParseException e) { System.err.println( "Invalid option usage: " + e.getMessage()); new HelpFormatter().printHelp("UniversalWord2VecMain", options); return; } Env env = new EnvBuilder(cmd).build(); for (Language l : env.getLanguages()) { try { LOG.info("Generating corpus for language " + l); CorpusCreatorMain creator = new CorpusCreatorMain(env, l); String path = cmd.getOptionValue("o") + "/" + l.getLangCode(); File file = new File(path); if (!file.isDirectory() || file.list().length == 0) { creator.create(path); } } catch (Exception e) { LOG.warn("Generation of corpus for language " + l + " failed", e); } } } }