package org.wikibrain.sr.wikify;
import com.typesafe.config.Config;
import org.apache.commons.io.FileUtils;
import org.wikibrain.conf.Configuration;
import org.wikibrain.conf.ConfigurationException;
import org.wikibrain.conf.Configurator;
import org.wikibrain.core.cmd.Env;
import org.wikibrain.core.dao.DaoException;
import org.wikibrain.core.dao.LocalPageDao;
import org.wikibrain.core.dao.RawPageDao;
import org.wikibrain.core.lang.Language;
import org.wikibrain.phrases.AnchorTextPhraseAnalyzer;
import org.wikibrain.phrases.LinkProbabilityDao;
import org.wikibrain.phrases.PhraseAnalyzer;
import org.wikibrain.phrases.PhraseAnalyzerDao;
import org.wikibrain.sr.word2vec.Word2Phrase;
import java.io.File;
import java.io.IOException;
import java.util.Map;
/**
 * An on-disk, per-language text corpus located in a single directory.
 *
 * <p>The corpus is considered present when both {@code corpus.txt} and
 * {@code dictionary.txt} exist in that directory (see {@link #exists()}).
 * {@link #create()} rebuilds the directory from scratch by delegating to
 * {@link WikiTextCorpusCreator}.
 *
 * @author Shilad Sen
 */
public class Corpus {
    private final Language language;
    private final File directory;
    private final Wikifier wikifer;
    private final RawPageDao rawPageDao;
    private final LocalPageDao localPageDao;
    // Stored for callers/subsystems outside the code visible here; not read by this class.
    private final PhraseAnalyzerDao phraseAnalyzerDao;
    private final LinkProbabilityDao linkProbabilityDao;

    /**
     * Creates a handle on a corpus; nothing is read from or written to disk here.
     *
     * @param language           language of the corpus
     * @param directory          directory that holds (or will hold) the corpus files
     * @param wikifer            wikifier handed to the corpus creator
     * @param rawPageDao         raw page DAO handed to the corpus creator
     * @param localPageDao       local page DAO handed to the corpus creator
     * @param phraseAnalyzerDao  phrase analyzer DAO associated with this corpus
     * @param linkProbabilityDao link probability DAO; built on demand by {@link #create()}
     */
    public Corpus(Language language, File directory, Wikifier wikifer, RawPageDao rawPageDao, LocalPageDao localPageDao, PhraseAnalyzerDao phraseAnalyzerDao, LinkProbabilityDao linkProbabilityDao) {
        this.language = language;
        this.directory = directory;
        this.wikifer = wikifer;
        this.rawPageDao = rawPageDao;
        this.localPageDao = localPageDao;
        this.phraseAnalyzerDao = phraseAnalyzerDao;
        this.linkProbabilityDao = linkProbabilityDao;
    }

    /**
     * @return the directory containing the corpus files
     */
    public File getDirectory() {
        return directory;
    }

    /**
     * (Re)creates the corpus on disk.
     *
     * <p>Builds the link probability DAO if it has not been built yet, turns on
     * its cache, removes any existing corpus directory, recreates it, and asks a
     * {@link WikiTextCorpusCreator} to write into it.
     *
     * @throws IOException  if the corpus directory cannot be created, or if
     *                      writing the corpus fails
     * @throws DaoException if a DAO operation fails
     */
    public void create() throws IOException, DaoException {
        if (!linkProbabilityDao.isBuilt()) {
            linkProbabilityDao.build();
        }
        linkProbabilityDao.useCache(true);

        // Best-effort removal of any previous corpus; failures are detected below.
        FileUtils.deleteQuietly(directory);

        // Unlike File.mkdirs(), this throws an IOException when the directory cannot
        // be created or when a non-directory still occupies the path, so we fail
        // here with a clear cause instead of somewhere inside the writer.
        FileUtils.forceMkdir(directory);

        WikiTextCorpusCreator creator = new WikiTextCorpusCreator(
                language, wikifer, rawPageDao, localPageDao, linkProbabilityDao);
        creator.write(directory);
    }

    /**
     * @return the {@code corpus.txt} file inside the corpus directory (may not exist yet)
     */
    public File getCorpusFile() {
        return new File(directory, "corpus.txt");
    }

    /**
     * @return the {@code dictionary.txt} file inside the corpus directory (may not exist yet)
     */
    public File getDictionaryFile() {
        return new File(directory, "dictionary.txt");
    }

    /**
     * @return the language of this corpus
     */
    public Language getLanguage() {
        return this.language;
    }

    /**
     * @return {@code true} if both the corpus file and the dictionary file exist
     *         as regular files
     */
    public boolean exists() {
        return getCorpusFile().isFile() && getDictionaryFile().isFile();
    }

    /**
     * Configuration provider that constructs {@link Corpus} instances from the
     * {@code sr.corpus} configuration path.
     */
    public static class Provider extends org.wikibrain.conf.Provider<Corpus> {
        public Provider(Configurator configurator, Configuration config) throws ConfigurationException {
            super(configurator, config);
        }

        @Override
        public Class<Corpus> getType() {
            return Corpus.class;
        }

        @Override
        public String getPath() {
            return "sr.corpus";
        }

        /**
         * Builds a corpus for the language named by the {@code language} runtime
         * parameter.
         *
         * <p>Configuration keys read: {@code wikifier} (optional, defaults to
         * {@code "default"}), {@code phraseAnalyzer}, {@code path},
         * {@code rawPageDao} and {@code localPageDao}. The corpus directory is
         * {@code <path>/<language code>}.
         *
         * @param name          name of the configured component
         * @param config        configuration block for this corpus
         * @param runtimeParams runtime parameters; must contain {@code language}
         * @return a new corpus handle (nothing is created on disk)
         * @throws ConfigurationException   if a dependent component cannot be resolved
         * @throws IllegalArgumentException if the {@code language} runtime parameter is missing
         */
        @Override
        public Corpus get(String name, Config config, Map<String, String> runtimeParams) throws ConfigurationException {
            if (runtimeParams == null || !runtimeParams.containsKey("language")) {
                throw new IllegalArgumentException("Corpus requires 'language' runtime parameter");
            }
            Language lang = Language.getByLangCode(runtimeParams.get("language"));
            String wikifierName = config.hasPath("wikifier") ? config.getString("wikifier") : "default";

            Configurator c = getConfigurator();
            Wikifier wikifier = Env.getComponent(c, Wikifier.class, wikifierName, lang);

            // The configured phrase analyzer must be anchor-text based: the cast
            // below fails with a ClassCastException for any other implementation.
            AnchorTextPhraseAnalyzer phraseAnalyzer = (AnchorTextPhraseAnalyzer) c.get(
                    PhraseAnalyzer.class, config.getString("phraseAnalyzer"));
            PhraseAnalyzerDao paDao = phraseAnalyzer.getDao();

            LinkProbabilityDao linkProbabilityDao = Env.getComponent(c, LinkProbabilityDao.class, lang);

            return new Corpus(
                    lang,
                    new File(config.getString("path"), lang.getLangCode()),
                    wikifier,
                    c.get(RawPageDao.class, config.getString("rawPageDao")),
                    c.get(LocalPageDao.class, config.getString("localPageDao")),
                    paDao,
                    linkProbabilityDao
            );
        }
    }
}