package org.wikibrain.sr.wikify; import gnu.trove.TCollections; import gnu.trove.list.TIntList; import gnu.trove.list.array.TIntArrayList; import gnu.trove.map.TIntObjectMap; import gnu.trove.map.hash.TIntObjectHashMap; import gnu.trove.set.TIntSet; import gnu.trove.set.hash.TIntHashSet; import org.apache.commons.io.FileUtils; import org.apache.commons.lang3.StringUtils; import org.wikibrain.core.dao.DaoException; import org.wikibrain.core.dao.LocalPageDao; import org.wikibrain.core.lang.Language; import org.wikibrain.core.model.LocalLink; import org.wikibrain.core.model.LocalPage; import org.wikibrain.core.nlp.Dictionary; import org.wikibrain.core.nlp.StringTokenizer; import org.wikibrain.core.nlp.Token; import org.wikibrain.phrases.LinkProbabilityDao; import org.wikibrain.phrases.PhraseTokenizer; import org.wikibrain.utils.ParallelForEach; import org.wikibrain.utils.Procedure; import org.wikibrain.utils.WpIOUtils; import org.wikibrain.utils.WpThreadUtils; import java.io.BufferedWriter; import java.io.File; import java.io.IOException; import java.util.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * @author Shilad Sen */ public abstract class BaseCorpusCreator { private static final Logger LOG = LoggerFactory.getLogger(BaseCorpusCreator.class); private final Language language; private final StringTokenizer tokenizer = new StringTokenizer(); private final Wikifier wikifier; private final LocalPageDao pageDao; private Dictionary dictionary; private BufferedWriter corpus; private TIntObjectMap<String> mentionUrls = TCollections.synchronizedMap(new TIntObjectHashMap<String>()); private boolean joinPhrases = true; private final PhraseTokenizer phraseTokenizer; public BaseCorpusCreator(Language language, LocalPageDao pageDao, Wikifier wikifier, LinkProbabilityDao linkProbDao) { this.language = language; this.pageDao = pageDao; this.wikifier = wikifier; this.phraseTokenizer = new PhraseTokenizer(linkProbDao); } /** * @return A list of Strings in the corpus. * Each string should be at least sentence granularity. * They could be a higher level (paragraph, document). */ public abstract Iterator<IdAndText> getCorpus() throws DaoException; public void write(File dir) throws IOException, DaoException { if (dir.exists()) { FileUtils.deleteQuietly(dir); } dir.mkdirs(); dictionary = new Dictionary(language, Dictionary.WordStorage.ON_DISK); corpus = WpIOUtils.openWriter(new File(dir, "corpus.txt")); corpus.write(String.format("@WikiBrainCorpus\t%s\t%s\t%s\t%s\n", this.language.getLangCode(), this.getClass().getName(), wikifier.getClass().getName(), new Date().toString() )); ParallelForEach.iterate(getCorpus(), new Procedure<IdAndText>() { @Override public void call(IdAndText text) throws Exception { processText(text); } }, 10000); corpus.close(); dictionary.write(new File(dir, "dictionary.txt")); } private void processText(IdAndText text) throws IOException, DaoException { List<LocalLink> mentions; if (text.getId() >= 0) { mentions = wikifier.wikify(text.getId(), text.getText()); } else { mentions = wikifier.wikify(text.getText()); } LocalPage page = pageDao.getById(language, text.getId()); String title = (page == null) ? "Unknown" : page.getTitle().getCanonicalTitle(); StringBuilder document = new StringBuilder(); document.append("\n@WikiBrainDoc\t" + text.getId() + "\t" + title + "\n"); for (Token sentence : tokenizer.getSentenceTokens(language, text.getText())) { List<String> tokens = addMentions(sentence, mentions); if (tokens == null) { continue; } String finalSentence = joinPhrases(tokens); document.append(finalSentence); document.append('\n'); dictionary.countNormalizedText(finalSentence); } synchronized (corpus) { corpus.write(document.toString() + "\n"); } } private String joinPhrases(List<String> words) throws DaoException { if (words.isEmpty()) { return null; } StringBuilder buffer = new StringBuilder(); for (String phrase : phraseTokenizer.makePhrases(language, words)) { if (buffer.length() > 0) buffer.append(' '); buffer.append(phrase.replaceAll(" ", "_")); } return buffer.toString(); } private List<String> addMentions(Token sentence, List<LocalLink> mentions) throws IOException, DaoException { List<Token> words = tokenizer.getWordTokens(language, sentence); if (words.isEmpty()) { return null; } // Accumulators List<String> line = new ArrayList<String>(); // Process each word token // Warning: If mentions do not align with sentence tokens, this will break... for (int m = 0, w = 0; w < words.size(); w++) { Token token = words.get(w); // Advance mention while it starts before the current token while (m < mentions.size() && mentions.get(m).getLocation() < token.getBegin()) { m++; } String phrase = token.getToken(); // If start of mention occurs in token, advance tokens as necessary if (m < mentions.size() && mentions.get(m).getLocation() < token.getEnd()) { int end = mentions.get(m).getLocation() + mentions.get(m).getAnchorText().length(); // While next word begins before mention ends, append next word while (w+1 < words.size() && words.get(w+1).getBegin() < end) { if (phrase.length() > 0) { phrase += "_"; } w++; phrase += words.get(w).getToken(); } phrase += ":" + getMentionUrl(mentions.get(m).getDestId()); } phrase = phrase.trim(); if (phrase.length() == 0) { continue; } if (phrase.contains("\n")) { throw new IllegalStateException(); } line.add(phrase); } return line; } private String getMentionUrl(int wpId) throws DaoException { if (!mentionUrls.containsKey(wpId)) { LocalPage page = pageDao.getById(language, wpId); if (page == null) { mentionUrls.put(wpId, "/w/" + language.getLangCode() + "/-1/Unknown_page"); } else { mentionUrls.put(wpId, page.getCompactUrl()); } } return mentionUrls.get(wpId); } public void setJoinPhrases(boolean joinPhrases) { this.joinPhrases = joinPhrases; } }