/* LanguageTool, a natural language style checker
* Copyright (C) 2015 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.dev.bigdata;
import org.apache.commons.compress.compressors.CompressorException;
import org.apache.commons.compress.compressors.CompressorInputStream;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.languagetool.Language;
import org.languagetool.Languages;
import org.languagetool.languagemodel.LanguageModel;
import org.languagetool.rules.en.GoogleStyleWordTokenizer;
import org.languagetool.tokenizers.SentenceTokenizer;
import org.languagetool.tokenizers.Tokenizer;
import java.io.*;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
/**
* Prepare indexing the CommonCrawl-based data from http://data.statmt.org/ngrams/
* to ngrams - result will still need to be aggregated and then indexed
* with {@link AggregatedNgramToLucene}.
* @since 3.2
*/
class CommonCrawlToNgram3 implements AutoCloseable {
private static final int MAX_TOKEN_LENGTH = 20;
private static final int MAX_SENTENCE_LENGTH = 50_000;
private static final int CACHE_LIMIT = 1_000_000; // max. number of trigrams in HashMap before we flush to Lucene
private final File input;
private final SentenceTokenizer sentenceTokenizer;
private final Tokenizer wordTokenizer;
private final Map<String, Long> unigramToCount = new HashMap<>();
private final Map<String, Long> bigramToCount = new HashMap<>();
private final Map<String, Long> trigramToCount = new HashMap<>();
private final Map<Integer, FileWriter> ngramSizeToWriter = new HashMap<>();
private long charCount = 0;
private long lineCount = 0;
CommonCrawlToNgram3(Language language, File input, File outputDir) throws IOException {
this.input = input;
this.sentenceTokenizer = language.getSentenceTokenizer();
this.wordTokenizer = new GoogleStyleWordTokenizer();
ngramSizeToWriter.put(1, new FileWriter(new File(outputDir, "unigrams.csv")));
ngramSizeToWriter.put(2, new FileWriter(new File(outputDir, "bigrams.csv")));
ngramSizeToWriter.put(3, new FileWriter(new File(outputDir, "trigrams.csv")));
}
@Override
public void close() throws Exception {
for (Map.Entry<Integer, FileWriter> entry : ngramSizeToWriter.entrySet()) {
entry.getValue().close();
}
}
private void indexInputFile() throws IOException, CompressorException {
FileInputStream fin = new FileInputStream(input);
BufferedInputStream in = new BufferedInputStream(fin);
try (CompressorInputStream input = new CompressorStreamFactory().createCompressorInputStream(in)) {
final byte[] buffer = new byte[8192];
int n;
while ((n = input.read(buffer)) != -1) {
String buf = new String(buffer, 0, n); // TODO: not always correct, we need to wait for line end first?
String[] lines = buf.split("\n");
indexLine(lines);
}
}
writeToDisk(1, unigramToCount);
writeToDisk(2, bigramToCount);
writeToDisk(3, trigramToCount);
}
private void indexLine(String[] lines) throws IOException {
for (String line : lines) {
if (line.length() > MAX_SENTENCE_LENGTH) {
System.out.println("Ignoring long line: " + line.length() + " bytes");
continue;
}
if (lineCount++ % 50_000 == 0) {
float mb = (float) charCount / 1000 / 1000;
System.out.printf(Locale.ENGLISH, "Indexing line %d (%.2fMB)\n", lineCount, mb);
}
charCount += line.length();
List<String> sentences = sentenceTokenizer.tokenize(line);
for (String sentence : sentences) {
indexSentence(sentence);
}
}
}
private void indexSentence(String sentence) throws IOException {
List<String> tokens = wordTokenizer.tokenize(sentence);
tokens.add(0, LanguageModel.GOOGLE_SENTENCE_START);
tokens.add(LanguageModel.GOOGLE_SENTENCE_END);
String prevPrev = null;
String prev = null;
for (String token : tokens) {
if (token.trim().isEmpty()) {
continue;
}
if (token.length() <= MAX_TOKEN_LENGTH) {
unigramToCount.compute(token, (k, v) -> v == null ? 1 : v + 1);
}
if (prev != null) {
if (token.length() <= MAX_TOKEN_LENGTH && prev.length() <= MAX_TOKEN_LENGTH) {
String ngram = prev + " " + token;
bigramToCount.compute(ngram, (k, v) -> v == null ? 1 : v + 1);
}
}
if (prevPrev != null && prev != null) {
if (token.length() <= MAX_TOKEN_LENGTH && prev.length() <= MAX_TOKEN_LENGTH && prevPrev.length() <= MAX_TOKEN_LENGTH) {
String ngram = prevPrev + " " + prev + " " + token;
trigramToCount.compute(ngram, (k, v) -> v == null ? 1 : v + 1);
}
if (unigramToCount.size() > CACHE_LIMIT) {
writeToDisk(1, unigramToCount);
}
if (bigramToCount.size() > CACHE_LIMIT) {
writeToDisk(2, bigramToCount);
}
if (trigramToCount.size() > CACHE_LIMIT) {
writeToDisk(3, trigramToCount);
}
}
prevPrev = prev;
prev = token;
}
}
private void writeToDisk(int ngramSize, Map<String, Long> ngramToCount) throws IOException {
System.out.println("Writing " + ngramToCount.size() + " cached ngrams to disk (ngramSize=" + ngramSize + ")...");
FileWriter writer = ngramSizeToWriter.get(ngramSize);
for (Map.Entry<String, Long> entry : ngramToCount.entrySet()) {
writer.write(entry.getKey() + "\t" + entry.getValue() + "\n");
}
writer.flush();
ngramToCount.clear();
}
public static void main(String[] args) throws Exception {
if (args.length != 3) {
System.out.println("Usage: " + CommonCrawlToNgram3.class + " <langCode> <input.xz/bz2> <outputDir>");
System.exit(1);
}
Language language = Languages.getLanguageForShortCode(args[0]);
File input = new File(args[1]);
File outputDir = new File(args[2]);
try (CommonCrawlToNgram3 prg = new CommonCrawlToNgram3(language, input, outputDir)) {
prg.indexInputFile();
}
}
}