/* LanguageTool, a natural language style checker
* Copyright (C) 2014 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.dev.bigdata;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.languagetool.languagemodel.LanguageModel;
import java.io.*;
import java.text.NumberFormat;
import java.util.*;
import java.util.concurrent.atomic.AtomicLong;
import java.util.zip.GZIPInputStream;
/**
 * Index *.gz files from Google's ngram corpus into a Lucene index ('lucene' mode)
 * or aggregate them to plain text files ('text' mode).
 * Index time (1 doc = 1 ngram and its count, years are aggregated into one number):
 * 130µs/doc (both on an external USB hard disk or on an internal SSD) = about 7700 docs/sec
 *
 * <p>The reason this isn't faster is not Lucene but the aggregation work we do or simply
 * the large amount of data. Indexing every line takes 3µs/doc, i.e. Lucene can
 * index about 333,000 docs/s.
 *
 * <p>Input files are processed with a parallel stream, so all counters shared between
 * files are kept in {@link AtomicLong}s.
 *
 * <p>Also see http://wiki.languagetool.org/finding-errors-using-big-data.
 * @since 2.7
 */
public class FrequencyIndexCreator {

  /** Lines of the raw corpus with a year before this one are ignored. */
  private static final int MIN_YEAR = 1910;
  /** File names of the original Google ngram corpus. */
  private static final String NAME_REGEX1 = "googlebooks-[a-z]{3}-all-[1-5]gram-20120701-(.*?).gz";
  private static final String NAME_REGEX2 = "[a-z0-9]+-[a-z0-9]+-[a-z0-9]+-[a-z0-9]+-[a-z0-9]+[_-](.*?).gz";  // Hive result
  private static final String NAME_REGEX3 = "([_a-z0-9]{1,2}|other|pos|punctuation|_(ADJ|ADP|ADV|CONJ|DET|NOUN|NUM|PRON|PRT|VERB)_)";  // result of FrequencyIndexCreator with text mode
  private static final int BUFFER_SIZE = 16384;
  /** Name of the marker file written into a directory once its index is finished. */
  private static final String LT_COMPLETE_MARKER = "languagetool_index_complete";
  private static final boolean IGNORE_POS = true;

  private enum Mode { PlainText, Lucene }

  // All three counters are updated from the worker threads of the parallel stream in run():
  private final AtomicLong bytesProcessed = new AtomicLong(0);
  private final AtomicLong totalTokenCount = new AtomicLong(0);
  private final AtomicLong inputFileCount = new AtomicLong(0);
  private final Mode mode;

  public FrequencyIndexCreator(Mode mode) {
    this.mode = mode;
  }

  /**
   * Processes all files of {@code inputDir} in parallel, writing everything into one
   * output below {@code indexBaseDir}, and marks that directory as complete afterwards.
   *
   * @throws RuntimeException if {@code inputDir} does not exist or cannot be listed
   */
  private void run(File inputDir, File indexBaseDir) throws Exception {
    if (!inputDir.exists()) {
      throw new RuntimeException("Not found: " + inputDir);
    }
    File[] dirContent = inputDir.listFiles();
    if (dirContent == null) {
      // listFiles() returns null if the path is not a directory or an I/O error occurs
      throw new RuntimeException("Not a readable directory: " + inputDir);
    }
    List<File> files = Arrays.asList(dirContent);
    long totalBytes = files.stream().mapToLong(File::length).sum();
    System.out.println("Total input bytes: " + totalBytes);
    // Hint: to get one index per input file instead of one large index for all input
    // files, call index(...) with null as its last argument.
    try (DataWriter dw = createDataWriter(indexBaseDir)) {
      files.parallelStream().forEach(dir -> index(dir, indexBaseDir, totalBytes, files.size(), dw));
    }
    // only mark as complete once the writer has been closed successfully:
    markIndexAsComplete(indexBaseDir);
  }

  /** Creates the writer that fits the configured mode, writing into the given directory. */
  private DataWriter createDataWriter(File targetDir) throws IOException {
    return mode == Mode.PlainText ? new TextDataWriter(targetDir) : new LuceneDataWriter(targetDir);
  }

  /**
   * Indexes one input file (or one output directory of a previous 'text' mode run).
   *
   * @param file the input file or directory, its name decides how it is interpreted
   * @param indexBaseDir parent of the per-input index directory
   * @param totalBytes size of all input files, used for progress output only
   * @param inputFiles number of input files, used for progress output only
   * @param globalDataWriter shared writer to use, or {@code null} to create a separate
   *        output in its own index directory for this input
   * @throws RuntimeException if indexing the file fails
   */
  private void index(File file, File indexBaseDir, long totalBytes, int inputFiles, DataWriter globalDataWriter) {
    System.out.println(file);
    String name = file.getName();
    if (IGNORE_POS && name.matches(".*_[A-Z]+_.*")) {
      System.out.println("Skipping POS tag file " + name);
      return;
    }
    File inputFile = file;
    File indexDir;
    boolean hiveMode;
    if (name.matches(NAME_REGEX1)) {
      indexDir = new File(indexBaseDir, name.replaceAll(NAME_REGEX1, "$1"));
      hiveMode = false;
      System.out.println("Running in corpus mode (i.e. aggregation of years)");
    } else if (name.matches(NAME_REGEX2)) {
      indexDir = new File(indexBaseDir, name.replaceAll(NAME_REGEX2, "$1"));
      hiveMode = true;
      System.out.println("Running in Hive mode (i.e. no aggregation of years)");
    } else if (name.matches(NAME_REGEX3) && file.isDirectory()) {
      inputFile = new File(file, name + "-output.csv.gz");
      // the directory name is used as-is for the index directory
      indexDir = new File(indexBaseDir, name);
      hiveMode = true;
      System.out.println("Running in Hive/Text mode (i.e. no aggregation of years)");
    } else {
      System.out.println("Skipping " + name + " - doesn't match regex " + NAME_REGEX1 + ", " + NAME_REGEX2 + ", or " + NAME_REGEX3);
      return;
    }
    if (indexDir.exists() && indexDir.isDirectory()) {
      if (isIndexComplete(indexDir)) {
        System.out.println("Skipping " + name + " - index dir '" + indexDir + "' already exists and is complete");
        bytesProcessed.addAndGet(inputFile.length());
        return;
      }
      System.out.println("Not skipping " + name + " - index dir '" + indexDir + "' already exists but is not complete");
    }
    System.out.println("Index dir: " + indexDir + " - " + inputFileCount.incrementAndGet() + " of " + inputFiles);
    try {
      if (globalDataWriter != null) {
        indexLinesFromGoogleFile(globalDataWriter, inputFile, totalBytes, hiveMode);
      } else {
        try (DataWriter dw = createDataWriter(indexDir)) {
          indexLinesFromGoogleFile(dw, inputFile, totalBytes, hiveMode);
        }
        markIndexAsComplete(indexDir);
      }
    } catch (Exception e) {
      throw new RuntimeException("Could not index " + inputFile, e);
    }
    bytesProcessed.addAndGet(inputFile.length());
  }

  /** Writes the marker file (containing the current date) into the given directory. */
  private void markIndexAsComplete(File directory) throws IOException {
    try (FileWriter fw = new FileWriter(new File(directory, LT_COMPLETE_MARKER))) {
      fw.write(new Date().toString());
    }
  }

  /** Returns whether the marker file exists in the given directory. */
  private boolean isIndexComplete(File directory) {
    return new File(directory, LT_COMPLETE_MARKER).exists();
  }

  /**
   * Reads a gzipped, tab-separated input file line by line and feeds it to the writer.
   * In Hive mode each line is {@code ngram<TAB>count} and is written as-is. Otherwise
   * each line is {@code ngram<TAB>year<TAB>count...}; lines before {@link #MIN_YEAR}
   * are dropped and the counts of consecutive lines with the same ngram are summed up.
   * Finally a document with the total token count seen so far is added.
   */
  private void indexLinesFromGoogleFile(DataWriter writer, File inputFile, long totalBytes, boolean hiveMode) throws IOException {
    float progress = (float)bytesProcessed.get() / totalBytes * 100;
    // the file name is passed as an argument so that a '%' in the path cannot break the format
    System.out.printf("==== Working on %s (%.2f%%) ====\n", inputFile, progress);
    try (
      InputStream fileStream = new FileInputStream(inputFile);
      InputStream gzipStream = new GZIPInputStream(fileStream, BUFFER_SIZE);
      Reader decoder = new InputStreamReader(gzipStream, "utf-8");
      BufferedReader buffered = new BufferedReader(decoder, BUFFER_SIZE)
    ) {
      int i = 0;
      long docCount = 0;
      long lineCount = 0;
      String prevText = null;
      long startTime = System.nanoTime()/1000;
      String line;
      //noinspection NestedAssignment
      while ((line = buffered.readLine()) != null) {
        lineCount++;
        // Hint: to create a smaller index just for testing, skip all lines here that
        // don't contain the words of interest.
        String[] parts = line.split("\t");
        String text = parts[0];
        if (IGNORE_POS && isRealPosTag(text)) {  // filtering '_VERB_', 'Italian_ADJ', etc.
          continue;
        }
        if (hiveMode) {
          if (parts.length <= 1) {
            System.err.println("Could not index: " + line);
            continue;
          }
          long count = Long.parseLong(parts[1]);
          writer.addDoc(text, count);
          if (++i % 500_000 == 0) {
            printStats(i, inputFile, count, lineCount, text, startTime, totalBytes);
          }
        } else {
          if (parts.length <= 2) {
            // malformed line, handle it like Hive mode does instead of crashing
            System.err.println("Could not index: " + line);
            continue;
          }
          int year = Integer.parseInt(parts[1]);
          if (year < MIN_YEAR) {
            continue;
          }
          long count = Long.parseLong(parts[2]);
          if (prevText == null || prevText.equals(text)) {
            // aggregate years
            docCount += count;
          } else {
            writer.addDoc(prevText, docCount);
            if (++i % 5_000 == 0) {
              printStats(i, inputFile, docCount, lineCount, prevText, startTime, totalBytes);
            }
            docCount = count;
          }
        }
        prevText = text;
      }
      if (!hiveMode && prevText != null) {
        // the aggregate of the last ngram is still pending, as it's only written
        // when the next ngram starts
        writer.addDoc(prevText, docCount);
        i++;
      }
      printStats(i, inputFile, docCount, lineCount, prevText, startTime, totalBytes);
    }
    writer.addTotalTokenCountDoc(totalTokenCount.get());
  }

  /**
   * Returns true if the text contains an underscore that does not start one of the
   * sentence start/end markers (only the first underscore is looked at).
   */
  private boolean isRealPosTag(String text) {
    int idx = text.indexOf('_');
    if (idx == -1) {
      return false;
    }
    String tag = idx + 7 <= text.length() ? text.substring(idx, idx + 7) : "";  // _START_
    if (tag.equals(LanguageModel.GOOGLE_SENTENCE_START)) {
      return false;
    }
    String tag2 = idx + 5 <= text.length() ? text.substring(idx, idx + 5) : "";  // _END_
    return !tag2.equals(LanguageModel.GOOGLE_SENTENCE_END);
  }

  /** Prints one line of progress information to stdout. */
  private void printStats(int i, File inputFile, long docCount, long lineCount, String prevText, long startTimeMicros, long totalBytes) {
    long microsNow = System.nanoTime()/1000;
    // divide as float so that the result isn't truncated before it gets rounded for output
    float microsPerDoc = (float)(microsNow - startTimeMicros) / Math.max(1, i);
    NumberFormat format = NumberFormat.getNumberInstance(Locale.US);
    float progress = (float)bytesProcessed.get() / totalBytes * 100;
    System.out.printf("%.2f%% input:%s doc:%s line:%s ngram:%s occ:%s (%.0fµs/doc)\n",
            progress, inputFile.getName(), format.format(i), format.format(lineCount),
            prevText, format.format(docCount), microsPerDoc);
  }

  /** Target for ngrams and their counts. */
  abstract static class DataWriter implements AutoCloseable {
    /** Adds one ngram with its occurrence count. */
    abstract void addDoc(String text, long count) throws IOException;
    /** Adds the meta information about the number of tokens seen so far. */
    abstract void addTotalTokenCountDoc(long totalTokenCount) throws IOException;
  }

  /**
   * Writes ngrams to a Lucene index. Also adds each count to the total token count
   * of the enclosing {@link FrequencyIndexCreator}.
   */
  class LuceneDataWriter extends DataWriter {

    private final Directory directory;
    IndexWriter writer;

    LuceneDataWriter(File indexDir) throws IOException {
      Analyzer analyzer = new StandardAnalyzer();
      IndexWriterConfig config = new IndexWriterConfig(analyzer);
      config.setUseCompoundFile(false);  // ~10% speedup
      config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
      //config.setRAMBufferSizeMB(1000);
      directory = FSDirectory.open(indexDir.toPath());
      try {
        writer = new IndexWriter(directory, config);
      } catch (IOException | RuntimeException e) {
        // don't leak the directory if the writer cannot be created
        try {
          directory.close();
        } catch (IOException closeError) {
          e.addSuppressed(closeError);
        }
        throw e;
      }
    }

    @Override
    void addDoc(String text, long count) throws IOException {
      if (text.length() > 1000) {
        System.err.println("Ignoring doc, ngram is > 1000 chars: " + text.substring(0, 50) + "...");
        return;
      }
      Document doc = new Document();
      doc.add(new Field("ngram", text, StringField.TYPE_NOT_STORED));
      FieldType fieldType = new FieldType();
      fieldType.setStored(true);
      doc.add(new Field("count", String.valueOf(count), fieldType));
      totalTokenCount.addAndGet(count);
      writer.addDocument(doc);
    }

    @Override
    void addTotalTokenCountDoc(long totalTokenCount) throws IOException {
      FieldType fieldType = new FieldType();
      fieldType.setIndexOptions(IndexOptions.DOCS);
      fieldType.setStored(true);
      Field countField = new Field("totalTokenCount", String.valueOf(totalTokenCount), fieldType);
      Document doc = new Document();
      doc.add(countField);
      writer.addDocument(doc);
    }

    @Override
    public void close() throws Exception {
      try {
        if (writer != null) {
          writer.close();
        }
      } finally {
        // closing the IndexWriter does not close the Directory it was opened on
        directory.close();
      }
    }
  }

  /**
   * Writes ngrams as {@code ngram<TAB>count} lines to the UTF-8 encoded file
   * {@code <dirName>-output.csv} inside the given directory.
   */
  static class TextDataWriter extends DataWriter {

    private final BufferedWriter writer;

    TextDataWriter(File indexDir) throws IOException {
      if (indexDir.exists()) {
        System.out.println("Using existing dir: " + indexDir.getAbsolutePath());
      } else {
        boolean mkdir = indexDir.mkdir();
        if (!mkdir) {
          throw new RuntimeException("Could not create: " + indexDir.getAbsolutePath());
        }
      }
      File outputFile = new File(indexDir, indexDir.getName() + "-output.csv");
      // explicit charset: the input is read as UTF-8, so don't depend on the platform default
      writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFile), "utf-8"));
    }

    @Override
    void addDoc(String text, long count) throws IOException {
      // one write call per line, so lines of concurrent callers don't get interleaved
      writer.write(text + "\t" + count + "\n");
    }

    @Override
    void addTotalTokenCountDoc(long totalTokenCount) throws IOException {
      System.err.println("Note: not writing totalTokenCount (" + totalTokenCount + ") in file mode");
    }

    @Override
    public void close() throws Exception {
      // flushes the buffer and closes the underlying file stream
      writer.close();
    }
  }

  public static void main(String[] args) throws Exception {
    if (args.length != 3) {
      System.out.println("Usage: " + FrequencyIndexCreator.class.getSimpleName() + " <text|lucene> <inputDir> <outputDir>");
      System.out.println(" <text|lucene> 'text' will write plain text files, 'lucene' will write Lucene indexes");
      System.out.println(" <inputDir> is the Google ngram data, optionally already aggregated by Hive (lucene mode),");
      System.out.println(" please see http://wiki.languagetool.org/finding-errors-using-big-data");
      System.exit(1);
    }
    Mode mode;
    if (args[0].equals("text")) {
      mode = Mode.PlainText;
    } else if (args[0].equals("lucene")) {
      mode = Mode.Lucene;
    } else {
      throw new RuntimeException("Unknown mode: " + args[0]);
    }
    FrequencyIndexCreator creator = new FrequencyIndexCreator(mode);
    System.out.println("Mode: " + mode);
    System.out.println("Minimum year: " + MIN_YEAR);
    System.out.println("Ignore POS tags: " + IGNORE_POS);
    creator.run(new File(args[1]), new File(args[2]));
  }
}