/* LanguageTool, a natural language style checker
* Copyright (C) 2015 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.dev.bigdata;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.jetbrains.annotations.NotNull;
import java.io.File;
import java.io.IOException;
import java.util.*;
/**
 * Indexing the result of {@link CommonCrawlNGramJob} with Lucene.
 *
 * <p>One Lucene index is written per ngram length (1, 2 and 3) into the
 * sub-directories {@code 1grams}, {@code 2grams} and {@code 3grams} of the
 * directory given to the constructor. Each input line is expected to have
 * the form {@code <ngram>TAB<count>}, with the tokens of the ngram separated
 * by single spaces.
 *
 * <p>Instances hold open index writers and must be closed, preferably with
 * try-with-resources.
 *
 * @since 3.2
 */
class AggregatedNgramToLucene implements AutoCloseable {

  /** A progress message is printed once per this many input lines. */
  private static final int PROGRESS_INTERVAL = 250_000;

  /** Shared, frozen field type of the "count" field; identical for every document. */
  private static final FieldType COUNT_FIELD_TYPE = createCountFieldType();

  /** Maps the ngram length (number of tokens) to the index storing ngrams of that length. */
  private final Map<Integer, LuceneIndex> indexes = new HashMap<>();

  /** Sum of the counts of all 1-grams indexed so far. */
  private long totalTokenCount = 0;

  /** Number of lines passed to {@link #indexLine(String)} so far, including ignored ones. */
  private long lineCount = 0;

  /**
   * Opens the three per-length indexes below {@code indexTopDir}.
   *
   * @param indexTopDir directory that will contain the {@code 1grams}, {@code 2grams}
   *                    and {@code 3grams} index directories
   * @throws IOException if an index cannot be opened; indexes opened before the failure
   *                     are closed again so that nothing leaks
   */
  AggregatedNgramToLucene(File indexTopDir) throws IOException {
    try {
      indexes.put(1, new LuceneIndex(new File(indexTopDir, "1grams")));
      indexes.put(2, new LuceneIndex(new File(indexTopDir, "2grams")));
      indexes.put(3, new LuceneIndex(new File(indexTopDir, "3grams")));
    } catch (IOException | RuntimeException e) {
      // Release whatever was opened before the failure; keep the original error as primary.
      try {
        closeIndexes();
      } catch (IOException | RuntimeException closeFailure) {
        e.addSuppressed(closeFailure);
      }
      throw e;
    }
  }

  /**
   * Closes all indexes. Every index is attempted even if an earlier one fails to close.
   *
   * @throws IOException the first failure seen, with later failures attached as suppressed
   */
  @Override
  public void close() throws IOException {
    closeIndexes();
  }

  /**
   * Closes every index in {@link #indexes}, collecting failures instead of stopping at the
   * first one. Private so it can be called safely from the constructor.
   */
  private void closeIndexes() throws IOException {
    Exception firstFailure = null;
    for (LuceneIndex index : indexes.values()) {
      try {
        index.close();
      } catch (IOException | RuntimeException e) {
        if (firstFailure == null) {
          firstFailure = e;
        } else {
          firstFailure.addSuppressed(e);
        }
      }
    }
    if (firstFailure instanceof IOException) {
      throw (IOException) firstFailure;
    }
    if (firstFailure != null) {
      throw (RuntimeException) firstFailure;
    }
  }

  /**
   * Indexes every line of the given file.
   *
   * @param file text file with one {@code <ngram>TAB<count>} entry per line, read as UTF-8
   * @throws IOException if the file cannot be opened or read, or if indexing fails
   */
  void indexInputFile(File file) throws IOException {
    System.out.println("=== Indexing " + file + " ===");
    // Decode explicitly as UTF-8 instead of relying on the platform default charset.
    try (Scanner scanner = new Scanner(file, "UTF-8")) {
      while (scanner.hasNextLine()) {
        String line = scanner.nextLine();
        indexLine(line);
      }
      // Scanner swallows read errors and just reports "no more lines"; surface them
      // so that a truncated read is not mistaken for a complete file.
      IOException readFailure = scanner.ioException();
      if (readFailure != null) {
        throw readFailure;
      }
    }
  }

  /**
   * Parses one input line and adds it to the index matching its ngram length.
   * Lines that do not consist of exactly two tab-separated parts are reported on
   * stderr and skipped.
   *
   * @throws RuntimeException if there is no index for the ngram's length
   * @throws NumberFormatException if the count is not a valid long
   */
  private void indexLine(String line) throws IOException {
    // The counter is incremented before printing, so the message shows the
    // 1-based number of the line being processed.
    if (lineCount++ % PROGRESS_INTERVAL == 0) {
      System.out.printf(Locale.ENGLISH, "Indexing line %d\n", lineCount);
    }
    String[] lineParts = line.split("\t");
    if (lineParts.length != 2) {
      System.err.println("Not 2 parts but " + lineParts.length + ", ignoring: '" + line + "'");
      return;
    }
    String ngram = lineParts[0];
    String[] ngramParts = ngram.split(" ");
    LuceneIndex index = indexes.get(ngramParts.length);
    if (index == null) {
      throw new RuntimeException("No ngram data found for: " + Arrays.toString(lineParts));
    }
    long count = Long.parseLong(lineParts[1]);
    if (ngramParts.length == 1) {
      // Only 1-grams contribute to the total token count.
      totalTokenCount += count;
    }
    index.indexWriter.addDocument(getDoc(ngram, count));
  }

  /** Builds the document for one ngram: an un-stored "ngram" field plus its "count" field. */
  @NotNull
  private Document getDoc(String ngram, long count) {
    Document doc = new Document();
    doc.add(new Field("ngram", ngram, StringField.TYPE_NOT_STORED));  // use StringField.TYPE_STORED for easier debugging with e.g. Luke
    doc.add(getCountField(count));
    return doc;
  }

  /** Creates the "count" field for the given value using the shared {@link #COUNT_FIELD_TYPE}. */
  @NotNull
  private LongField getCountField(long count) {
    return new LongField("count", count, COUNT_FIELD_TYPE);
  }

  /**
   * Builds the field type of the "count" field: stored, norms omitted, numeric type LONG,
   * with numeric doc values. Built once and frozen so it can be shared by all documents.
   */
  private static FieldType createCountFieldType() {
    FieldType fieldType = new FieldType();
    fieldType.setStored(true);
    fieldType.setOmitNorms(true);
    fieldType.setNumericType(FieldType.NumericType.LONG);
    fieldType.setDocValuesType(DocValuesType.NUMERIC);
    fieldType.freeze();
    return fieldType;
  }

  /**
   * Adds a document holding the total token count as a stored "totalTokenCount"
   * string field to the given writer.
   */
  private void addTotalTokenCountDoc(long totalTokenCount, IndexWriter writer) throws IOException {
    FieldType fieldType = new FieldType();
    fieldType.setIndexOptions(IndexOptions.DOCS);
    fieldType.setStored(true);
    fieldType.setOmitNorms(true);
    Field countField = new Field("totalTokenCount", String.valueOf(totalTokenCount), fieldType);
    Document doc = new Document();
    doc.add(countField);
    writer.addDocument(doc);
  }

  /**
   * Indexes all regular files directly inside the given input directory into an
   * {@code index} sub-directory of that directory, then stores the total token
   * count in the 1-gram index.
   *
   * @param args exactly one element: the input directory
   * @throws IOException if the input directory cannot be listed or indexing fails
   */
  public static void main(String[] args) throws IOException {
    if (args.length != 1) {
      System.out.println("Usage: " + AggregatedNgramToLucene.class + " <inputDir>");
      System.out.println(" <inputDir> is a directory with aggregated ngram files from Hadoop, e.g. produced by CommonCrawlNGramJob");
      System.exit(1);
    }
    File inputDir = new File(args[0]);
    // listFiles() returns null if the path is not a directory or cannot be read;
    // check before any index directory gets created below it.
    File[] inputFiles = inputDir.listFiles();
    if (inputFiles == null) {
      throw new IOException("Not a directory or not readable: " + inputDir);
    }
    File outputDir = new File(inputDir, "index");
    System.out.println("Indexing to " + outputDir);
    try (AggregatedNgramToLucene prg = new AggregatedNgramToLucene(outputDir)) {
      for (File file : inputFiles) {
        if (file.isFile()) {
          prg.indexInputFile(file);
        }
      }
      prg.addTotalTokenCountDoc(prg.totalTokenCount, prg.indexes.get(1).indexWriter);
    }
  }

  /** A Lucene directory together with the writer that is open on it. */
  static class LuceneIndex {

    private final Directory directory;
    private final IndexWriter indexWriter;

    /**
     * Opens the directory and an index writer on it.
     *
     * @throws IOException if the directory or the writer cannot be opened; the directory
     *                     is closed again if only the writer fails
     */
    LuceneIndex(File dir) throws IOException {
      Analyzer analyzer = new StandardAnalyzer();
      IndexWriterConfig config = new IndexWriterConfig(analyzer);
      Directory openedDirectory = FSDirectory.open(dir.toPath());
      IndexWriter writer;
      try {
        writer = new IndexWriter(openedDirectory, config);
      } catch (IOException | RuntimeException e) {
        // Do not leak the directory when the writer cannot be created.
        try {
          openedDirectory.close();
        } catch (IOException | RuntimeException closeFailure) {
          e.addSuppressed(closeFailure);
        }
        throw e;
      }
      directory = openedDirectory;
      indexWriter = writer;
    }

    /**
     * Closes the writer first, then the directory. The directory is closed even if
     * closing the writer fails; a second failure is attached as suppressed.
     */
    void close() throws IOException {
      // Resources are closed in reverse declaration order: writer, then directory.
      try (Directory dirToClose = directory; IndexWriter writerToClose = indexWriter) {
        // nothing to do, try-with-resources performs the closing
      }
    }
  }
}