/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.tallison.lucene.syns;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashSet;
import java.util.Set;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountAnalyzer;
import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Bits;
import org.tallison.lucene.syns.contextifiers.Contextifier;
/**
* Builds the initial ngram index from another Lucene index
*/
class NGramIndexBuilder {
    private static final Logger LOGGER = LogManager.getLogger();

    /** How often (in processed docs) to emit a progress log line. */
    private static final int LOG_EVERY_N_DOCS = 100;

    /**
     * Command-line entry point.
     *
     * <p>Usage: {@code NGramIndexBuilder <sourceLuceneIndexDir> <contentField> <synsIndexDir>}
     *
     * @param args [0] = source Lucene index dir, [1] = content field name,
     *             [2] = output syns index dir
     * @throws IOException if either index cannot be opened or written
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 3) {
            System.err.println(
                    "Usage: NGramIndexBuilder <sourceLuceneIndexDir> <contentField> <synsIndexDir>");
            return;
        }
        Path luceneIndexDir = Paths.get(args[0]);
        String contentField = args[1];
        Path synsIndexDir = Paths.get(args[2]);
        SyntacticSynsConfig config = new SyntacticSynsConfig(synsIndexDir);
        config.setMaxKeyPhraseLength(3);
        NGramIndexBuilder builder = new NGramIndexBuilder();
        // BUGFIX: the builder used to be constructed but never run
        builder.execute(luceneIndexDir, contentField, config);
    }

    /**
     * Builds the ngram index from the content field of an existing Lucene index.
     *
     * @param sourceLuceneIndexDir directory of the source Lucene index
     * @param contentField name of the stored field to read text from
     * @param synsConfig configuration (analyzer, context sizes, output paths)
     * @return true on completion
     * @throws IOException on index read/write failure
     */
    public boolean execute(Path sourceLuceneIndexDir, String contentField,
                           SyntacticSynsConfig synsConfig) throws IOException {
        try (Directory directory = FSDirectory.open(sourceLuceneIndexDir);
             IndexReader indexReader = DirectoryReader.open(directory)) {
            return _execute(indexReader, contentField, synsConfig);
        }
    }

    private boolean _execute(IndexReader indexReader, String contentField,
                             SyntacticSynsConfig synsConfig) throws IOException {
        // Shingles must be long enough to carry the key phrase plus the larger context window.
        int actualMaxNGram = synsConfig.getMaxKeyPhraseLength()
                + Math.max(synsConfig.getPreContextSize(), synsConfig.getPostContextSize());
        Analyzer wrapped = synsConfig.getBaseAnalyzer();
        Analyzer analyzer = new ShingleAnalyzerWrapper(wrapped, 2, actualMaxNGram,
                Contextifier.NGRAM_DELIMITER, false, false, Contextifier.FILLER_TOKEN);
        // Cap tokens per document so pathological docs don't explode the shingle count.
        analyzer = new LimitTokenCountAnalyzer(analyzer, synsConfig.getMaxTokenToReadCount());

        IndexWriterConfig config = new IndexWriterConfig(analyzer);
        config.setOpenMode(OpenMode.CREATE);
        config.setRAMBufferSizeMB(150.0);

        // BUGFIX: writer/directory are now closed via try-with-resources; previously an
        // exception mid-loop leaked the IndexWriter and left a write lock on the index.
        try (Directory ngramDirectory = FSDirectory.open(synsConfig.getNGramIndex());
             IndexWriter writer = new IndexWriter(ngramDirectory, config)) {
            // Reuse one Document/Field pair across all source docs to avoid churn.
            Document d = new Document();
            TextField field = new TextField(SyntacticSynsConfig.getNgramField(), "", Field.Store.NO);
            d.add(field);
            StringBuilder sb = new StringBuilder();
            Set<String> fieldsToLoad = new HashSet<>();
            fieldsToLoad.add(contentField);
            // BUGFIX: getLiveDocs returns null when the index has no deletions;
            // the old code dereferenced it unconditionally and NPE'd.
            Bits liveDocs = MultiFields.getLiveDocs(indexReader);
            int cnt = 0;
            for (int i = 0; i < indexReader.maxDoc(); i++) {
                if (liveDocs != null && !liveDocs.get(i)) {
                    continue; // skip deleted doc
                }
                Document doc = indexReader.document(i, fieldsToLoad);
                // can it ever be null?
                if (doc == null) {
                    continue;
                }
                sb.setLength(0);
                for (String s : doc.getValues(contentField)) {
                    sb.append(s).append(" ");
                }
                field.setStringValue(sb.toString());
                writer.addDocument(d);
                // BUGFIX: the old code did "++i % 100", incrementing the LOOP variable and
                // silently skipping every ~100th document; cnt was also only bumped inside
                // the log call, so the final count was wrong.
                if (++cnt % LOG_EVERY_N_DOCS == 0) {
                    LOGGER.info("processed " + cnt + " docs");
                }
            }
            LOGGER.info("Finished. Processed " + cnt + " docs total.");
            return true;
        }
    }
}