/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.tallison.lucene.syns;

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashSet;
import java.util.Set;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountAnalyzer;
import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Bits;
import org.tallison.lucene.syns.contextifiers.Contextifier;

/**
 * Builds the initial ngram index from another Lucene index
 */
class NGramIndexBuilder {

    private static final Logger LOGGER = LogManager.getLogger();

    /** How often (in processed documents) a progress line is logged. */
    private static final int LOG_EVERY_N_DOCS = 100;

    /**
     * Command-line entry point.
     *
     * @param args {@code args[0]}: directory of the source Lucene index;
     *             {@code args[1]}: name of the stored field to read content from;
     *             {@code args[2]}: directory passed to {@link SyntacticSynsConfig}
     * @throws IOException if reading the source index or writing the ngram index fails
     * @throws IllegalArgumentException if fewer than three arguments are supplied
     */
    public static void main(String[] args) throws IOException {
        if (args == null || args.length < 3) {
            throw new IllegalArgumentException(
                    "Usage: NGramIndexBuilder <luceneIndexDir> <contentField> <synsIndexDir>");
        }
        Path luceneIndexDir = Paths.get(args[0]);
        String contentField = args[1];
        Path synsIndexDir = Paths.get(args[2]);

        SyntacticSynsConfig config = new SyntacticSynsConfig(synsIndexDir);
        config.setMaxKeyPhraseLength(3);

        NGramIndexBuilder builder = new NGramIndexBuilder();
        // The builder was previously constructed but never run.
        builder.execute(luceneIndexDir, contentField, config);
    }

    /**
     * Opens the source index read-only and builds the ngram index from it.
     *
     * @param sourceLuceneIndexDir directory of the Lucene index to read documents from
     * @param contentField         stored field whose values are re-indexed as ngrams
     * @param synsConfig           configuration supplying the analyzer, ngram sizes,
     *                             token limit and the target ngram index location
     * @return {@code true} when the build completes; failures surface as exceptions
     * @throws IOException if the source index cannot be read or the ngram index
     *                     cannot be written
     */
    public boolean execute(Path sourceLuceneIndexDir, String contentField,
                           SyntacticSynsConfig synsConfig) throws IOException {
        try (Directory directory = FSDirectory.open(sourceLuceneIndexDir);
             IndexReader indexReader = DirectoryReader.open(directory)) {
            return _execute(indexReader, contentField, synsConfig);
        }
    }

    /**
     * Reads every live document from {@code indexReader}, concatenates the values of
     * {@code contentField} (space separated) and adds the result as a single
     * shingled text field to a freshly created ngram index.
     *
     * @return always {@code true}; failures surface as exceptions
     */
    private boolean _execute(IndexReader indexReader, String contentField,
                             SyntacticSynsConfig synsConfig) throws IOException {
        // Shingles must be long enough to hold a key phrase plus the larger context window.
        int actualMaxNGram = synsConfig.getMaxKeyPhraseLength()
                + Math.max(synsConfig.getPreContextSize(), synsConfig.getPostContextSize());

        // NOTE(review): the base analyzer comes from the config; presumably it is set up
        // not to remove stopwords -- confirm against SyntacticSynsConfig.
        Analyzer wrapped = synsConfig.getBaseAnalyzer();
        Analyzer analyzer = new ShingleAnalyzerWrapper(wrapped, 2, actualMaxNGram,
                Contextifier.NGRAM_DELIMITER, false, false, Contextifier.FILLER_TOKEN);
        analyzer = new LimitTokenCountAnalyzer(analyzer, synsConfig.getMaxTokenToReadCount());

        IndexWriterConfig config = new IndexWriterConfig(analyzer);
        config.setOpenMode(OpenMode.CREATE);
        config.setRAMBufferSizeMB(150.0);

        Set<String> fieldsToLoad = new HashSet<>();
        fieldsToLoad.add(contentField);

        // getLiveDocs returns null when the reader has no deletions; in that case
        // every document id below maxDoc is live.
        Bits liveDocs = MultiFields.getLiveDocs(indexReader);
        int maxDoc = indexReader.maxDoc();
        int cnt = 0;

        // try-with-resources releases the writer (and its write lock) and the target
        // directory even when indexing fails part-way through.
        try (Directory ngramDirectory = FSDirectory.open(synsConfig.getNGramIndex());
             IndexWriter writer = new IndexWriter(ngramDirectory, config)) {

            // A single Document/Field pair is reused for every source document.
            Document d = new Document();
            TextField field = new TextField(SyntacticSynsConfig.getNgramField(), "",
                    Field.Store.NO);
            d.add(field);
            StringBuilder sb = new StringBuilder();

            for (int i = 0; i < maxDoc; i++) {
                if (liveDocs != null && !liveDocs.get(i)) {
                    continue;
                }
                Document doc = indexReader.document(i, fieldsToLoad);
                // defensive: skip if the reader hands back nothing
                if (doc == null) {
                    continue;
                }
                for (String s : doc.getValues(contentField)) {
                    sb.append(s).append(" ");
                }
                field.setStringValue(sb.toString());
                sb.setLength(0);
                writer.addDocument(d);

                // Count every indexed document; the loop index must not be touched here.
                cnt++;
                if (cnt % LOG_EVERY_N_DOCS == 0) {
                    LOGGER.info("processed " + cnt + " docs");
                }
            }
        }
        LOGGER.info("Finished. Processed " + cnt + " docs total.");
        return true;
    }
}