/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.tallison.lucene.syns;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.tallison.lucene.syns.contextifiers.Contextifier;
import org.tallison.lucene.syns.contextifiers.PostContextifier;
import org.tallison.lucene.syns.contextifiers.PreContextifier;
/**
 * Builds a "context" index from an existing n-gram index.
 *
 * <p>It iterates over every term of the n-gram field, converts each term into
 * pre- and post-context strings via {@link Contextifier}s, batches those
 * strings into large synthetic documents, and writes them to the context
 * index configured in {@link SyntacticSynsConfig}.
 */
class ContextIndexBuilder {

    /** Maximum number of tokens the analyzer will index per field. */
    private static final int MAX_FIELD_LENGTH = 10000;

    /** Number of qualifying terms to accumulate before flushing one document. */
    private static final int TERMS_PER_DOCUMENT = 4000;

    /** A key and its context must appear in at least this many documents. */
    private static final int MIN_DOC_FREQ = 1;

    public static void main(String[] args) throws IOException {
        // Intentionally empty: this class is driven via execute(SyntacticSynsConfig).
    }

    /**
     * Reads every term from the n-gram index and writes batched context
     * documents to the context index.
     *
     * @param synsConfig supplies the paths of the source n-gram index and the
     *                   target context index, plus the field names
     * @return {@code true} on success; {@code false} if the n-gram field does
     *         not exist in the source index
     * @throws IOException if either index cannot be opened, read, or written
     */
    public boolean execute(SyntacticSynsConfig synsConfig) throws IOException {
        Analyzer analyzer = new LimitTokenCountAnalyzer(new WhitespaceAnalyzer(), MAX_FIELD_LENGTH);
        IndexWriterConfig config = new IndexWriterConfig(analyzer);
        config.setOpenMode(OpenMode.CREATE);
        config.setRAMBufferSizeMB(150.0);

        // try-with-resources guarantees the writer, reader and both directories
        // are closed even if indexing throws (the original never closed the
        // reader or the directories, and leaked the writer on failure).
        try (FSDirectory contextDir = FSDirectory.open(synsConfig.getContextIndex());
             IndexWriter writer = new IndexWriter(contextDir, config);
             FSDirectory ngramDir = FSDirectory.open(synsConfig.getNGramIndex());
             IndexReader reader = DirectoryReader.open(ngramDir)) {

            Terms terms = MultiFields.getTerms(reader, SyntacticSynsConfig.getNgramField());
            if (terms == null) {
                // MultiFields.getTerms returns null when the field is absent;
                // the original would have thrown an NPE here.
                return false;
            }

            Contextifier preContext = new PreContextifier();
            Contextifier postContext = new PostContextifier();
            StringBuilder sb = new StringBuilder();
            int termsInDoc = 0;
            int termsSeen = 0;

            // Reuse a single Document/Field pair across addDocument() calls;
            // setStringValue replaces the content each time.
            Document d = new Document();
            TextField field = new TextField(SyntacticSynsConfig.getContextField(), "", Field.Store.NO);
            d.add(field);

            TermsEnum termEnum = terms.iterator();
            for (BytesRef ref = termEnum.next(); ref != null; ref = termEnum.next()) {
                int docFreq = termEnum.docFreq();
                if (docFreq >= MIN_DOC_FREQ) {
                    String txt = ref.utf8ToString();
                    sb.append(preContext.convert(txt, docFreq));
                    sb.append(" ");
                    sb.append(postContext.convert(txt, docFreq));
                    sb.append(" ");
                    termsInDoc++;
                    if (termsInDoc > TERMS_PER_DOCUMENT) {
                        System.out.println("stopping to index doc " + termsSeen);
                        field.setStringValue(sb.toString());
                        writer.addDocument(d);
                        sb.setLength(0);
                        termsInDoc = 0;
                    }
                }
                termsSeen++;
            }

            // Flush whatever is left; skip when empty (the original
            // unconditionally indexed a possibly-empty trailing document).
            if (sb.length() > 0) {
                field.setStringValue(sb.toString());
                writer.addDocument(d);
            }
        }
        System.out.println("Done!");
        return true;
    }
}