/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.tallison.lucene.syns;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.tallison.lucene.syns.contextifiers.Contextifier;
import org.tallison.lucene.syns.contextifiers.PostContextifier;
import org.tallison.lucene.syns.contextifiers.PreContextifier;

/**
 * Builds a "context" index from an existing n-gram index: every term in the
 * n-gram field (with docFreq at or above a threshold) is expanded into pre-
 * and post-context strings, batches of which are written as documents to the
 * context index named in the {@link SyntacticSynsConfig}.
 */
class ContextIndexBuilder {

    /** Cap on tokens analyzed per field (enforced by LimitTokenCountAnalyzer). */
    private static final int MAX_FIELD_LENGTH = 10000;

    /** A term (and its context) must appear in at least this many documents. */
    private static final int MIN_DOC_FREQ = 1;

    /** Number of qualifying terms batched into one context document. */
    private static final int TERMS_PER_DOCUMENT = 4000;

    /** RAM buffer for the context IndexWriter, in MB. */
    private static final double RAM_BUFFER_MB = 150.0;

    public static void main(String[] args) throws IOException {
        // Intentionally a no-op: this builder is driven programmatically
        // via execute(SyntacticSynsConfig). Kept for launcher compatibility.
    }

    /**
     * Reads every term from the n-gram index and writes batched context
     * documents to the context index (created fresh: OpenMode.CREATE).
     *
     * @param synsConfig supplies the n-gram index path, the context index
     *                   path, and the field names for both indexes
     * @return {@code true} on success; {@code false} if the n-gram field
     *         contains no terms
     * @throws IOException on index read or write failure
     */
    public boolean execute(SyntacticSynsConfig synsConfig) throws IOException {
        Analyzer analyzer =
                new LimitTokenCountAnalyzer(new WhitespaceAnalyzer(), MAX_FIELD_LENGTH);
        IndexWriterConfig config = new IndexWriterConfig(analyzer);
        config.setOpenMode(OpenMode.CREATE);
        config.setRAMBufferSizeMB(RAM_BUFFER_MB);

        // try-with-resources fixes the original resource leak: the reader was
        // never closed, and neither reader nor writer was closed if an
        // exception escaped mid-build.
        try (IndexWriter writer = new IndexWriter(
                     FSDirectory.open(synsConfig.getContextIndex()), config);
             IndexReader reader = DirectoryReader.open(
                     FSDirectory.open(synsConfig.getNGramIndex()))) {

            Terms terms = MultiFields.getTerms(reader, SyntacticSynsConfig.getNgramField());
            if (terms == null) {
                // MultiFields.getTerms returns null when the field is absent;
                // the original code would have thrown a NullPointerException here.
                System.err.println("No terms found for field: "
                        + SyntacticSynsConfig.getNgramField());
                return false;
            }

            Contextifier preContext = new PreContextifier();
            Contextifier postContext = new PostContextifier();

            // A single Document/Field pair is deliberately reused across
            // addDocument() calls; only the field's string value changes.
            Document doc = new Document();
            TextField field = new TextField(
                    SyntacticSynsConfig.getContextField(), "", Field.Store.NO);
            doc.add(field);

            StringBuilder sb = new StringBuilder();
            int batched = 0; // qualifying terms accumulated in sb
            int seen = 0;    // total terms visited (progress reporting only)

            TermsEnum termEnum = terms.iterator();
            for (BytesRef ref = termEnum.next(); ref != null; ref = termEnum.next()) {
                if (termEnum.docFreq() >= MIN_DOC_FREQ) {
                    String txt = ref.utf8ToString();
                    int freq = termEnum.docFreq();
                    sb.append(preContext.convert(txt, freq)).append(' ');
                    sb.append(postContext.convert(txt, freq)).append(' ');
                    batched++;
                    if (batched > TERMS_PER_DOCUMENT) {
                        System.out.println("stopping to index doc " + seen);
                        field.setStringValue(sb.toString());
                        writer.addDocument(doc);
                        sb.setLength(0);
                        batched = 0;
                    }
                }
                seen++;
            }

            // Flush the final partial batch. Skipped when empty so we no
            // longer index a blank document (the original always added one).
            if (sb.length() > 0) {
                field.setStringValue(sb.toString());
                writer.addDocument(doc);
            }
            System.out.println("now I must optimize");
        }
        System.out.println("Done!");
        return true;
    }
}