package com.senseidb.clue.test;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileSystems;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DoubleDocValuesField;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.json.JSONObject;
/**
 * Command-line tool that builds a sample Lucene index from a text file containing one JSON
 * object per line.
 *
 * <p>Usage: {@code BuildSampleIndex source_file index_dir}
 */
public class BuildSampleIndex {

  /** Name of the analyzed full-text field. */
  static final String CONTENTS_FIELD = "contents";

  /**
   * Field type shared by every {@code tags_payload} field: tokenized, norms kept, indexed with
   * positions and offsets, and with full term vectors (offsets, payloads, positions).
   * Built once and frozen because it is identical for every document.
   */
  private static final FieldType TAGS_PAYLOAD_TYPE = createTagsPayloadType();

  private static FieldType createTagsPayloadType() {
    FieldType ft = new FieldType();
    ft.setOmitNorms(false);
    ft.setTokenized(true);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorOffsets(true);
    ft.setStoreTermVectorPayloads(true);
    ft.setStoreTermVectorPositions(true);
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    ft.freeze();
    return ft;
  }

  /**
   * Adds a string attribute to {@code doc} twice: as a sorted doc value under {@code field} and
   * as a stored, un-analyzed term under {@code field + "_indexed"}.
   *
   * @param doc document to add the fields to
   * @param field base field name
   * @param value attribute value; when {@code null} nothing is added
   */
  static void addMetaString(Document doc, String field, String value) {
    if (value != null) {
      doc.add(new SortedDocValuesField(field, new BytesRef(value)));
      doc.add(new StringField(field + "_indexed", value, Store.YES));
    }
  }

  /**
   * Converts one JSON record into a Lucene document.
   *
   * <p>{@code id} is required. {@code price}, {@code contents}, {@code year} and
   * {@code mileage} are always added, using the {@code opt*} defaults of {@link JSONObject}
   * when the key is absent. The string attributes ({@code color}, {@code category},
   * {@code makemodel}, {@code city}) and {@code tags} are added only when present.
   *
   * @param json the source record
   * @return the document to index
   * @throws Exception if {@code id} is missing or not convertible to a long, or if building a
   *     field fails
   */
  static Document buildDoc(JSONObject json) throws Exception {
    Document doc = new Document();
    doc.add(new NumericDocValuesField("id", json.getLong("id")));
    doc.add(new DoubleDocValuesField("price", json.optDouble("price")));
    doc.add(new TextField(CONTENTS_FIELD, json.optString(CONTENTS_FIELD), Store.NO));
    doc.add(new NumericDocValuesField("year", json.optInt("year")));
    doc.add(new NumericDocValuesField("mileage", json.optInt("mileage")));

    // optString(key) yields "" for a missing key, which would defeat the null check in
    // addMetaString and index an empty value; ask for null explicitly instead.
    addMetaString(doc, "color", json.optString("color", null));
    addMetaString(doc, "category", json.optString("category", null));
    addMetaString(doc, "makemodel", json.optString("makemodel", null));
    addMetaString(doc, "city", json.optString("city", null));

    // Same reasoning as above: a missing "tags" key must not produce a single empty tag.
    String tagsString = json.optString("tags", null);
    if (tagsString != null) {
      for (String part : tagsString.split(",")) {
        doc.add(new SortedSetDocValuesField("tags", new BytesRef(part)));
        doc.add(new StringField("tags_indexed", part, Store.NO));
      }
      doc.add(new Field("tags_payload", new PayloadTokenizer(tagsString), TAGS_PAYLOAD_TYPE));
    }

    // Keep the raw record alongside the parsed fields.
    doc.add(new BinaryDocValuesField("json", new BytesRef(json.toString())));
    return doc;
  }

  /**
   * Reads {@code args[0]} line by line, indexes each non-blank line as a JSON document into the
   * index directory {@code args[1]}, and prints a progress dot every 100 documents.
   *
   * @param args {@code source_file index_dir}; the usage line is printed and nothing is indexed
   *     when the argument count is not exactly two
   * @throws Exception if reading the source file, parsing a line, or writing the index fails
   */
  public static void main(String[] args) throws Exception {
    if (args.length != 2) {
      System.out.println("usage: source_file index_dir");
      // Stop here; continuing would fail on args[0] / args[1].
      return;
    }

    File f = new File(args[0]);
    IndexWriterConfig idxWriterConfig = new IndexWriterConfig(new StandardAnalyzer());
    int count = 0;

    // try-with-resources closes the reader, directory and writer even when indexing fails.
    // NOTE(review): IndexWriter.close() commits pending changes by default, so a failure
    // part-way through may leave a partial index on disk.
    try (BufferedReader reader =
            new BufferedReader(
                // Decode explicitly as UTF-8 instead of relying on the platform default.
                new InputStreamReader(new FileInputStream(f), StandardCharsets.UTF_8));
        Directory dir = FSDirectory.open(FileSystems.getDefault().getPath(args[1]));
        IndexWriter writer = new IndexWriter(dir, idxWriterConfig)) {
      String line;
      while ((line = reader.readLine()) != null) {
        if (line.trim().isEmpty()) {
          // Tolerate blank lines (e.g. a trailing newline) rather than failing to parse them.
          continue;
        }
        writer.addDocument(buildDoc(new JSONObject(line)));
        count++;
        if (count % 100 == 0) {
          System.out.print(".");
        }
      }
      writer.commit();
    }
    System.out.println(count + " docs indexed");
  }
}