/**
* Copyright 2008 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package net.sf.katta.tool;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.HashSet;
import java.util.Random;
import java.util.Set;
import java.util.UUID;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
/**
* Generates a sample Lucene index whose documents contain randomly chosen
* words, for example for use in benchmarking.
*/
public class SampleIndexGenerator {

  /**
   * Creates a sample index whose vocabulary is the set of distinct words found
   * in a text file.
   *
   * @param input
   *          path of the text file the word list is read from
   * @param output
   *          parent directory in which the new index directory is created
   * @param wordsPerDoc
   *          number of randomly chosen words put into each document
   * @param indexSize
   *          number of documents to add to the index
   * @throws IllegalArgumentException
   *           if the input yields no words although documents with words were
   *           requested
   * @throws RuntimeException
   *           if the input cannot be read or the index cannot be written
   */
  public void createIndex(String input, String output, int wordsPerDoc, int indexSize) {
    createIndex(getWordList(input), output, wordsPerDoc, indexSize);
  }

  /**
   * Creates a sample index in a new sub directory of <code>output</code> named
   * <code>&lt;hostname&gt;-&lt;random UUID&gt;</code>. Every document gets a
   * not-analyzed "key" field (<code>&lt;hostname&gt;_&lt;doc number&gt;</code>)
   * and an analyzed "text" field made of <code>wordsPerDoc</code> randomly
   * picked words. After the index has been optimized and closed, an empty file
   * named "done" is created inside the index directory.
   *
   * @param wordList
   *          the words to pick from; must not be null, and must not be empty
   *          when at least one word has to be generated
   * @param output
   *          parent directory in which the new index directory is created
   * @param wordsPerDoc
   *          number of randomly chosen words put into each document
   * @param indexSize
   *          number of documents to add to the index
   * @throws IllegalArgumentException
   *           if <code>wordList</code> is null, or empty while words are needed
   * @throws RuntimeException
   *           if the local host name cannot be determined or the index cannot
   *           be written
   */
  public void createIndex(String[] wordList, String output, int wordsPerDoc, int indexSize) {
    if (wordList == null) {
      throw new IllegalArgumentException("wordList must not be null");
    }
    // Random.nextInt(0) would fail in the middle of indexing; reject the
    // input before anything is created on disk.
    if (wordList.length == 0 && wordsPerDoc > 0 && indexSize > 0) {
      throw new IllegalArgumentException("wordList must contain at least one word");
    }

    long startTime = System.currentTimeMillis();
    String hostname = getLocalHostName();
    File index = new File(output, hostname + "-" + UUID.randomUUID().toString());
    Random random = new Random(System.currentTimeMillis());
    try {
      IndexWriter indexWriter = new IndexWriter(FSDirectory.open(index), new StandardAnalyzer(Version.LUCENE_35), true,
          MaxFieldLength.UNLIMITED);
      try {
        for (int i = 0; i < indexSize; i++) {
          Document document = new Document();
          document.add(new Field("key", hostname + "_" + i, Store.NO, Index.NOT_ANALYZED));
          document.add(new Field("text", randomText(wordList, wordsPerDoc, random), Store.NO, Index.ANALYZED));
          indexWriter.addDocument(document);
        }
        indexWriter.optimize();
      } finally {
        // always close the writer, also when adding documents failed, so it
        // is not left open
        indexWriter.close();
      }
      System.out.println("Index created with : " + indexSize + " documents in "
          + (System.currentTimeMillis() - startTime) + " ms");
      // when the index is complete we write a done flag file that shell
      // scripts can use to identify that the index is ready.
      new File(index, "done").createNewFile();
    } catch (IOException e) {
      throw new RuntimeException("Unable to write index", e);
    }
  }

  /**
   * Returns the name of the local host.
   *
   * @throws RuntimeException
   *           if the local host name cannot be resolved
   */
  private String getLocalHostName() {
    try {
      return InetAddress.getLocalHost().getHostName();
    } catch (UnknownHostException e) {
      throw new RuntimeException("Unable to get localhostname", e);
    }
  }

  /**
   * Concatenates <code>wordCount</code> randomly picked words, each followed by
   * a single space.
   */
  private String randomText(String[] wordList, int wordCount, Random random) {
    StringBuilder text = new StringBuilder();
    for (int j = 0; j < wordCount; j++) {
      text.append(wordList[random.nextInt(wordList.length)]);
      text.append(" ");
    }
    return text.toString();
  }

  /**
   * Creates a dictionary of the distinct words contained in the input text.
   * Lines are split on whitespace; empty tokens are ignored.
   *
   * @param input
   *          path of the text file to read
   * @return the distinct words, in no particular order
   * @throws RuntimeException
   *           if the file cannot be read
   */
  private String[] getWordList(String input) {
    try {
      Set<String> hashSet = new HashSet<String>();
      BufferedReader in = new BufferedReader(new FileReader(input));
      try {
        String str;
        while ((str = in.readLine()) != null) {
          for (String word : str.split("\\s+")) {
            // blank lines and leading whitespace produce empty tokens
            if (word.length() > 0) {
              hashSet.add(word);
            }
          }
        }
      } finally {
        in.close();
      }
      return hashSet.toArray(new String[hashSet.size()]);
    } catch (IOException e) {
      throw new RuntimeException("Unable to read sample text", e);
    }
  }
}