package org.adsabs.lucene;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.payloads.SpanPayloadCheckQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BooleanQuery.Builder;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RegexpQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestRuleLimitSysouts;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
import org.apache.lucene.util.TestUtil;
@TestRuleLimitSysouts.Limit(bytes = 600000)
@SuppressCodecs({"Lucene3x", "SimpleText"})
public class BenchmarkAuthorSearch extends LuceneTestCase{
private IndexSearcher searcher;
private IndexReader reader;
private Directory dir;
private int numDocs = 10000;
private int numQueries = 100;
private boolean store = false;
private long maxTime = 60*1000; // max time benchmark is allowed to run
private ArrayList<ArrayList<Object>> timerStack = new ArrayList<ArrayList<Object>>();
private String[] names = {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "x",
"john", "jay", "giovanni", "alberto", "edwin", "michael"};
@Override
public void tearDown() throws Exception {
super.tearDown();
reader.close();
dir.close();
assertTrue(timerStack.size()==0);
}
@Override
public void setUp() throws Exception {
super.setUp();
startTimer("Buiding index of " + numDocs + " docs");
dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir,
newIndexWriterConfig(new MultiFieldAnalyzer())
.setMaxBufferedDocs(TestUtil.nextInt(random(), 50, 1000)));
Document doc = new Document();
FieldType customType = new FieldType(store ? TextField.TYPE_STORED : TextField.TYPE_NOT_STORED);
customType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
customType.setStoreTermVectors(true);
customType.setStoreTermVectorOffsets(true);
customType.setStoreTermVectorPayloads(true);
customType.setStoreTermVectorPositions(true);
Field id = newField("id", "", store ? StringField.TYPE_STORED : StringField.TYPE_NOT_STORED);
Field original = newField("original", "", StringField.TYPE_STORED);
Field regex = newField("regex", "", store ? StringField.TYPE_STORED : StringField.TYPE_NOT_STORED);
Field wildcard = newField("wildcard", "", store ? StringField.TYPE_STORED : StringField.TYPE_NOT_STORED);
Field payload = newField("vectrfield", "", customType);
Field n0 = newField("n0", "", store ? StringField.TYPE_STORED : StringField.TYPE_NOT_STORED);
Field n1 = newField("n1", "", store ? StringField.TYPE_STORED : StringField.TYPE_NOT_STORED);
Field n2 = newField("n2", "", store ? StringField.TYPE_STORED : StringField.TYPE_NOT_STORED);
Field n3 = newField("n3", "", store ? StringField.TYPE_STORED : StringField.TYPE_NOT_STORED);
Field n4 = newField("n4", "", store ? StringField.TYPE_STORED : StringField.TYPE_NOT_STORED);
doc.add(id);
doc.add(original);
doc.add(regex);
doc.add(wildcard);
doc.add(payload);
doc.add(n0);
doc.add(n1);
doc.add(n2);
doc.add(n3);
doc.add(n4);
Field[] nFields = {n0, n1, n2, n3, n4};
Field[] myFields = {id,original, regex, wildcard, payload, n0, n1, n2, n3, n4};
String surname;
for (int i = 0; i < numDocs; i++) {
for (Field f: myFields) {
f.setStringValue("");
}
StringBuilder name = new StringBuilder();
StringBuilder wild = new StringBuilder();
//surname
do {
surname = TestUtil.randomSimpleString(random()).toLowerCase().replace(",", "").trim();
} while (surname.length() == 0);
name.append(surname);
name.append(", ");
wild.append(surname);
wild.append(", ");
n0.setStringValue(surname);
//#initials
int noi = TestUtil.nextInt(random(), 0, 4);
for (int j = 0; j < noi; j++) {
String namePart = names[TestUtil.nextInt(random(), 0, names.length-1)];
name.append(namePart);
name.append(" ");
wild.append(namePart);
wild.append(j+1);
wild.append(" ");
nFields[j+1].setStringValue(namePart);
}
original.setStringValue(name.toString());
regex.setStringValue(name.toString());
wildcard.setStringValue(wild.toString());
payload.setStringValue(name.toString());
id.setStringValue(Integer.toString(i));
writer.addDocument(doc);
}
reader = writer.getReader();
writer.close();
searcher = newSearcher(reader);
stopTimer();
}
private void startTimer(String message) {
ArrayList<Object> l = new ArrayList<Object>();
l.add(System.currentTimeMillis());
l.add(message);
timerStack.add(l);
}
private long stopTimer() {
ArrayList<Object> l = timerStack.remove(timerStack.size()-1);
long endTime = System.currentTimeMillis();
long startTime = (Long) l.get(0);
String msg = (String) l.get(1);
long resTime = endTime - startTime;
StringBuilder out = new StringBuilder();
for (int i=0;i<timerStack.size();i++) {
out.append("\t");
}
out.append(resTime);
out.append("ms. " + msg);
System.out.println(out.toString());
return resTime;
}
private void appendToTimer(String msg) {
timerStack.get(timerStack.size()-1).set(1, (timerStack.get(timerStack.size()-1).get(1) + " -- " + msg));
}
/**
* This Analyzer uses an WhitespaceTokenizer and PayloadFilter, OR KeywordTokenizer for
* other queries
*/
private static class MultiFieldAnalyzer extends Analyzer {
public MultiFieldAnalyzer() {
super(PER_FIELD_REUSE_STRATEGY);
}
public MultiFieldAnalyzer(String field, byte[] data, int offset, int length) {
super(PER_FIELD_REUSE_STRATEGY);
}
@Override
public TokenStreamComponents createComponents(String fieldName) {
if (fieldName.contains("vectrfield")){
Tokenizer result = new MockTokenizer(MockTokenizer.SIMPLE, true);
return new TokenStreamComponents(result, new SimplePayloadFilter(result));
}
Tokenizer result = new MockTokenizer(MockTokenizer.KEYWORD, true);
return new TokenStreamComponents(result, result);
}
}
/**
* This Filter adds payloads to the tokens.
*/
static final class SimplePayloadFilter extends TokenFilter {
int pos;
final PayloadAttribute payloadAttr;
final CharTermAttribute termAttr;
public SimplePayloadFilter(TokenStream input) {
super(input);
pos = 0;
payloadAttr = input.addAttribute(PayloadAttribute.class);
termAttr = input.addAttribute(CharTermAttribute.class);
}
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
payloadAttr.setPayload(new BytesRef((Integer.toString(pos)).getBytes("UTF-8"))); //save the position 0 = surname,1,2,3,4....
pos++;
return true;
} else {
return false;
}
}
@Override
public void reset() throws IOException {
super.reset();
pos = 0;
}
}
public void testBenchMarkAll() throws Exception {
int[] randomIds = getRandomIds(100);
startTimer("Verifying data integrity with " + randomIds.length + " docs");
verifySearch(randomIds);
stopTimer();
startTimer("Preparing " + numQueries + " random queries");
randomIds = getRandomIds(numQueries);
List<TestCase> testCases = getIndexData(randomIds);
stopTimer();
ArrayList<Integer> totals = new ArrayList<Integer>();
System.out.println("\nExamples of queries:\n--------------------");
int e = 0;
int oldLen = 0;
for (int ii = 0; ii<10;ii++) {
if (testCases.get(ii).parts.length > oldLen) {
e = ii;
oldLen = testCases.get(ii).parts.length;
}
}
for (Query q: buildQueries(testCases.get(e).parts)) {
System.out.println(q);
}
System.out.println("");
startTimer("Regexp queries (new style, using \\w*)");
totals.add(runQueries(testCases, new QueryBuilder() {
public Query getQuery(TestCase t) throws Exception {
return getRegexpQuerySameAsRegex(t.parts, t.howMany, t.truncate);
}
}));
stopTimer();
startTimer("Regexp queries (new style, using [^\\s]*)");
totals.add(runQueries(testCases, new QueryBuilder() {
public Query getQuery(TestCase t) throws Exception {
return getRegexpQuery(t.parts, t.howMany, t.truncate);
}
}));
stopTimer();
startTimer("Wildcard queries");
totals.add(runQueries(testCases, new QueryBuilder() {
public Query getQuery(TestCase t) throws Exception {
return getWildcardQuery(t.parts, t.howMany, t.truncate);
}
}));
stopTimer();
startTimer("Boolean queries");
totals.add(runQueries(testCases, new QueryBuilder() {
public Query getQuery(TestCase t) throws Exception {
return getBooleanQuery(t.parts, t.howMany, t.truncate);
}
}));
stopTimer();
startTimer("Boolean queries (truncated)");
totals.add(runQueries(testCases, new QueryBuilder() {
public Query getQuery(TestCase t) throws Exception {
return getBooleanQuery(t.parts, t.howMany, true);
}
}));
stopTimer();
startTimer("Span queries");
totals.add(runQueries(testCases, new QueryBuilder() {
public Query getQuery(TestCase t) throws Exception {
return getSpanQuery(t.parts, t.howMany, false);
}
}));
stopTimer();
startTimer("Span queries (truncated)");
totals.add(runQueries(testCases, new QueryBuilder() {
public Query getQuery(TestCase t) throws Exception {
return getSpanQuery(t.parts, t.howMany, true);
}
}));
stopTimer();
startTimer("Payload queries");
totals.add(runQueries(testCases, new QueryBuilder() {
public Query getQuery(TestCase t) throws Exception {
return getPayloadQuery(t.parts, t.howMany, false);
}
}));
stopTimer();
startTimer("Payload queries (truncated)");
totals.add(runQueries(testCases, new QueryBuilder() {
public Query getQuery(TestCase t) throws Exception {
return getPayloadQuery(t.parts, t.howMany, true);
}
}));
stopTimer();
System.out.println("Totals: " + totals);
}
class QueryBuilder {
public Query getQuery(TestCase t) throws Exception {
return new TermQuery(new Term("original", t.original));
}
}
private int runQueries(List<TestCase> testCases, QueryBuilder builder) throws Exception {
long start = System.currentTimeMillis(); // TODO: make smarter
int total = 0;
int rounds = 0;
for (TestCase t: testCases) {
total += searcher.search(builder.getQuery(t), 1).totalHits;
rounds++;
if (rounds % 50 == 0 && (System.currentTimeMillis() - start) > maxTime) {
appendToTimer("Stopping execution, # queries finished: " + rounds);
return total;
}
}
return total;
}
class TestCase {
public String original;
public String[] parts;
public int howMany;
public boolean truncate = false;
TestCase(String original, String[] parts, int howMany) {
this.original = original;
this.parts = parts;
this.howMany = howMany;
}
}
private List<TestCase> getIndexData(int[] randomIds) throws IOException {
ArrayList<TestCase> data = new ArrayList<TestCase>(randomIds.length);
for (int i = 0; i < randomIds.length; i++) {
TopDocs docs = searcher.search(new TermQuery(new Term("id", Integer.toString(randomIds[i]))), 1);
Document doc = reader.document(docs.scoreDocs[0].doc);
String original = doc.get("original").toString();
String[] parts = original.split("\\,? ");
int howMany = TestUtil.nextInt(random(), 0, parts.length-1); // how many initials
if (howMany > 1)
data.add(new TestCase(original, parts, howMany));
}
return data;
}
private void verifySearch(int[] randomIds) throws IOException {
for (int i = 0; i < randomIds.length; i++) {
TopDocs docs = searcher.search(new TermQuery(new Term("id", Integer.toString(randomIds[i]))), 1);
if (docs.totalHits == 1) {
Document doc = reader.document(docs.scoreDocs[0].doc);
String original = doc.getField("original").stringValue();
String[] parts = original.split("\\,? ");
Query[] queries = buildQueries(parts);
if (queries == null)
continue;
TermQuery oq = new TermQuery(new Term("original", original));
int ho = searcher.search(oq, 1).totalHits;
for (Query q: queries) {
if (q == null) continue;
Builder bq = new BooleanQuery.Builder();
bq.add(q, Occur.MUST);
bq.add(new TermQuery(new Term("id", Integer.toString(randomIds[i]))), Occur.MUST);
if (q != null) {
//System.out.println(q.toString());
Query query = bq.build();
//System.out.println(query.toString());
//System.out.println("q: " + searcher.search(q, 10).totalHits);
int no = searcher.search(query, 1).totalHits;
if (no != 1) {
System.out.println("Results differ: " + oq + " <<>> " + q + " [" + ho + " : " + no + "]");
if (store == true) {
System.out.println("wildcard: \"" + doc.getField("wildcard").stringValue() + "\"");
System.out.println("regex: \"" + doc.getField("regex").stringValue() + "\"");
System.out.println("vectrfield: \"" + doc.getField("vectrfield").stringValue() + "\"");
System.out.println("n0: \"" + doc.getField("n0").stringValue() + "\"");
System.out.println("n1: \"" + doc.getField("n1").stringValue() + "\"");
System.out.println("n2: \"" + doc.getField("n2").stringValue() + "\"");
System.out.println("n3: \"" + doc.getField("n3").stringValue() + "\"");
System.out.println("n4: \"" + doc.getField("n4").stringValue() + "\"");
}
}
//assertEquals(ho, no);
}
}
}
}
}
private Query[] buildQueries(String[] parts) throws IOException {
if (parts.length - 1 < 3)
return null;
int howMany = TestUtil.nextInt(random(), 2, parts.length-1); // how many initials
if (howMany < 2)
return null;
Query[] queries = new Query[9];
queries[1] = getRegexpQuery(parts, howMany, false);
queries[2] = getWildcardQuery(parts, howMany, false);
queries[3] = getBooleanQuery(parts, howMany, false);
queries[4] = getBooleanQuery(parts, howMany, true);
queries[5] = getSpanQuery(parts, howMany, false);
queries[6] = getSpanQuery(parts, howMany, true);
queries[7] = getPayloadQuery(parts, howMany, false);
queries[8] = getPayloadQuery(parts, howMany, true);
return queries;
}
private Query getSpanQuery(String[] parts, int howMany, boolean truncate) throws UnsupportedEncodingException {
SpanQuery[] clauses = new SpanQuery[howMany+1];
clauses[0] = new SpanTermQuery(new Term("vectrfield", parts[0])); // surname
for (int i = 0; i < howMany; i++) {
if (truncate) {
SpanMultiTermQueryWrapper<WildcardQuery> q = new SpanMultiTermQueryWrapper<WildcardQuery>(new WildcardQuery(new Term("vectrfield", parts[i+1].substring(0, 1) + "*")));
clauses[i+1] = q;
}
else {
clauses[i+1] = new SpanTermQuery(new Term("vectrfield", parts[i+1]));
}
}
SpanNearQuery sq = new SpanNearQuery(clauses, 0, true); // match in order
return sq;
}
private Query getPayloadQuery(String[] parts, int howMany, boolean truncate) throws IOException {
List<BytesRef> payloads = new ArrayList<BytesRef>(howMany+1);
BytesRef pay = new BytesRef((Integer.toString(0)).getBytes("UTF-8"));
payloads.add(pay);
SpanQuery[] clauses = new SpanQuery[howMany+1];
clauses[0] = new SpanTermQuery(new Term("vectrfield", parts[0])); // surname
for (int i = 0; i < howMany; i++) {
if (truncate) {
SpanMultiTermQueryWrapper<WildcardQuery> q = new SpanMultiTermQueryWrapper<WildcardQuery>(new WildcardQuery(new Term("vectrfield", parts[i+1].substring(0, 1) + "*")));
clauses[i+1] = (SpanQuery) q.rewrite(searcher.getIndexReader());
//clauses[i+1] = new SpanMultiTermQueryWrapper<PrefixQuery>(new PrefixQuery(new Term("vectrfield", parts[i+1].substring(0, 1))));
}
else {
clauses[i+1] = new SpanTermQuery(new Term("vectrfield", parts[i+1]));
}
payloads.add(new BytesRef((Integer.toString(i+1)).getBytes("UTF-8")));
}
SpanNearQuery sq = new SpanNearQuery(clauses, 1, true); // match in order
return new SpanPayloadCheckQuery(sq, payloads);
}
private Query getBooleanQuery(String[] parts, int howMany, boolean truncate) {
Builder bq = new BooleanQuery.Builder();
bq.add(new TermQuery(new Term("n0", parts[0])), BooleanClause.Occur.MUST);
for (int i = 1; i < howMany+1; i++) {
if (truncate) {
bq.add(new WildcardQuery(new Term("n"+i, parts[i].substring(0,1) + "*")), BooleanClause.Occur.MUST);
}
else {
bq.add(new TermQuery(new Term("n"+i, parts[i])), BooleanClause.Occur.MUST);
}
}
return bq.build();
}
private Query getWildcardQuery(String[] parts, int howMany, boolean truncate) {
return new WildcardQuery(new Term("wildcard", getWildcardQueryString(parts, howMany, truncate)));
}
private String getWildcardQueryString(String[] parts, int howMany, boolean truncate) {
StringBuilder p = new StringBuilder();
p.append(parts[0]);
p.append(", ");
int i = 0;
for (; i < howMany && parts.length > i; i++) {
String x = truncate ? parts[i+1].substring(0, 1) : parts[i+1];
p.append(x + "*" + (i+1) + " ");
}
if (parts.length > i) {
p.append("*");
}
return p.toString();
}
private Query getRegexpQuerySameAsRegex(String[] parts, int howMany, boolean truncate) {
return new RegexpQuery(new Term("regex", getRegexQueryString(parts, howMany, truncate)));
}
private Query getRegexpQuery(String[] parts, int howMany, boolean truncate) {
return new RegexpQuery(new Term("regex", getRegexpQueryString(parts, howMany, truncate)));
}
private String getRegexpQueryString(String[] parts, int howMany, boolean truncate) {
StringBuilder p = new StringBuilder();
p.append(parts[0]);
p.append(", ");
int i = 0;
for (; i < howMany && parts.length > i; i++) {
String x = truncate ? parts[i+1].substring(0, 1) : parts[i+1];
p.append(x + "[^\\s]* ");
}
if (parts.length > i) {
p.append(".*");
}
return p.toString();
}
private String getRegexQueryString(String[] parts, int howMany, boolean truncate) {
StringBuilder p = new StringBuilder();
p.append(parts[0]);
p.append(", ");
int i = 0;
for (; i < howMany && parts.length > i; i++) {
String x = truncate ? parts[i+1].substring(0, 1) : parts[i+1];
p.append(x + "\\w* ");
}
if (parts.length > i) {
p.append(".*");
}
return p.toString();
}
private int[] getRandomIds(int i) {
int[] randomIds = new int[Math.min(numDocs, i)];
for (int j = 0; j < randomIds.length; j++) {
randomIds[j] = TestUtil.nextInt(random(), 0, numDocs-1);
}
return randomIds;
}
}