package perf;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.Random;
import java.util.Set;

import org.apache.lucene.index.*;
import org.apache.lucene.search.spell.*;
import org.apache.lucene.store.*;
import org.apache.lucene.util.*;
// inspired by: http://code.google.com/a/apache-extras.org/p/luceneutil/source/browse/perf/CreateQueries.java
// TODO
// - maybe run the query and if it produces too few results, nuke it? (eg AndHighMed)
// - must dedup -- make sure no query is repeated
// - would be nice to do 3-word phrase queries too?
public class CreateQueries {
// Simple holder pairing a term with its document frequency (or, for the
// fuzzy/respell pass, a derived score stored in the same df slot).
public static class TermFreq {
BytesRef term;
long df;
public TermFreq(BytesRef term, long df) {
// Deep copy: the incoming BytesRef is typically owned/reused by a TermsEnum.
this.term = BytesRef.deepCopyOf(term);
this.df = df;
}
}
// Bounded min-heap (Lucene PriorityQueue) keeping the maxSize terms with the
// highest df; insertWithOverflow evicts the current smallest when full.
private static class MostFrequentTerms extends PriorityQueue<TermFreq> {
public MostFrequentTerms(int maxSize) {
// second arg: do not prepopulate with sentinel entries
super(maxSize, false);
}
@Override
protected boolean lessThan(TermFreq tf1, TermFreq tf2) {
// Order by df only, so pop() yields terms in ascending df order.
return tf1.df < tf2.df;
}
}
// Number of queries for each type (ie 500 BooleanOrQuery, 500 PhraseQuery, etc.):
private final static int NUM_QUERIES = 500;
// How many top terms (by docFreq) to pull from the index in the first pass:
private final static int TOP_N = 50000;
/**
 * Entry point: mines the given shingled index for high/medium/low frequency
 * terms and shingles, then writes one benchmark query per line to the output
 * file (term, prefix, wildcard, NRQ, and/or, phrase, fuzzy, respell).
 *
 * Args: indexPath fieldName queriesFileOut
 */
public static void main(String[] args) throws IOException {
  if (args.length != 3) {
    System.out.println();
    System.out.println("Usage: java perf.CreateQueries /path/to/shingled/index fieldName queriesFileOut");
    System.exit(1);
  }
  final String indexPath = args[0];
  final String field = args[1];
  final String queriesFileOut = args[2];

  // try-with-resources: the original leaked the writer/reader/dir on any
  // exception thrown mid-run. StandardCharsets.UTF_8 replaces the "UTF8"
  // charset-name string (no UnsupportedEncodingException path, same encoding).
  try (BufferedWriter queriesOut = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(queriesFileOut), StandardCharsets.UTF_8));
       Directory dir = FSDirectory.open(new File(indexPath).toPath());
       IndexReader r = DirectoryReader.open(dir)) {

    System.out.println("\nFind top df terms...");

    // First pass: get high/medium/low freq terms, sorted by descending df:
    final TermFreq[] topTerms = getTopTermsByDocFreq(r, field, TOP_N, false);
    final long maxDF = topTerms[0].df;

    // Emit single-term queries, bucketed by df relative to the top term.
    // counter resets at each bucket boundary so each bucket gets up to
    // NUM_QUERIES queries (the original hard-coded 500 here; use the constant):
    int counter = 0;
    String prefix = "High";
    for (int idx = 0; idx < topTerms.length; idx++) {
      final TermFreq tf = topTerms[idx];
      if (tf.df >= maxDF / 10) {
        prefix = "High";
      } else if (tf.df >= maxDF / 100) {
        if (!prefix.equals("Med")) {
          counter = 0;
        }
        prefix = "Med";
      } else {
        if (!prefix.equals("Low")) {
          counter = 0;
        }
        prefix = "Low";
      }
      if (counter++ < NUM_QUERIES) {
        queriesOut.write(prefix + "Term" + ": " + tf.term.utf8ToString() + " # freq=" + tf.df + "\n");
      }
    }

    // Slice topTerms (already sorted by df desc) into the three buckets.
    // Index 0 always qualifies as high (df == maxDF), so start scanning at 1:
    int upto = 1;
    while (topTerms[upto].df >= maxDF / 10) {
      upto++;
    }
    final TermFreq[] highFreqTerms = new TermFreq[upto];
    System.arraycopy(topTerms, 0, highFreqTerms, 0, highFreqTerms.length);

    while (topTerms[upto].df >= maxDF / 100) {
      upto++;
    }
    final TermFreq[] mediumFreqTerms = new TermFreq[upto - highFreqTerms.length];
    System.arraycopy(topTerms, highFreqTerms.length, mediumFreqTerms, 0, mediumFreqTerms.length);

    // Low bucket: scan backwards past terms below maxDF/1000:
    int downTo = topTerms.length - 1;
    while (topTerms[downTo].df < maxDF / 1000) {
      downTo--;
    }
    downTo++;
    final TermFreq[] lowFreqTerms = new TermFreq[topTerms.length - downTo];
    System.arraycopy(topTerms, downTo, lowFreqTerms, 0, lowFreqTerms.length);

    // Fixed seed so the generated queries file is reproducible:
    final Random random = new Random(1742);

    System.out.println("  " + highFreqTerms.length + " high freq terms");
    System.out.println("  " + mediumFreqTerms.length + " medium freq terms");
    System.out.println("  " + lowFreqTerms.length + " low freq terms");

    makePrefixQueries(mediumFreqTerms, queriesOut);
    makeNRQs(random, queriesOut);
    makeAndOrQueries(random, highFreqTerms, mediumFreqTerms, lowFreqTerms, queriesOut);
    makeWildcardQueries(topTerms, queriesOut);
    processShingles(r, field, queriesOut);
    makeFuzzyAndRespellQueries(r, field, topTerms, queriesOut);
  }
}
/**
 * Writes NUM_QUERIES numeric range queries over seconds-in-the-day [0, 86400).
 * Each range has a random width of at least 30000 seconds.
 */
private static void makeNRQs(Random random, Writer queriesOut) throws IOException {
  for (int i = 0; i < NUM_QUERIES; i++) {
    // Range width in [30000, 86400):
    final int width = 30000 + random.nextInt(56400);
    // Start point chosen so the whole range stays inside the day:
    final int lo = random.nextInt(86400 - width);
    final int hi = lo + width;
    queriesOut.write("IntNRQ: nrq//timesecnum " + lo + " " + hi + "\n");
  }
  queriesOut.flush();
}
/**
 * Writes NUM_QUERIES unique 3-character prefix queries ("Prefix3: abc*") taken
 * from the given terms, in order. Terms shorter than 3 chars are skipped.
 *
 * @throws RuntimeException if the terms run out before NUM_QUERIES unique
 *         prefixes are found
 */
private static void makePrefixQueries(TermFreq[] terms, Writer queriesOut) throws IOException {
  // Fix: dropped the unused local `lastCount` from the original.
  final Set<String> seen = new HashSet<String>();
  int idx = 0;
  while (seen.size() < NUM_QUERIES) {
    if (idx == terms.length) {
      throw new RuntimeException("not enough unique prefixes");
    }
    final String term = terms[idx++].term.utf8ToString();
    if (term.length() >= 3) {
      final String pref = term.substring(0, 3);
      // Set.add returns false for duplicates, so each prefix is written once:
      if (seen.add(pref)) {
        queriesOut.write("Prefix3: " + pref + "*\n");
      }
    }
  }
  queriesOut.flush();
}
/**
 * Writes NUM_QUERIES unique wildcard queries of the form "ab*z" (first two
 * chars + "*" + last char) taken from the given terms, in order. Terms shorter
 * than 3 chars are skipped.
 *
 * @throws RuntimeException if the terms run out before NUM_QUERIES unique
 *         patterns are found
 */
private static void makeWildcardQueries(TermFreq[] terms, Writer queriesOut) throws IOException {
  // Fixes vs original: removed unused local `lastCount`; corrected the
  // copy-pasted error message (it said "prefixes").
  final Set<String> seen = new HashSet<String>();
  int idx = 0;
  while (seen.size() < NUM_QUERIES) {
    if (idx == terms.length) {
      throw new RuntimeException("not enough unique wildcard patterns");
    }
    final String term = terms[idx++].term.utf8ToString();
    if (term.length() >= 3) {
      final String wc = term.substring(0, 2) + "*" + term.substring(term.length() - 1);
      if (seen.add(wc)) {
        queriesOut.write("Wildcard: " + wc + "\n");
      }
    }
  }
  queriesOut.flush();
}
/**
 * Finds the NUM_QUERIES terms with the most/most-popular spell suggestions and
 * writes a Fuzzy1, Fuzzy2 and Respell query for each. A term's score is
 * log(sum of suggestion freqs) * number of suggestions, favoring terms with
 * many suggestions.
 *
 * @throws RuntimeException if fewer than NUM_QUERIES candidate terms are found
 */
private static void makeFuzzyAndRespellQueries(IndexReader r, String field, TermFreq[] topTerms, Writer queriesOut) throws IOException {
  // Fix vs original: removed the unused local `downTo` before the drain loop.
  System.out.println("\nFind top fuzzy/respell terms...");
  final DirectSpellChecker spellChecker = new DirectSpellChecker();
  spellChecker.setThresholdFrequency(1.0f);
  final MostFrequentTerms pq = new MostFrequentTerms(NUM_QUERIES);

  // TODO: use threads...?
  int count = 0;
  for (TermFreq tdf : topTerms) {
    if ((++count) % 1000 == 0) {
      System.out.println("  " + count + " of " + topTerms.length + "...");
    }
    // Skip very short terms; they make poor fuzzy/respell targets:
    if (tdf.term.length < 5) {
      continue;
    }
    // TODO: make my own fuzzy enum?
    long sumDF = 0;
    SuggestWord[] suggested = spellChecker.suggestSimilar(new Term(field, tdf.term), 50, r, SuggestMode.SUGGEST_MORE_POPULAR);
    if (suggested.length < 5) {
      continue;
    }
    for (SuggestWord suggest : suggested) {
      sumDF += suggest.freq;
    }
    // Strongly favor higher number of suggestions and gently favor higher sumDF:
    final long score = (long) (Math.log(sumDF) * suggested.length);
    final TermFreq newTF = new TermFreq(tdf.term, score);
    final TermFreq bumpedTF = pq.insertWithOverflow(newTF);
    // insertWithOverflow returns the argument itself iff it was NOT inserted,
    // so this prints only the terms that made it into the queue:
    if (bumpedTF != newTF) {
      System.out.println("  " + newTF.term.utf8ToString() + " score=" + score + " suggestCount=" + suggested.length);
    }
  }

  if (pq.size() < NUM_QUERIES) {
    throw new RuntimeException("index is too small: only " + pq.size() + " top fuzzy terms");
  }

  // Drain the queue (ascending score) and emit the three query flavors:
  while (pq.size() > 0) {
    TermFreq tdf = pq.pop();
    System.out.println("  " + tdf.term.utf8ToString() + " freq=" + tdf.df);
    queriesOut.write("Fuzzy1: " + tdf.term.utf8ToString() + "~1\n");
    queriesOut.write("Fuzzy2: " + tdf.term.utf8ToString() + "~2\n");
    queriesOut.write("Respell: " + tdf.term.utf8ToString() + "\n");
  }
  queriesOut.flush();
}
/**
 * Writes NUM_QUERIES two-term boolean queries for each of six categories:
 * And/Or crossed with HighHigh, HighMed, HighLow. AND queries prefix each term
 * with '+'; AndHighHigh additionally records the df ratio of its two terms.
 * Queries are deduped across all six categories via a shared seen-set.
 *
 * The original inlined six nearly identical loops; they are factored into two
 * helpers below. Random-call order and output bytes are unchanged.
 */
private static void makeAndOrQueries(Random random, TermFreq[] highFreqTerms, TermFreq[] mediumFreqTerms, TermFreq[] lowFreqTerms, Writer queriesOut) throws IOException {
  final Set<String> seen = new HashSet<String>();
  writeHighHighQueries(random, highFreqTerms, seen, "+", "AndHighHigh", true, queriesOut);
  writeHighOtherQueries(random, highFreqTerms, mediumFreqTerms, seen, "+", "AndHighMed", queriesOut);
  writeHighOtherQueries(random, highFreqTerms, lowFreqTerms, seen, "+", "AndHighLow", queriesOut);
  writeHighHighQueries(random, highFreqTerms, seen, "", "OrHighHigh", false, queriesOut);
  writeHighOtherQueries(random, highFreqTerms, mediumFreqTerms, seen, "", "OrHighMed", queriesOut);
  writeHighOtherQueries(random, highFreqTerms, lowFreqTerms, seen, "", "OrHighLow", queriesOut);
  queriesOut.flush();
}

/** Emits NUM_QUERIES unique queries pairing two DISTINCT high-freq terms (indices normalized so idx1 < idx2). */
private static void writeHighHighQueries(Random random, TermFreq[] highFreqTerms, Set<String> seen, String occur, String label, boolean writeRatio, Writer queriesOut) throws IOException {
  int count = 0;
  while (count < NUM_QUERIES) {
    int idx1 = random.nextInt(highFreqTerms.length);
    int idx2 = idx1;
    // Re-roll until the second index differs from the first:
    while (idx2 == idx1) {
      idx2 = random.nextInt(highFreqTerms.length);
    }
    // Canonicalize order so (a,b) and (b,a) dedupe to the same query:
    if (idx1 > idx2) {
      final int sav = idx1;
      idx1 = idx2;
      idx2 = sav;
    }
    final TermFreq t1 = highFreqTerms[idx1];
    final TermFreq t2 = highFreqTerms[idx2];
    final String query = occur + t1.term.utf8ToString() + " " + occur + t2.term.utf8ToString();
    if (seen.add(query)) {
      count++;
      if (writeRatio) {
        queriesOut.write(label + ": " + query + " # freq=" + t1.df + " freq=" + t2.df + " " + String.format("%.1f", ((float) t1.df) / t2.df) + "\n");
      } else {
        queriesOut.write(label + ": " + query + " # freq=" + t1.df + " freq=" + t2.df + "\n");
      }
    }
  }
}

/** Emits NUM_QUERIES unique queries pairing one high-freq term with one term from another bucket. */
private static void writeHighOtherQueries(Random random, TermFreq[] highFreqTerms, TermFreq[] otherTerms, Set<String> seen, String occur, String label, Writer queriesOut) throws IOException {
  int count = 0;
  while (count < NUM_QUERIES) {
    final TermFreq high = highFreqTerms[random.nextInt(highFreqTerms.length)];
    final TermFreq other = otherTerms[random.nextInt(otherTerms.length)];
    final String query = occur + high.term.utf8ToString() + " " + occur + other.term.utf8ToString();
    if (seen.add(query)) {
      count++;
      queriesOut.write(label + ": " + query + " # freq=" + high.df + " freq=" + other.df + "\n");
    }
  }
}
/**
 * Returns the topN terms of the field sorted by descending docFreq.
 * When doShingles is true only two-word shingle terms are considered,
 * otherwise only single-word terms; shingles touching a position hole
 * ("_ x" / "x _") and terms containing ':' are always skipped.
 *
 * @throws RuntimeException if the field is missing or has fewer than topN
 *         qualifying terms
 */
public static TermFreq[] getTopTermsByDocFreq(IndexReader r, String field, int topN, boolean doShingles) throws IOException {
  final Terms terms = MultiFields.getTerms(r, field);
  if (terms == null) {
    throw new RuntimeException("field '" + field + "' does not exist");
  }
  final MostFrequentTerms pq = new MostFrequentTerms(topN);
  final TermsEnum termsEnum = terms.iterator();
  while (termsEnum.next() != null) {
    final String text = termsEnum.term().utf8ToString();
    if (text.indexOf(':') != -1) {
      continue;
    }
    final boolean isShingle = text.indexOf(' ') != -1;
    if (isShingle && (text.startsWith("_ ") || text.endsWith(" _"))) {
      // A hole!
      continue;
    }
    if (isShingle == doShingles) {
      pq.insertWithOverflow(new TermFreq(termsEnum.term(), termsEnum.docFreq()));
    }
  }
  if (pq.size() < topN) {
    throw new RuntimeException("index is too small: only " + pq.size() + " unique terms");
  }
  // pq pops ascending by df; fill the array back-to-front for descending order:
  final TermFreq[] result = new TermFreq[topN];
  for (int slot = topN - 1; pq.size() > 0; slot--) {
    result[slot] = pq.pop();
  }
  return result;
}
/**
 * Writes Phrase, SloppyPhrase and SpanNear queries for the top two-word
 * shingles, bucketed into High (df >= top/10), Med (>= top/100) and
 * Low (>= top/1000) relative to the most frequent shingle, with up to
 * NUM_QUERIES shingles per bucket.
 */
private static void processShingles(IndexReader r, String field, Writer queriesOut) throws IOException {
  System.out.println("\nFind phrase queries...");
  // First pass: get high/medium/low freq shingles, sorted by descending df:
  final TermFreq[] topShingles = getTopTermsByDocFreq(r, field, TOP_N, true);
  final long topDF = topShingles[0].df;
  int upto = 0;
  upto = writeShingleQueries(r, field, topShingles, upto, topDF / 10, "High", queriesOut);
  upto = writeShingleQueries(r, field, topShingles, upto, topDF / 100, "Med", queriesOut);
  writeShingleQueries(r, field, topShingles, upto, topDF / 1000, "Low", queriesOut);
  queriesOut.flush();
}

/**
 * Emits up to NUM_QUERIES Phrase/SloppyPhrase/SpanNear query triples for
 * shingles at index >= upto whose df meets minDF; returns the next unconsumed
 * index. Replaces three copy-pasted loops in the original, and adds the
 * missing bounds check: the original indexed topShingles[upto] in its loop
 * conditions and could run past the end of the array when every remaining
 * shingle met the threshold.
 *
 * @throws RuntimeException if a shingle does not split into exactly two terms
 */
private static int writeShingleQueries(IndexReader r, String field, TermFreq[] topShingles, int upto, long minDF, String prefix, Writer queriesOut) throws IOException {
  int counter = 0;
  while (upto < topShingles.length && topShingles[upto].df >= minDF && counter < NUM_QUERIES) {
    final TermFreq tf = topShingles[upto];
    final String[] terms = tf.term.utf8ToString().split(" ");
    if (terms.length != 2) {
      throw new RuntimeException("expected two terms from " + tf.term.utf8ToString());
    }
    // Also record each word's individual df in the comment for analysis:
    final int df1 = r.docFreq(new Term(field, terms[0]));
    final int df2 = r.docFreq(new Term(field, terms[1]));
    final String stats = " # freq=" + tf.df + "|" + df1 + "|" + df2 + "\n";
    queriesOut.write(prefix + "Phrase: \"" + tf.term.utf8ToString() + "\"" + stats);
    queriesOut.write(prefix + "SloppyPhrase: \"" + tf.term.utf8ToString() + "\"~4" + stats);
    queriesOut.write(prefix + "SpanNear: near//" + tf.term.utf8ToString() + stats);
    upto++;
    counter++;
  }
  return upto;
}
}