package org.apache.lucene.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.grouping.GroupDocs;
import org.apache.lucene.search.grouping.TopGroups;
import org.apache.lucene.search.join.BlockJoinCollector;
import org.apache.lucene.search.join.BlockJoinQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
/**
 * Tests {@code BlockJoinQuery}/{@code BlockJoinCollector}: child documents are
 * indexed in a contiguous block with their parent document appended LAST, a
 * Filter identifies the parent docs, and the join query maps child matches up
 * to their parent. {@code testRandom} cross-checks the block-join index against
 * a fully denormalized index that must produce the same hits.
 */
public class TestBlockJoin extends LuceneTestCase {

  // One resume (a parent document)...
  private Document makeResume(String name, String country) {
    Document resume = new Document();
    // docType=resume is the marker field the parents Filter keys on:
    resume.add(newField("docType", "resume", Field.Index.NOT_ANALYZED));
    resume.add(newField("name", name, Field.Store.YES, Field.Index.NOT_ANALYZED));
    resume.add(newField("country", country, Field.Index.NOT_ANALYZED));
    return resume;
  }

  // ... has multiple jobs (child documents)
  private Document makeJob(String skill, int year) {
    Document job = new Document();
    job.add(newField("skill", skill, Field.Store.YES, Field.Index.NOT_ANALYZED));
    job.add(new NumericField("year").setIntValue(year));
    return job;
  }

  // ... has multiple qualifications (a second child type sharing the "year" field)
  private Document makeQualification(String qualification, int year) {
    Document job = new Document();
    job.add(newField("qualification", qualification, Field.Store.YES, Field.Index.NOT_ANALYZED));
    job.add(new NumericField("year").setIntValue(year));
    return job;
  }

  /**
   * Indexes two resume blocks (parent last within each block, via
   * addDocuments) and verifies a joined child+parent query finds exactly
   * Lisa's java job.
   */
  public void testSimple() throws Exception {

    final Directory dir = newDirectory();
    final RandomIndexWriter w = new RandomIndexWriter(random, dir);

    final List<Document> docs = new ArrayList<Document>();

    docs.add(makeJob("java", 2007));
    docs.add(makeJob("python", 2010));
    docs.add(makeResume("Lisa", "United Kingdom"));
    w.addDocuments(docs);

    docs.clear();
    docs.add(makeJob("ruby", 2005));
    docs.add(makeJob("java", 2006));
    docs.add(makeResume("Frank", "United States"));
    w.addDocuments(docs);

    IndexReader r = w.getReader();
    w.close();
    IndexSearcher s = new IndexSearcher(r);

    // Create a filter that defines "parent" documents in the index - in this case resumes
    Filter parentsFilter = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("docType", "resume"))));

    // Define child document criteria (finds an example of relevant work experience)
    BooleanQuery childQuery = new BooleanQuery();
    childQuery.add(new BooleanClause(new TermQuery(new Term("skill", "java")), Occur.MUST));
    childQuery.add(new BooleanClause(NumericRangeQuery.newIntRange("year", 2006, 2011, true, true), Occur.MUST));

    // Define parent document criteria (find a resident in the UK)
    Query parentQuery = new TermQuery(new Term("country", "United Kingdom"));

    // Wrap the child document query to 'join' any matches
    // up to corresponding parent:
    BlockJoinQuery childJoinQuery = new BlockJoinQuery(childQuery, parentsFilter, BlockJoinQuery.ScoreMode.Avg);

    // Combine the parent and nested child queries into a single query for a candidate
    BooleanQuery fullQuery = new BooleanQuery();
    fullQuery.add(new BooleanClause(parentQuery, Occur.MUST));
    fullQuery.add(new BooleanClause(childJoinQuery, Occur.MUST));

    BlockJoinCollector c = new BlockJoinCollector(Sort.RELEVANCE, 1, true, false);

    s.search(fullQuery, c);

    TopGroups<Integer> results = c.getTopGroups(childJoinQuery, null, 0, 10, 0, true);

    //assertEquals(1, results.totalHitCount);
    assertEquals(1, results.totalGroupedHitCount);
    assertEquals(1, results.groups.length);

    final GroupDocs<Integer> group = results.groups[0];
    assertEquals(1, group.totalHits);

    // Only Lisa has a UK residence AND a java job in the 2006-2011 range:
    Document childDoc = s.doc(group.scoreDocs[0].doc);
    //System.out.println("  doc=" + group.scoreDocs[0].doc);
    assertEquals("java", childDoc.get("skill"));
    assertNotNull(group.groupValue);
    Document parentDoc = s.doc(group.groupValue);
    assertEquals("Lisa", parentDoc.get("name"));

    // Close the searcher before its reader (matches testBoostBug; was leaked before):
    s.close();
    r.close();
    dir.close();
  }

  /**
   * Verifies that wrapping a BlockJoinQuery inside a boosted BooleanQuery
   * does not throw (regression test for a boost-related bug).
   */
  public void testBoostBug() throws Exception {
    final Directory dir = newDirectory();
    final RandomIndexWriter w = new RandomIndexWriter(random, dir);
    IndexReader r = w.getReader();
    w.close();
    IndexSearcher s = newSearcher(r);

    BlockJoinQuery q = new BlockJoinQuery(new MatchAllDocsQuery(), new QueryWrapperFilter(new MatchAllDocsQuery()), BlockJoinQuery.ScoreMode.Avg);
    s.search(q, 10);
    BooleanQuery bq = new BooleanQuery();
    bq.setBoost(2f); // we boost the BQ
    bq.add(q, BooleanClause.Occur.MUST);
    s.search(bq, 10);

    s.close();
    r.close();
    dir.close();
  }

  /**
   * Makes 2-4 fields, each with a random set of values; field 0 always has
   * exactly 2 values so MUST/MUST_NOT clauses on it match a sizable subset.
   */
  private String[][] getRandomFields(int maxUniqueValues) {

    final String[][] fields = new String[_TestUtil.nextInt(random, 2, 4)][];
    for(int fieldID=0;fieldID<fields.length;fieldID++) {
      final int valueCount;
      if (fieldID == 0) {
        valueCount = 2;
      } else {
        valueCount = _TestUtil.nextInt(random, 1, maxUniqueValues);
      }

      final String[] values = fields[fieldID] = new String[valueCount];
      for(int i=0;i<valueCount;i++) {
        values[i] = _TestUtil.randomRealisticUnicodeString(random);
        //values[i] = _TestUtil.randomSimpleString(random);
      }
    }

    return fields;
  }

  // Random term against the parent marker field (parent0):
  private Term randomParentTerm(String[] values) {
    return new Term("parent0", values[random.nextInt(values.length)]);
  }

  // Random term against the child marker field (child0):
  private Term randomChildTerm(String[] values) {
    return new Term("child0", values[random.nextInt(values.length)]);
  }

  /**
   * Builds a random 1-2 field string sort over fields named prefix+N,
   * always tie-broken by the unique prefix+"ID" int field so ordering
   * is fully deterministic.
   */
  private Sort getRandomSort(String prefix, int numFields) {
    final List<SortField> sortFields = new ArrayList<SortField>();
    // TODO: sometimes sort by score; problem is scores are
    // not comparable across the two indices
    // sortFields.add(SortField.FIELD_SCORE);
    if (random.nextBoolean()) {
      sortFields.add(new SortField(prefix + random.nextInt(numFields), SortField.STRING, random.nextBoolean()));
    } else if (random.nextBoolean()) {
      sortFields.add(new SortField(prefix + random.nextInt(numFields), SortField.STRING, random.nextBoolean()));
      sortFields.add(new SortField(prefix + random.nextInt(numFields), SortField.STRING, random.nextBoolean()));
    }
    // Break ties:
    sortFields.add(new SortField(prefix + "ID", SortField.INT));
    return new Sort(sortFields.toArray(new SortField[sortFields.size()]));
  }

  /**
   * Randomized cross-check: builds a block-join index and a denormalized
   * index (parent fields copied onto every child), runs equivalent queries
   * against both, and asserts the hits agree via compareHits.
   */
  public void testRandom() throws Exception {
    // We build two indices at once: one normalized (which
    // BlockJoinQuery/Collector can query) and the other w/
    // same docs just fully denormalized:
    final Directory dir = newDirectory();
    final Directory joinDir = newDirectory();

    final int numParentDocs = _TestUtil.nextInt(random, 100*RANDOM_MULTIPLIER, 300*RANDOM_MULTIPLIER);
    //final int numParentDocs = 30;

    // Values for parent fields:
    final String[][] parentFields = getRandomFields(numParentDocs/2);
    // Values for child fields:
    final String[][] childFields = getRandomFields(numParentDocs);

    // TODO: test star join, nested join cases too!
    final RandomIndexWriter w = new RandomIndexWriter(random, dir);
    final RandomIndexWriter joinW = new RandomIndexWriter(random, joinDir);
    for(int parentDocID=0;parentDocID<numParentDocs;parentDocID++) {
      Document parentDoc = new Document();
      Document parentJoinDoc = new Document();
      Field id = newField("parentID", ""+parentDocID, Field.Store.YES, Field.Index.NOT_ANALYZED);
      parentDoc.add(id);
      parentJoinDoc.add(id);
      // isParent marks parents for the join-index parents Filter:
      parentJoinDoc.add(newField("isParent", "x", Field.Index.NOT_ANALYZED));
      for(int field=0;field<parentFields.length;field++) {
        if (random.nextDouble() < 0.9) {
          Field f = newField("parent" + field,
                             parentFields[field][random.nextInt(parentFields[field].length)],
                             Field.Index.NOT_ANALYZED);
          parentDoc.add(f);
          parentJoinDoc.add(f);
        }
      }

      final List<Document> joinDocs = new ArrayList<Document>();

      if (VERBOSE) {
        System.out.println("  " + parentDoc);
      }

      final int numChildDocs = _TestUtil.nextInt(random, 1, 20);
      for(int childDocID=0;childDocID<numChildDocs;childDocID++) {
        // Denormalize: copy all parent fields into child doc:
        Document childDoc = _TestUtil.cloneDocument(parentDoc);
        Document joinChildDoc = new Document();
        joinDocs.add(joinChildDoc);

        Field childID = newField("childID", ""+childDocID, Field.Store.YES, Field.Index.NOT_ANALYZED);
        childDoc.add(childID);
        joinChildDoc.add(childID);

        for(int childFieldID=0;childFieldID<childFields.length;childFieldID++) {
          if (random.nextDouble() < 0.9) {
            Field f = newField("child" + childFieldID,
                               childFields[childFieldID][random.nextInt(childFields[childFieldID].length)],
                               Field.Index.NOT_ANALYZED);
            childDoc.add(f);
            joinChildDoc.add(f);
          }
        }

        if (VERBOSE) {
          System.out.println("    " + joinChildDoc);
        }

        w.addDocument(childDoc);
      }

      // Parent last: the block-join contract requires the parent doc at the
      // end of the block added via addDocuments:
      joinDocs.add(parentJoinDoc);
      joinW.addDocuments(joinDocs);
    }

    final IndexReader r = w.getReader();
    w.close();
    final IndexReader joinR = joinW.getReader();
    joinW.close();

    if (VERBOSE) {
      System.out.println("TEST: reader=" + r);
      System.out.println("TEST: joinReader=" + joinR);

      for(int docIDX=0;docIDX<joinR.maxDoc();docIDX++) {
        System.out.println("  docID=" + docIDX + " doc=" + joinR.document(docIDX));
      }
    }

    final IndexSearcher s = new IndexSearcher(r);
    // Needed so FieldDoc sort values are populated for the VERBOSE dump:
    s.setDefaultFieldSortScoring(true, true);

    final IndexSearcher joinS = new IndexSearcher(joinR);

    final Filter parentsFilter = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("isParent", "x"))));

    final int iters = 200*RANDOM_MULTIPLIER;

    for(int iter=0;iter<iters;iter++) {
      if (VERBOSE) {
        System.out.println("TEST: iter=" + (1+iter) + " of " + iters);
      }

      // NOTE(review): nextInt(3) is drawn again in the else-if, so the three
      // query shapes are NOT uniformly likely (1/3, 2/9, 4/9) — presumably
      // acceptable for a randomized test, but confirm if uniformity matters.
      final Query childQuery;
      if (random.nextInt(3) == 2) {
        final int childFieldID = random.nextInt(childFields.length);
        childQuery = new TermQuery(new Term("child" + childFieldID,
                                            childFields[childFieldID][random.nextInt(childFields[childFieldID].length)]));
      } else if (random.nextInt(3) == 2) {
        BooleanQuery bq = new BooleanQuery();
        childQuery = bq;
        final int numClauses = _TestUtil.nextInt(random, 2, 4);
        boolean didMust = false;
        for(int clauseIDX=0;clauseIDX<numClauses;clauseIDX++) {
          Query clause;
          BooleanClause.Occur occur;
          if (!didMust && random.nextBoolean()) {
            occur = random.nextBoolean() ? BooleanClause.Occur.MUST : BooleanClause.Occur.MUST_NOT;
            clause = new TermQuery(randomChildTerm(childFields[0]));
            didMust = true;
          } else {
            occur = BooleanClause.Occur.SHOULD;
            final int childFieldID = _TestUtil.nextInt(random, 1, childFields.length-1);
            clause = new TermQuery(new Term("child" + childFieldID,
                                            childFields[childFieldID][random.nextInt(childFields[childFieldID].length)]));
          }
          bq.add(clause, occur);
        }
      } else {
        BooleanQuery bq = new BooleanQuery();
        childQuery = bq;

        bq.add(new TermQuery(randomChildTerm(childFields[0])),
               BooleanClause.Occur.MUST);
        final int childFieldID = _TestUtil.nextInt(random, 1, childFields.length-1);
        bq.add(new TermQuery(new Term("child" + childFieldID, childFields[childFieldID][random.nextInt(childFields[childFieldID].length)])),
               random.nextBoolean() ? BooleanClause.Occur.MUST : BooleanClause.Occur.MUST_NOT);
      }

      final BlockJoinQuery childJoinQuery = new BlockJoinQuery(childQuery, parentsFilter, BlockJoinQuery.ScoreMode.Avg);

      // To run against the block-join index:
      final Query parentJoinQuery;

      // Same query as parentJoinQuery, but to run against
      // the fully denormalized index (so we can compare)
      // results:
      final Query parentQuery;

      if (random.nextBoolean()) {
        parentQuery = childQuery;
        parentJoinQuery = childJoinQuery;
      } else {
        // AND parent field w/ child field
        final BooleanQuery bq = new BooleanQuery();
        parentJoinQuery = bq;
        final Term parentTerm = randomParentTerm(parentFields[0]);
        if (random.nextBoolean()) {
          bq.add(childJoinQuery, BooleanClause.Occur.MUST);
          bq.add(new TermQuery(parentTerm),
                 BooleanClause.Occur.MUST);
        } else {
          bq.add(new TermQuery(parentTerm),
                 BooleanClause.Occur.MUST);
          bq.add(childJoinQuery, BooleanClause.Occur.MUST);
        }

        final BooleanQuery bq2 = new BooleanQuery();
        parentQuery = bq2;
        if (random.nextBoolean()) {
          bq2.add(childQuery, BooleanClause.Occur.MUST);
          bq2.add(new TermQuery(parentTerm),
                  BooleanClause.Occur.MUST);
        } else {
          bq2.add(new TermQuery(parentTerm),
                  BooleanClause.Occur.MUST);
          bq2.add(childQuery, BooleanClause.Occur.MUST);
        }
      }

      final Sort parentSort = getRandomSort("parent", parentFields.length);
      final Sort childSort = getRandomSort("child", childFields.length);

      if (VERBOSE) {
        System.out.println("\nTEST: query=" + parentQuery + " joinQuery=" + parentJoinQuery + " parentSort=" + parentSort + " childSort=" + childSort);
      }

      // Merge both sorts:
      final List<SortField> sortFields = new ArrayList<SortField>(Arrays.asList(parentSort.getSort()));
      sortFields.addAll(Arrays.asList(childSort.getSort()));
      final Sort parentAndChildSort = new Sort(sortFields.toArray(new SortField[sortFields.size()]));

      final TopDocs results = s.search(parentQuery, null, r.numDocs(),
                                       parentAndChildSort);

      if (VERBOSE) {
        System.out.println("\nTEST: normal index gets " + results.totalHits + " hits");
        final ScoreDoc[] hits = results.scoreDocs;
        for(int hitIDX=0;hitIDX<hits.length;hitIDX++) {
          final Document doc = s.doc(hits[hitIDX].doc);
          //System.out.println("  score=" + hits[hitIDX].score + " parentID=" + doc.get("parentID") + " childID=" + doc.get("childID") + " (docID=" + hits[hitIDX].doc + ")");
          System.out.println("  parentID=" + doc.get("parentID") + " childID=" + doc.get("childID") + " (docID=" + hits[hitIDX].doc + ")");
          FieldDoc fd = (FieldDoc) hits[hitIDX];
          if (fd.fields != null) {
            System.out.print("    ");
            for(Object o : fd.fields) {
              if (o instanceof BytesRef) {
                System.out.print(((BytesRef) o).utf8ToString() + " ");
              } else {
                System.out.print(o + " ");
              }
            }
            System.out.println();
          }
        }
      }

      final BlockJoinCollector c = new BlockJoinCollector(parentSort, 10, true, true);

      joinS.search(parentJoinQuery, c);

      final int hitsPerGroup = _TestUtil.nextInt(random, 1, 20);
      //final int hitsPerGroup = 100;
      final TopGroups<Integer> joinResults = c.getTopGroups(childJoinQuery, childSort, 0, hitsPerGroup, 0, true);

      if (VERBOSE) {
        System.out.println("\nTEST: block join index gets " + (joinResults == null ? 0 : joinResults.groups.length) + " groups; hitsPerGroup=" + hitsPerGroup);
        if (joinResults != null) {
          final GroupDocs<Integer>[] groups = joinResults.groups;
          for(int groupIDX=0;groupIDX<groups.length;groupIDX++) {
            final GroupDocs<Integer> group = groups[groupIDX];
            if (group.groupSortValues != null) {
              System.out.print("  ");
              for(Object o : group.groupSortValues) {
                if (o instanceof BytesRef) {
                  System.out.print(((BytesRef) o).utf8ToString() + " ");
                } else {
                  System.out.print(o + " ");
                }
              }
              System.out.println();
            }

            assertNotNull(group.groupValue);
            final Document parentDoc = joinS.doc(group.groupValue);
            System.out.println("  group parentID=" + parentDoc.get("parentID") + " (docID=" + group.groupValue + ")");
            for(int hitIDX=0;hitIDX<group.scoreDocs.length;hitIDX++) {
              final Document doc = joinS.doc(group.scoreDocs[hitIDX].doc);
              //System.out.println("    score=" + group.scoreDocs[hitIDX].score + " childID=" + doc.get("childID") + " (docID=" + group.scoreDocs[hitIDX].doc + ")");
              System.out.println("    childID=" + doc.get("childID") + " child0=" + doc.get("child0") + " (docID=" + group.scoreDocs[hitIDX].doc + ")");
            }
          }
        }
      }

      if (results.totalHits == 0) {
        assertNull(joinResults);
      } else {
        compareHits(r, joinR, results, joinResults);
      }
    }

    // Close searchers before their readers (was leaked before; matches testBoostBug):
    s.close();
    joinS.close();
    r.close();
    joinR.close();
    dir.close();
    joinDir.close();
  }

  /**
   * Walks the join results group by group and verifies the denormalized
   * index produced the same child hits in the same order, with matching
   * parentIDs; skips ahead in the flat hit list when a group was truncated
   * by hitsPerGroup.
   */
  private void compareHits(IndexReader r, IndexReader joinR, TopDocs results, TopGroups<Integer> joinResults) throws Exception {
    // results is 'complete'; joinResults is a subset
    int resultUpto = 0;
    int joinGroupUpto = 0;
    final ScoreDoc[] hits = results.scoreDocs;
    final GroupDocs<Integer>[] groupDocs = joinResults.groups;
    while(joinGroupUpto < groupDocs.length) {
      final GroupDocs<Integer> group = groupDocs[joinGroupUpto++];
      final ScoreDoc[] groupHits = group.scoreDocs;
      assertNotNull(group.groupValue);
      final Document parentDoc = joinR.document(group.groupValue);
      final String parentID = parentDoc.get("parentID");
      //System.out.println("GROUP groupDoc=" + group.groupDoc + " parent=" + parentDoc);
      assertNotNull(parentID);
      assertTrue(groupHits.length > 0);
      for(int hitIDX=0;hitIDX<groupHits.length;hitIDX++) {
        final Document nonJoinHit = r.document(hits[resultUpto++].doc);
        final Document joinHit = joinR.document(groupHits[hitIDX].doc);
        assertEquals(parentID,
                     nonJoinHit.get("parentID"));
        assertEquals(joinHit.get("childID"),
                     nonJoinHit.get("childID"));
      }

      if (joinGroupUpto < groupDocs.length) {
        // Advance non-join hit to the next parentID:
        //System.out.println("  next joingroupUpto=" + joinGroupUpto + " gd.length=" + groupDocs.length + " parentID=" + parentID);
        while(true) {
          assertTrue(resultUpto < hits.length);
          if (!parentID.equals(r.document(hits[resultUpto].doc).get("parentID"))) {
            break;
          }
          resultUpto++;
        }
      }
    }
  }

  /**
   * One parent block containing two different child types (jobs and
   * qualifications); verifies getTopGroups can be called once per child
   * join query on the same collector.
   */
  public void testMultiChildTypes() throws Exception {

    final Directory dir = newDirectory();
    final RandomIndexWriter w = new RandomIndexWriter(random, dir);

    final List<Document> docs = new ArrayList<Document>();

    docs.add(makeJob("java", 2007));
    docs.add(makeJob("python", 2010));
    docs.add(makeQualification("maths", 1999));
    docs.add(makeResume("Lisa", "United Kingdom"));
    w.addDocuments(docs);

    IndexReader r = w.getReader();
    w.close();
    IndexSearcher s = new IndexSearcher(r);

    // Create a filter that defines "parent" documents in the index - in this case resumes
    Filter parentsFilter = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("docType", "resume"))));

    // Define child document criteria (finds an example of relevant work experience)
    BooleanQuery childJobQuery = new BooleanQuery();
    childJobQuery.add(new BooleanClause(new TermQuery(new Term("skill", "java")), Occur.MUST));
    childJobQuery.add(new BooleanClause(NumericRangeQuery.newIntRange("year", 2006, 2011, true, true), Occur.MUST));

    BooleanQuery childQualificationQuery = new BooleanQuery();
    childQualificationQuery.add(new BooleanClause(new TermQuery(new Term("qualification", "maths")), Occur.MUST));
    childQualificationQuery.add(new BooleanClause(NumericRangeQuery.newIntRange("year", 1980, 2000, true, true), Occur.MUST));

    // Define parent document criteria (find a resident in the UK)
    Query parentQuery = new TermQuery(new Term("country", "United Kingdom"));

    // Wrap the child document query to 'join' any matches
    // up to corresponding parent:
    BlockJoinQuery childJobJoinQuery = new BlockJoinQuery(childJobQuery, parentsFilter, BlockJoinQuery.ScoreMode.Avg);
    BlockJoinQuery childQualificationJoinQuery = new BlockJoinQuery(childQualificationQuery, parentsFilter, BlockJoinQuery.ScoreMode.Avg);

    // Combine the parent and nested child queries into a single query for a candidate
    BooleanQuery fullQuery = new BooleanQuery();
    fullQuery.add(new BooleanClause(parentQuery, Occur.MUST));
    fullQuery.add(new BooleanClause(childJobJoinQuery, Occur.MUST));
    fullQuery.add(new BooleanClause(childQualificationJoinQuery, Occur.MUST));

    //????? How do I control volume of jobs vs qualifications per parent?
    BlockJoinCollector c = new BlockJoinCollector(Sort.RELEVANCE, 10, true, false);

    s.search(fullQuery, c);

    //Examine "Job" children
    boolean showNullPointerIssue=true;
    if (showNullPointerIssue) {
      TopGroups<Integer> jobResults = c.getTopGroups(childJobJoinQuery, null, 0, 10, 0, true);

      //assertEquals(1, results.totalHitCount);
      assertEquals(1, jobResults.totalGroupedHitCount);
      assertEquals(1, jobResults.groups.length);

      final GroupDocs<Integer> group = jobResults.groups[0];
      assertEquals(1, group.totalHits);

      Document childJobDoc = s.doc(group.scoreDocs[0].doc);
      //System.out.println("  doc=" + group.scoreDocs[0].doc);
      assertEquals("java", childJobDoc.get("skill"));
      assertNotNull(group.groupValue);
      Document parentDoc = s.doc(group.groupValue);
      assertEquals("Lisa", parentDoc.get("name"));
    }

    //Now Examine qualification children
    TopGroups<Integer> qualificationResults = c.getTopGroups(childQualificationJoinQuery, null, 0, 10, 0, true);
    //!!!!! This next line can null pointer - but only if prior "jobs" section called first
    assertEquals(1, qualificationResults.totalGroupedHitCount);
    assertEquals(1, qualificationResults.groups.length);

    final GroupDocs<Integer> qGroup = qualificationResults.groups[0];
    assertEquals(1, qGroup.totalHits);

    Document childQualificationDoc = s.doc(qGroup.scoreDocs[0].doc);
    assertEquals("maths", childQualificationDoc.get("qualification"));
    assertNotNull(qGroup.groupValue);
    Document parentDoc = s.doc(qGroup.groupValue);
    assertEquals("Lisa", parentDoc.get("name"));

    // Close the searcher before its reader (matches testBoostBug; was leaked before):
    s.close();
    r.close();
    dir.close();
  }
}