package org.cdlib.xtf.textEngine.facet;
/*
* Copyright (c) 2004, Regents of the University of California
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the University of California nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
import java.util.Arrays;
import org.apache.lucene.util.PriorityQueue;
import org.cdlib.xtf.textEngine.DocHit;
import org.cdlib.xtf.textEngine.DocHitImpl;
/**
* Maintains an ongoing count of groups and how many document hits were
* found in each group.
*
* @author Martin Haye
*/
public class GroupCounts
{
// ---- Sort kinds understood by sortGroups(), sortChildren() and compare() ----

private static final int SORT_BY_VALUE = 0;
private static final int SORT_BY_REVERSE_VALUE = 1;
private static final int SORT_BY_TOTAL_DOCS = 2;
private static final int SORT_BY_MAX_DOC_SCORE = 3;

// ---- Inputs recorded by the constructor ----

/** Group hierarchy being counted (parent/child/sibling links, names, doc links). */
private GroupData data;

/** Facet specification; supplies the group selector, the sort option and the
 *  include-empty-groups flag read by this class. */
private FacetSpec spec;

/** Factory used by addDoc() to create a hit queue for a group on first use. */
private HitQueueMaker hitQueueMaker;

// ---- Per-group state; every array is indexed by group number ----

/** Documents counted per group. Allocated only when the data is not dynamic. */
private int[] count;

/** Highest document score seen per group. Allocated only when the data is not dynamic. */
private float[] score;

/** Value of curMark at the time addDoc() last visited each group. */
private int[] mark;

/** Selection state written by selectGroup(): 0 = none, 1 = selected,
 *  2 = secondary selection placed on an ancestor. */
private int[] selection;

/** First document requested for each group, as passed to gatherDocs(). */
private int[] startDoc;

/** Maximum number of documents requested for each group; 0 means no hits are recorded. */
private int[] maxDocs;

/** Hit queue per group; entries stay null until addDoc() needs one. */
private PriorityQueue[] hitQueue;

/** First child of each group in sorted order; null until sortGroups() re-sorts. */
private int[] sortedChild;

/** Next sibling of each group in sorted order; null until sortGroups() re-sorts. */
private int[] sortedSibling;

// ---- Scalar state ----

/** True only while conservativePrep() is running. */
private boolean prepMode;

/** Stamp incremented once per addDoc() call so a document is counted at most once per group. */
private int curMark = 1000;
/** Construct an object with all counts at zero */
public GroupCounts(GroupData groupData, FacetSpec spec,
HitQueueMaker hitQueueMaker)
{
// Record the input parameters for later use
this.data = groupData;
this.spec = spec;
this.hitQueueMaker = hitQueueMaker;
// Allocate our arrays of counts and such
if (!data.isDynamic()) {
count = new int[data.nGroups()];
score = new float[data.nGroups()];
}
mark = new int[data.nGroups()];
selection = new int[data.nGroups()];
startDoc = new int[data.nGroups()];
maxDocs = new int[data.nGroups()];
hitQueue = new PriorityQueue[data.nGroups()];
// For dynamic data, we can perform the final sort and selection
// right now, since the group counts and scores are known.
//
if (data.isDynamic())
sortAndSelect();
// For static data, make a conservative selection.
else
conservativePrep();
} // constructor
/**
 * Runs the facet's group selector once in conservative mode, starting at
 * the root group, so that it can call back into this object (for example
 * gatherDocs()) before any documents have been counted. While this runs,
 * prepMode is true; it is always reset to false afterwards.
 */
private void conservativePrep()
{
  final GroupSelector selector = spec.groupSelector;

  // selectGroup() and shouldInclude() consult this flag.
  prepMode = true;
  try {
    synchronized (selector) {
      // Direct the selector's callbacks to this object.
      selector.setCounts(this);

      // 'true' asks the selector to be conservative in what it selects.
      selector.reset(true);

      // Walk everything, beginning with the root group.
      selector.process(0);
      selector.flush();
    }
  }
  finally {
    // Leave prep mode no matter how the selector exits.
    prepMode = false;
  }
}
/**
 * Called by GroupSelector to select a given group.
 *
 * The group itself is marked as selected (state 1). Each ancestor, up to
 * the root, that has no selection yet receives a secondary selection
 * (state 2). The walk stops at the first ancestor that already carries any
 * selection, leaving its state (and that of the ancestors above it)
 * untouched. Calls made during prep mode are ignored.
 *
 * @param group number of the group to select; a negative value is a no-op
 */
public final void selectGroup(int group)
{
  // Selections are not recorded during the conservative prep pass.
  if (prepMode)
    return;

  // Nothing to mark for a non-existent group.
  if (group < 0)
    return;

  // Primary selection on the group itself.
  selection[group] = 1;

  // Secondary selection on ancestors. Only the starting group may receive
  // state 1; an ancestor must never be promoted to "selected" here, and an
  // ancestor that is already marked ends the walk.
  for (int ancestor = data.parent(group); ancestor >= 0;
       ancestor = data.parent(ancestor))
  {
    if (selection[ancestor] != 0)
      break;
    selection[ancestor] = 2;
  }
}
/**
 * Called by GroupSelector to request document hits for a group. The values
 * are stored per group; addDoc() and buildDocHits() read them later.
 *
 * @param group    group number
 * @param startDoc index of the first hit wanted for the group
 * @param maxDocs  maximum number of hits wanted; addDoc() records no hits
 *                 for a group while this is zero
 */
public final void gatherDocs(int group, int startDoc, int maxDocs) {
  // The parameters shadow the fields of the same name, hence 'this.'.
  this.maxDocs[group] = maxDocs;
  this.startDoc[group] = startDoc;
}
/**
 * Called by GroupSelector to find out whether the requested group ordering
 * is anything other than "value".
 *
 * @return false when the facet spec sorts groups by "value", true otherwise
 */
public final boolean nondefaultSort() {
  final boolean sortedByValue = spec.sortGroupsBy.equals("value");
  return !sortedByValue;
}
/**
 * Called by GroupSelector to find out whether a group should be included.
 *
 * Every group is included when the spec asks for empty groups. Otherwise a
 * group is left out when it has no document hits; for static data that
 * test is skipped during prep mode.
 *
 * @param group group number
 * @return true if the group should be included
 */
public final boolean shouldInclude(int group)
{
  // Empty groups explicitly requested: nothing is filtered.
  if (spec.includeEmptyGroups)
    return true;

  // Dynamic data carries its own hit counts.
  if (data.isDynamic())
    return data.nDocHits(group) != 0;

  // Static data: in prep mode everything is included; afterwards only
  // groups with a non-zero count.
  return prepMode || count[group] != 0;
}
/**
 * Reports how many groups the underlying group data holds.
 *
 * @return the value of nGroups() on the group data
 */
public final int nGroups() {
  final int total = data.nGroups();
  return total;
}
/**
 * Gets the first child of a group. Once a sorted child table exists it is
 * used; until then the group data's own first-child link is returned.
 *
 * @param group group number
 * @return the first child as stored in the table or reported by the group
 *         data (traversal loops in this class stop on a negative value)
 */
public final int child(int group) {
  return (sortedChild == null) ? data.child(group) : sortedChild[group];
}
/**
 * Gets the next sibling of a group. Once a sorted sibling table exists it
 * is used; until then the group data's own sibling link is returned.
 *
 * @param group group number
 * @return the next sibling as stored in the table or reported by the group
 *         data (traversal loops in this class stop on a negative value)
 */
public final int sibling(int group) {
  return (sortedSibling == null) ? data.sibling(group) : sortedSibling[group];
}
/**
 * Gets the parent of a group, as reported by the group data.
 *
 * @param group group number
 * @return the parent group number (loops in this class stop walking
 *         upward once this becomes negative)
 */
public final int parent(int group) {
  final int up = data.parent(group);
  return up;
}
/**
 * Gets the name of a group, as reported by the group data.
 *
 * @param group group number
 * @return the group's name
 */
public final String name(int group) {
  final String label = data.name(group);
  return label;
}
/**
 * Tells whether a group is selected. Only selection state 1 counts; any
 * other state (including 2) yields false.
 *
 * @param group group number
 * @return true if the group's selection state is exactly 1
 */
public final boolean isSelected(int group) {
  final int state = selection[group];
  return state == 1;
}
/**
 * Gets the number of document hits for a group. Dynamic data answers for
 * itself; for static data the tally kept by addDoc() is returned.
 *
 * @param group group number
 * @return number of document hits in the group
 */
public final int nDocHits(int group) {
  return data.isDynamic() ? data.nDocHits(group) : count[group];
}
/**
 * Gets the score of a group. Dynamic data answers for itself; for static
 * data this is the highest document score recorded by addDoc().
 *
 * @param group group number
 * @return the group's score
 */
public final float score(int group) {
  return data.isDynamic() ? data.score(group) : score[group];
}
/**
 * Adds one document hit to the counts.
 *
 * For every group the document is linked to, and every ancestor of that
 * group, the document is counted once (static data only), the group's
 * maximum score is updated (static data only), and, if hits were
 * requested for the group through gatherDocs(), the hit is offered to the
 * group's queue.
 *
 * @param docHitMaker supplies the document number and score, and inserts
 *                    the hit into a queue on request
 */
public void addDoc(DocHitMaker docHitMaker)
{
  // A fresh stamp per document lets us detect groups already visited.
  curMark++;

  final int docNum = docHitMaker.getDocNum();
  final float docScore = docHitMaker.getScore();

  int link = data.firstLink(docNum);
  while (link >= 0)
  {
    // Climb from the linked group to the root. Once a group already bears
    // this document's stamp, it and everything above it have been handled.
    int group = data.linkGroup(link);
    while (group >= 0 && mark[group] != curMark)
    {
      if (!data.isDynamic()) {
        count[group]++;
        score[group] = Math.max(score[group], docScore);
      }
      mark[group] = curMark;

      // Record the hit only where documents were requested; either way
      // the climb continues so ancestors are still processed.
      if (maxDocs[group] != 0) {
        // Queues are created lazily, sized to hold startDoc + maxDocs hits.
        if (hitQueue[group] == null) {
          hitQueue[group] = hitQueueMaker.makeQueue(
            startDoc[group] + maxDocs[group]);
        }
        docHitMaker.insertInto(hitQueue[group]);
      }

      group = data.parent(group);
    }

    link = data.nextLink(link);
  }
}
/**
 * Builds the result facet, including its tree of result groups.
 *
 * @return a new ResultFacet whose field comes from the group data and
 *         whose root group is built from group 0
 */
public ResultFacet getResult()
{
  final ResultFacet facet = new ResultFacet();
  facet.field = data.field();

  // Dynamic facets were sorted and selected in the constructor. Static
  // facets only have their final counts now, so their final sort and
  // selection happens here.
  final boolean staticData = !data.isDynamic();
  if (staticData) {
    sortAndSelect();
  }

  // Build the whole tree starting from the root group.
  facet.rootGroup = buildResultGroup(0);
  return facet;
}
/**
 * Sorts the groups as the facet spec requests and then runs the group
 * selector in its final (non-conservative) mode. Called from the
 * constructor for dynamic data and from getResult() for static data.
 */
private void sortAndSelect()
{
  // Forget any earlier document requests; the selector will issue new
  // ones through gatherDocs() during the pass below.
  Arrays.fill(maxDocs, 0);
  Arrays.fill(startDoc, 0);

  // Put the hierarchy into the requested order (may be a no-op).
  sortGroups();

  // Final selection pass, starting at the root group.
  final GroupSelector selector = spec.groupSelector;
  synchronized (selector) {
    selector.setCounts(this);
    selector.reset(false); // false = not conservative
    selector.process(0);
    selector.flush();
  }
}
/**
 * Recursively builds the result group for the given group.
 *
 * The result records the group's name (except for group 0), its total
 * document count, the number of included child groups, a sub-result for
 * every included child that carries any selection, and, if documents were
 * requested and collected for the group, its document hits.
 *
 * @param parent number of the group to build a result for
 * @return the populated result group
 */
public ResultGroup buildResultGroup(int parent)
{
  final ResultGroup group = new ResultGroup();

  // Group 0 (the root) gets no value.
  if (parent != 0)
    group.value = data.name(parent);
  group.totalDocs = nDocHits(parent);

  // Pass 1: count included children, and how many of those are selected.
  int selectedKids = 0;
  int kid = child(parent);
  while (kid >= 0) {
    if (shouldInclude(kid)) {
      ++group.totalSubGroups;
      if (selection[kid] != 0)
        ++selectedKids;
    }
    kid = sibling(kid);
  }

  // The sub-group array is only allocated when there is something to hold.
  if (selectedKids > 0)
    group.subGroups = new ResultGroup[selectedKids];

  // Pass 2: build the selected children. 'position' advances for every
  // included child, selected or not, and becomes the child's rank.
  int filled = 0;
  int position = 0;
  kid = child(parent);
  while (kid >= 0) {
    if (shouldInclude(kid)) {
      if (selection[kid] != 0) {
        group.subGroups[filled] = buildResultGroup(kid);
        group.subGroups[filled].rank = position;
        filled++;
      }
      ++position;
    }
    kid = sibling(kid);
  }
  assert filled == selectedKids : "miscount";

  // Attach document hits when they were requested and any were collected.
  if (maxDocs[parent] != 0 && hitQueue[parent] != null)
    buildDocHits(parent, group);

  return group;
}
/**
 * Re-sorts the hierarchy as requested by the facet spec and stores the
 * resulting child/sibling links in sortedChild and sortedSibling. Nothing
 * is stored when the data is static and the order requested is "value".
 *
 * @throws RuntimeException if the spec's sortGroupsBy option is not one of
 *         "value", "reverseValue", "totalDocs" or "maxDocScore"
 */
private void sortGroups()
{
  // Translate the textual option into one of the SORT_BY_* constants.
  final String requested = spec.sortGroupsBy;
  final int sortKind;
  if (requested.equals("value")) {
    sortKind = SORT_BY_VALUE;
  }
  else if (requested.equals("reverseValue")) {
    sortKind = SORT_BY_REVERSE_VALUE;
  }
  else if (requested.equals("totalDocs")) {
    sortKind = SORT_BY_TOTAL_DOCS;
  }
  else if (requested.equals("maxDocScore")) {
    sortKind = SORT_BY_MAX_DOC_SCORE;
  }
  else {
    throw new RuntimeException(
      "Unknown option for sortGroupsBy: " + requested);
  }

  // Static data is already in name order, so a "value" sort needs no work.
  final boolean alreadyOrdered = !data.isDynamic() && sortKind == SORT_BY_VALUE;
  if (alreadyOrdered)
    return;

  // Size of the tree before sorting; taken while child()/sibling() still
  // read the unsorted links.
  final int nBefore = countDescendants(0);

  // Fresh link tables, every entry starting out as "none".
  final int nGroups = data.nGroups();
  sortedChild = new int[nGroups];
  sortedSibling = new int[nGroups];
  Arrays.fill(sortedChild, -1);
  Arrays.fill(sortedSibling, -1);

  // Recursive merge sort from the root down.
  sortChildren(0, sortKind);

  // The sorted tree must contain exactly as many groups as before.
  final int nAfter = countDescendants(0);
  assert nAfter == nBefore : "mis-count on sort";
}
/**
 * Counts a group together with all of its descendants, following the
 * links returned by child() and sibling().
 *
 * @param group root of the subtree to count
 * @return 1 for the group itself plus the size of every child subtree
 */
private int countDescendants(int group)
{
  int total = 1; // the group itself
  int kid = child(group);
  while (kid >= 0) {
    total += countDescendants(kid);
    kid = sibling(kid);
  }
  return total;
}
/**
 * Constructs the array of document hits for a group and stores it, along
 * with the paging information, in the given result group.
 *
 * The group's hit queue is drained completely. Hits are popped into an
 * array from the last slot backwards, so the array ends up in the reverse
 * of pop order. The window [startDoc, startDoc + maxDocs) of that array,
 * clipped to the number of hits found, is copied to the result.
 *
 * @param group       group whose queue is drained; its queue must not be
 *                    null
 * @param resultGroup result to receive docHits, totalDocs, startDoc and
 *                    endDoc
 */
private void buildDocHits(int group, ResultGroup resultGroup)
{
  // Drain the queue, filling the array from the back.
  PriorityQueue queue = hitQueue[group];
  int nFound = queue.size();
  DocHitImpl[] hitArray = new DocHitImpl[nFound];
  for (int i = nFound - 1; i >= 0; i--)
    hitArray[i] = (DocHitImpl)queue.pop();

  int start = startDoc[group];
  int max = maxDocs[group];

  // Number of hits that fall inside the requested window (never negative).
  int nHits = Math.max(0, Math.min(nFound - start, max));
  resultGroup.docHits = new DocHit[nHits];

  resultGroup.totalDocs = nDocHits(group);
  resultGroup.startDoc = start;
  resultGroup.endDoc = start + nHits;

  // Copy exactly nHits entries. The queue may have been sized from an
  // earlier, larger startDoc/maxDocs request than the one in force now, so
  // nFound can exceed start + max; bounding the copy by nHits keeps it
  // within the destination array.
  for (int i = 0; i < nHits; i++)
    resultGroup.docHits[i] = hitArray[start + i];
}
public static interface HitQueueMaker {
PriorityQueue makeQueue(int size);
}
public static interface DocHitMaker
{
int getDocNum();
float getScore();
boolean insertInto(PriorityQueue queue);
}
/*
* The following code is adapted from a super-cool linked list mergesort
* algorithm by Simon Tatham. The code appears to be unrestricted, and
* was obtained from this URL on 7/25/2006:
*
* http://www.chiark.greenend.org.uk/~sgtatham/algorithms/listsort.html
*
* Following is the original rights statement.
*
* This file is copyright 2001 Simon Tatham.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL SIMON TATHAM BE LIABLE FOR
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
* CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
/**
 * Sorts the children of one group, then recursively the children of each
 * of those children, using a bottom-up merge sort over a linked list.
 *
 * The starting order is read from data.child() / data.sibling(). The
 * sorted order is written to sortedSibling[] (next-sibling links, -1 at
 * the end of the list) and sortedChild[parent] (head of the list). Both
 * arrays must already be allocated; sortGroups() does so before calling.
 *
 * On ties the element from the earlier run is taken first (the merge test
 * is "<= 0"), so children that compare equal keep their relative order.
 *
 * @param parent   group whose children are to be sorted
 * @param sortKind one of the SORT_BY_* constants, passed through to compare()
 */
private void sortChildren(int parent, int sortKind)
{
int first = data.child(parent);
int p;
int q;
// If no children, we have nothing to do.
if (first < 0)
return;
// Initialize the links: copy the unsorted sibling chain into sortedSibling,
// counting the children so the result can be verified at the end.
int nChildrenBefore = 0;
for (p = first; p >= 0; p = q) {
q = data.sibling(p);
sortedSibling[p] = q;
++nChildrenBefore;
}
// Merge lists of size 1, 2, 4, 8, 16, ...
int insize = 1;
while (true)
{
// Start a new pass: p walks the current list, while first/tail track
// the head and tail of the merged list being built.
p = first;
first = -1;
int tail = -1;
int nmerges = 0; /* count number of merges we do in this pass */
while (p >= 0)
{
nmerges++; /* there exists a merge to be done */
/* step `insize' places along from p */
q = p;
int psize = 0;
for (int i = 0; i < insize; i++) {
psize++;
q = sortedSibling[q];
if (q < 0)
break;
}
/* if q hasn't fallen off end, we have two lists to merge */
int qsize = insize;
/* now we have two lists; merge them */
while (psize > 0 || (qsize > 0 && q >= 0))
{
/* decide whether next element of merge comes from p or q */
int e;
if (psize == 0)
{
/* p is empty; e must come from q. */
e = q;
q = sortedSibling[q];
qsize--;
}
else if (qsize == 0 || q < 0)
{
/* q is empty; e must come from p. */
e = p;
p = sortedSibling[p];
psize--;
}
else if (compare(p, q, sortKind) <= 0)
{
/* First element of p is lower (or same);
* e must come from p. */
e = p;
p = sortedSibling[p];
psize--;
}
else
{
/* First element of q is lower; e must come from q. */
e = q;
q = sortedSibling[q];
qsize--;
}
/* add the next element to the merged list */
if (tail >= 0)
sortedSibling[tail] = e;
else
first = e;
tail = e;
}
/* now p has stepped `insize' places along, and q has too */
p = q;
}
// Terminate the merged list. tail is a valid group here because the list
// is known to be non-empty (the empty case returned early above).
sortedSibling[tail] = -1;
/* If we have done only one merge, we're finished. */
if (nmerges <= 1) /* allow for nmerges==0, the empty list case */
break;
/* Otherwise repeat, merging lists twice the size */
insize *= 2;
}
// Record the first child.
sortedChild[parent] = first;
// Sort all the descendants of the children, and verify the sort.
for (p = first; p >= 0; p = sortedSibling[p])
sortChildren(p, sortKind);
// Verification (effective only with assertions enabled): neighbours must
// be in order, and no child may have been lost or duplicated.
int nChildrenAfter = 0;
for (p = sortedChild[parent]; p >= 0; p = sortedSibling[p]) {
if (sortedSibling[p] >= 0)
assert compare(p, sortedSibling[p], sortKind) <= 0 : "error in merge sort";
++nChildrenAfter;
} // for
assert nChildrenAfter == nChildrenBefore;
} // sortChildren()
/*
* End of adapted code.
*/
/**
 * Compares two groups for sorting purposes.
 *
 * Each sort kind uses three keys in a fixed priority; a later key is
 * consulted only when all earlier ones tie. Names compare through
 * data.compare() (negated for the reverse-value sort); score and document
 * count comparisons are always negated, so larger values sort first.
 *
 * <pre>
 *   SORT_BY_VALUE          name,          score, doc count
 *   SORT_BY_REVERSE_VALUE  reversed name, score, doc count
 *   SORT_BY_TOTAL_DOCS     doc count,     name,  score
 *   SORT_BY_MAX_DOC_SCORE  score,         name,  doc count
 * </pre>
 *
 * @param g1       first group number
 * @param g2       second group number
 * @param sortKind one of the SORT_BY_* constants
 * @return negative, zero or positive as g1 sorts before, with, or after
 *         g2; zero for an unrecognized sort kind
 */
private int compare(int g1, int g2, int sortKind)
{
  int result;
  switch (sortKind)
  {
    case SORT_BY_VALUE: {
      result = data.compare(g1, g2);
      if (result == 0)
        result = -compare(score(g1), score(g2));
      if (result == 0)
        result = -compare(nDocHits(g1), nDocHits(g2));
      return result;
    }

    case SORT_BY_REVERSE_VALUE: {
      result = -data.compare(g1, g2);
      if (result == 0)
        result = -compare(score(g1), score(g2));
      if (result == 0)
        result = -compare(nDocHits(g1), nDocHits(g2));
      return result;
    }

    case SORT_BY_TOTAL_DOCS: {
      result = -compare(nDocHits(g1), nDocHits(g2));
      if (result == 0)
        result = data.compare(g1, g2);
      if (result == 0)
        result = -compare(score(g1), score(g2));
      return result;
    }

    case SORT_BY_MAX_DOC_SCORE: {
      result = -compare(score(g1), score(g2));
      if (result == 0)
        result = data.compare(g1, g2);
      if (result == 0)
        result = -compare(nDocHits(g1), nDocHits(g2));
      return result;
    }

    default:
      // Unrecognized sort kind: treat every pair as equal.
      return 0;
  }
}
/**
 * Three-way comparison of two ints.
 *
 * @return -1 if x is smaller than y, 1 if it is larger, 0 if they are equal
 */
private static int compare(int x, int y) {
  if (x < y)
    return -1;
  if (x > y)
    return 1;
  return 0;
}
/**
 * Three-way comparison of two floats using the primitive relational
 * operators.
 *
 * @return -1 if x is smaller than y, 1 if it is larger, 0 otherwise
 *         (which includes the case where either value is NaN, since both
 *         relational tests are then false)
 */
private static int compare(float x, float y) {
  if (x < y)
    return -1;
  if (x > y)
    return 1;
  return 0;
}
} // class GroupCounts