package org.cdlib.xtf.textEngine.facet;
/*
* Copyright (c) 2004, Regents of the University of California
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the University of California nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Vector;
import java.util.WeakHashMap;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.index.TermEnum;
/**
* This class contains the mapping, for a given field, from documents to
* one or more term values in that document.
*
* @author Martin Haye
*/
public class StaticGroupData extends GroupData
{
/** The particular field we have data from */
private String field;
/** Array of document IDs */
private int[] docs;
/**
* Array of links: 0..docs.length is either positive to indicate a single group
* for this doc, or negative to indicate a link later in the array to a list
* of groups. docs.length..links.length holds the extra groups; each entry is
* a group number, negative to mean end of the groups for a single doc.
*/
private int[] links;
/** Array of group names */
private String[] groups;
/** The parent of each group, or -1 for none */
private int[] groupParents;
/** The first child of each group, or -1 for none. */
private int[] groupChildren;
/** The next sibling of each group, or -1 for none. */
private int[] groupSiblings;
/** Cached data. If the reader goes away, our cache will too. */
private static WeakHashMap cache = new WeakHashMap();
/**
* Retrieves GroupData for a given field from a given reader. Maintains a cache
* so that if the same field is requested again for this reader, we don't have
* to re-read the group data.
*
* Synchronized so that if a bunch of threads come in wanting to load the same
* data, we won't waste time and memory loading it over and over.
*
* @param reader Where to read the data from
* @param field Which field to read
* @return Group data for the specified field
*/
public static synchronized StaticGroupData getCachedData(IndexReader reader, String field)
throws IOException
{
// See if we have a cache for this reader.
HashMap readerCache = (HashMap)cache.get(reader);
if (readerCache == null) {
readerCache = new HashMap();
cache.put(reader, readerCache);
}
// Now see if we've already read data for this field.
StaticGroupData data = (StaticGroupData)readerCache.get(field);
if (data == null)
{
// Don't have cached data, so read and remember it.
data = new StaticGroupData(reader, field);
readerCache.put(field, data);
}
return data;
} // getCachedData()
/**
* Read in the term data for a given field, and build up the various arrays
* of document to group info, and hierarchical relationships between the
* groups.
*
* @param reader Where to read the term data from
* @param field Which field to read
*/
public StaticGroupData(IndexReader reader, String field)
throws IOException
{
this.field = field;
TermPositions termPositions = reader.termPositions();
TermEnum termEnum = reader.terms(new Term(field, ""));
HashMap groupMap = new HashMap();
Vector groupVec = new Vector();
HashMap childMap = new HashMap();
HashMap docMap = new HashMap();
HashSet lcTerms = new HashSet();
int nLinks = 0;
// Add a default root group.
groupVec.add("".intern());
groupMap.put("".intern(), Integer.valueOf(0));
// Make an entry for each document and each term. Ensure that
// there is only one term in this field per document.
//
try
{
if (termEnum.term() == null)
throw new RuntimeException("no terms in field " + field);
do
{
Term term = termEnum.term();
if (!term.field().equals(field))
break;
// If we've seen this term before, skip it. This can happen if
// the real term was mixed case, and we encounter the lower-case
// version later.
//
String lcTerm = term.text().toLowerCase();
if (lcTerms.contains(lcTerm))
continue;
lcTerms.add(lcTerm);
// Add a group key for this term. Also, if it's hierarchical,
// find the ancestor groups and add them to the child map.
//
String termText = term.text();
if (termText.length() == 0)
termText = "<empty>";
Integer termKey = addTermKey(termText, groupVec, groupMap, childMap);
// Now process each document which contains this term.
termPositions.seek(termEnum);
while (termPositions.next())
{
// Get or create a vector for this document.
int docId = termPositions.doc();
Integer docKey = Integer.valueOf(docId);
Vector docGroups = (Vector)docMap.get(docKey);
if (docGroups == null) {
docGroups = new Vector(1);
docMap.put(docKey, docGroups);
}
// If we're going from one group to two, add extra link space
// for the initial link.
//
if (docGroups.size() == 1)
nLinks++;
docGroups.add(termKey);
nLinks++;
} // while( termPositions.next() )
} while (termEnum.next());
}
finally {
termPositions.close();
termEnum.close();
}
// Build the final array of groups. Basically we just take the last
// component of each path.
//
groups = (String[])groupVec.toArray(new String[groupVec.size()]);
for (int i = 0; i < groups.length; i++) {
int lastSep = groups[i].lastIndexOf("::");
if (lastSep >= 0)
groups[i] = groups[i].substring(lastSep + 2);
}
// Build the group parent/child/sibling tables.
buildHierarchy(childMap);
// Now we're ready to build our final arrays that condense all the
// document -> group information.
//
docs = new int[docMap.size()];
links = new int[nLinks];
buildLinks(docMap);
} // constructor
/**
* Add the given term to the group vector and map. If it's hierarchical,
* add relationships for the parent and all ancestors as well.
*
* @param termText Term to add
* @param groupVec Vector of groups in sort order
* @param groupMap Mapping of terms to group numbers
* @param childMap Mapping of parent key to child vector
* @return New key for the term
*/
private Integer addTermKey(String termText, Vector groupVec,
HashMap groupMap, HashMap childMap)
{
String curName = termText;
Integer childKey = null;
Integer termKey = null;
while (true)
{
// Find or make a key for the current name.
String parentName = curName.intern();
Integer parentKey = (Integer)groupMap.get(parentName);
if (parentKey == null) {
parentKey = Integer.valueOf(groupVec.size());
groupVec.add(parentName);
groupMap.put(parentName, parentKey);
}
// If this is the first go-round, record the new key.
if (termKey == null)
termKey = parentKey;
// On the second and subsequent go-rounds, record the relationship
// between the parent and its child
//
else
{
HashSet parentChildSet = (HashSet)childMap.get(parentKey);
if (parentChildSet == null) {
parentChildSet = new HashSet();
childMap.put(parentKey, parentChildSet);
}
parentChildSet.add(childKey);
}
// Stop when we reach the root.
if (curName.length() == 0)
break;
// Go up one level in the hierarchy.
childKey = parentKey;
int lastColon = curName.lastIndexOf("::");
if (lastColon >= 0)
curName = curName.substring(0, lastColon);
else
curName = "";
}
// Return the first key we made (for the term itself, not its ancestors.)
return termKey;
} // addTermKey()
/**
* Based on a hierarchy data map, build the parent, child, and sibling
* relationship arrays that make all this info easy to find and fast to
* traverse.
*
* @param childMap Map of parent key to vector of child keys
*/
private void buildHierarchy(HashMap childMap)
{
groupParents = new int[groups.length];
Arrays.fill(groupParents, -1);
groupChildren = new int[groups.length];
Arrays.fill(groupChildren, -1);
groupSiblings = new int[groups.length];
Arrays.fill(groupSiblings, -1);
for (Iterator iter = childMap.keySet().iterator(); iter.hasNext();)
{
Integer parentKey = (Integer)iter.next();
int parent = parentKey.intValue();
HashSet childSet = (HashSet)childMap.get(parentKey);
assert groupChildren[parent] < 0 : "multiple child lists for parent";
int prev = -1;
ArrayList children = new ArrayList(childSet);
Collections.sort(children);
for (int i = 0; i < children.size(); i++) {
int child = ((Integer)children.get(i)).intValue();
groupParents[child] = parent;
assert child != prev;
if (prev < 0)
groupChildren[parent] = child;
else
groupSiblings[prev] = child;
prev = child;
}
}
} // buildHierarchy()
/**
* Perform the final build step, forming the 'docs' and 'links' arrays.
*
* @param docMap Map of document ID to vector of group IDs
*/
private void buildLinks(HashMap docMap)
{
// Get an array of all the documents, sorted by document ID.
ArrayList keyList = new ArrayList(docMap.keySet());
assert keyList.size() == docs.length : "incorrect calculation";
Collections.sort(keyList);
int topLink = docs.length;
for (int i = 0; i < docs.length; i++)
{
Integer docKey = (Integer)keyList.get(i);
Vector docGroups = (Vector)docMap.get(docKey);
int docNum = docKey.intValue();
docs[i] = docNum;
// Two cases. If there's only one group, record it directly. Otherwise,
// record a link to a list of groups.
//
if (docGroups.size() == 1)
links[i] = ((Integer)docGroups.get(0)).intValue();
else
{
links[i] = -topLink;
for (Iterator iter = docGroups.iterator(); iter.hasNext();) {
int groupNum = ((Integer)iter.next()).intValue();
if (!iter.hasNext())
groupNum = -groupNum;
links[topLink++] = groupNum;
}
}
}
// We should have just the right number of links; no more, no less.
assert topLink == links.length : "incorrect calculation";
} // buildLinks()
/**
* Return the ID of the first link for the given document, or -1 if there
* are no links for that document.
*
* @param docId document to look for
* @return the first link ID, or -1 if none
*/
public final int firstLink(int docId)
{
int found = Arrays.binarySearch(docs, docId);
if (found < 0 || found >= docs.length)
return -1;
if (links[found] >= 0)
return found;
else
return -links[found];
} // getDocLink()
/** Return the ID of the link after the specified one, or -1 if no more */
public final int nextLink(int linkId) {
if (linkId < docs.length)
return -1;
else if (links[linkId] < 0)
return -1;
else
return linkId + 1;
} // getNextLink()
/** Returns the group number of the specified link */
public final int linkGroup(int linkId) {
int n = links[linkId];
return (n < 0) ? -n : n;
} // getLinkGroup()
/** Get the name of the grouping field */
public final String field() {
return field;
}
/** Get the total number of groups */
public final int nGroups() {
return groups.length;
}
/** Get the name of a group given its number */
public final String name(int groupId) {
return groups[groupId];
}
/** Get the parent of the given group, or -1 if group is the root */
public final int parent(int groupId) {
return groupParents[groupId];
}
/** Get the number of children a group has */
public final int nChildren(int groupId) {
int nChildren = 0;
for (int kid = groupChildren[groupId]; kid >= 0;
kid = groupSiblings[kid])
nChildren++;
return nChildren;
}
/** Get the first child of the given group, or -1 if it has no children */
public final int child(int groupId) {
return groupChildren[groupId];
}
/** Get the sibling of the given group, or -1 if no more */
public final int sibling(int groupId) {
return groupSiblings[groupId];
}
/** Compare two groups for sort order */
public final int compare(int group1, int group2) {
return (group1 < group2) ? -1 : ((group1 > group2) ? 1 : 0);
}
/** Locate a group by name and return its index, or -1 if not found */
public final int findGroup(String name)
{
name = name.intern();
for (int i = 0; i < groups.length; i++) {
if (name == groups[i])
return i;
}
return -1;
}
} // GroupData