package org.cdlib.xtf.textEngine.facet;
/**
* Copyright (c) 2006, Regents of the University of California
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the University of California nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Acknowledgements:
*
* A significant amount of new and/or modified code in this module
* was made possible by a grant from the Andrew W. Mellon Foundation,
* as part of the Melvyl Recommender Project.
*/
import java.io.IOException;
import java.text.DecimalFormat;
import java.util.HashMap;
import java.util.WeakHashMap;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermPositions;
import org.cdlib.xtf.util.IntMultiMap;
import org.cdlib.xtf.util.TagArray;
import org.cdlib.xtf.util.Trace;
/**
* Keeps a large in-memory table of the title, author, and other info for each
* document.
*
* @author Martin Haye
*/
public class FRBRData
{
  /**
   * Cached data, keyed by IndexReader. Entries vanish automatically when a
   * reader is garbage collected, since this is a WeakHashMap. Guarded by the
   * class lock: {@link #getCachedTags(IndexReader, String[])} is synchronized
   * because WeakHashMap is not thread-safe.
   */
  private static WeakHashMap cache = new WeakHashMap();

  /** Lowest valid tag type code */
  public static final int FIRST_TYPE = 1;

  /** Tag type: work title */
  public static final int TYPE_TITLE = 1;

  /** Tag type: author/creator */
  public static final int TYPE_AUTHOR = 2;

  /** Tag type: date/year */
  public static final int TYPE_DATE = 3;

  /** Tag type: record identifier */
  public static final int TYPE_ID = 4;

  /** Highest valid tag type code */
  public static final int LAST_TYPE = 4;

  /** Pool of all tag strings, each stored with its type and sub-type */
  public final TagArray tags;

  /** Mapping from document number to the tags attached to it */
  public final IntMultiMap docTags;

  /** Inverse mapping: from tag to the documents bearing that tag */
  public final IntMultiMap tagDocs;

  /**
   * Retrieves tags for a given set of fields from a given reader. Maintains a
   * cache so that if the same fields are requested again for this reader, we
   * don't have to re-read the tags.
   *
   * Synchronized because the static {@link #cache} is a WeakHashMap, which is
   * not thread-safe, and this method may be hit by concurrent search threads.
   *
   * @param reader Where to read the tags from
   * @param fields Which fields to read
   * @return FRBR tags for the specified fields
   */
  public static synchronized FRBRData getCachedTags(IndexReader reader,
                                                    String[] fields)
    throws IOException
  {
    // See if we have a cache for this reader.
    HashMap readerCache = (HashMap)cache.get(reader);
    if (readerCache == null) {
      readerCache = new HashMap();
      cache.put(reader, readerCache);
    }

    // Form a single cache key from the ordered list of field names.
    StringBuffer buf = new StringBuffer();
    for (int i = 0; i < fields.length; i++)
      buf.append(fields[i]).append('|');
    String allFields = buf.toString();

    // Now see if we've already read data for this set of fields.
    FRBRData tags = (FRBRData)readerCache.get(allFields);
    if (tags == null)
    {
      // Don't have cached data, so read and remember it.
      tags = new FRBRData(reader, fields);
      readerCache.put(allFields, tags);
    }
    return tags;
  } // getCachedTags()

  /**
   * Read tags for a given set of fields from the given reader. Do not construct
   * directly, but rather use {@link #getCachedTags(IndexReader, String[])}.
   *
   * @param reader Where to read the tags from
   * @param fields Which fields to read; each must be recognizable by
   *               {@link #calcType(String)}
   */
  private FRBRData(IndexReader reader, String[] fields)
    throws IOException
  {
    long startTime = System.currentTimeMillis();
    Trace.debug("Loading FRBR data...");
    Trace.tab();

    // First, allocate the tag array and all our types. The asserts verify
    // that TagArray assigns type codes in the order we declare them, matching
    // the TYPE_xxx constants above.
    tags = new TagArray();

    int tt;
    tt = tags.findType("title");
    assert tt == TYPE_TITLE;
    tt = tags.findType("author");
    assert tt == TYPE_AUTHOR;
    tt = tags.findType("date");
    assert tt == TYPE_DATE;
    tt = tags.findType("id");
    assert tt == TYPE_ID;

    // Next, allocate the mapping from document to tag.
    int maxDoc = reader.maxDoc();
    docTags = new IntMultiMap(maxDoc);

    // Read in each field.
    for (int i = 0; i < fields.length; i++)
    {
      String field = fields[i];

      // Identify the type from the field name.
      int type = calcType(field);

      // Read all the data, and add it to the map.
      Trace.debug("Reading FRBR field " + field + "...");
      long prevTagSize = tags.byteSize();
      int nTagsAdded = readField(reader, field, type);
      Trace.debug(
        "..." + nTagsAdded + " tags; " + (tags.byteSize() - prevTagSize) +
        " bytes.");
    } // for each field

    // Now construct the inverse mapping, from tag to document.
    Trace.debug("Inverting FRBR map...");
    tagDocs = new IntMultiMap(tags.size());
    for (int doc = 0; doc < maxDoc; doc++) {
      for (int link = docTags.firstPos(doc); link >= 0;
           link = docTags.nextPos(link))
        tagDocs.add(docTags.getValue(link), doc);
    }

    // Reverse the order of the docTags map so that iterations come out right.
    docTags.reverseOrder();

    // Display some statistics.
    Trace.debug(
      "Done. Size = " + tags.byteSize() + " tags, " + tagDocs.byteSize() +
      " map = " + (tags.byteSize() + tagDocs.byteSize()) + " total.");
    Trace.debug(
      "Time: " +
      DecimalFormat.getInstance().format(
        (System.currentTimeMillis() - startTime) / 1000.0f) + " sec");
    Trace.untab();
  } // constructor

  /**
   * Read all the term->document mappings from a given field, and add them to
   * the tag array and docTags mapping.
   *
   * @param reader Index to enumerate terms from
   * @param field  Lucene field whose terms to read
   * @param type   One of the TYPE_xxx constants, controlling tag filtering
   * @return number of tags added
   * @throws IOException if the enumeration starts with no term at all
   */
  private int readField(IndexReader reader, String field, int type)
    throws IOException
  {
    TermPositions termPositions = reader.termPositions();

    // Position the enumeration at the first term of this field (terms are
    // sorted by field, then text, so seeking to (field, "") lands on it).
    TermEnum termEnum = reader.terms(new Term(field, ""));
    try
    {
      if (termEnum.term() == null)
        throw new IOException("no terms in field " + field);

      int nTagsAdded = 0;
      do
      {
        Term term = termEnum.term();

        // Stop as soon as the enumeration walks past our field.
        if (!term.field().equals(field))
          break;

        // Add a tag for this term.
        int tag = addTag(term.text(), type);

        // Reject data we don't care about (addTag returns -1 for these).
        if (tag < 0)
          continue;
        else
          ++nTagsAdded;

        // Now process each document which contains this term.
        termPositions.seek(termEnum);
        while (termPositions.next()) {
          int doc = termPositions.doc();
          docTags.add(doc, tag);
        }
      } while (termEnum.next());
      return nTagsAdded;
    }
    finally {
      // Close both even if one close throws is not guaranteed here; order
      // matches original usage (positions first, then the enumeration).
      termPositions.close();
      termEnum.close();
    }
  } // readField()

  /**
   * Add a tag for the given term and type. Terms may carry a trailing numeric
   * sub-type in square brackets, e.g. "Some Title [245]"; it is stripped off
   * and recorded separately. Terms whose sub-type isn't relevant to the main
   * type are rejected.
   *
   * @param term Term to parse and add
   * @param type Type to add the tag under (one of the TYPE_xxx constants)
   * @return the new tag's index, or -1 if the term was rejected
   */
  private int addTag(String term, int type)
  {
    // First, strip off the sub-type, if any
    term = term.trim();
    int subType = 0;
    if (term.endsWith("]"))
    {
      int start = term.lastIndexOf('[');
      if (start >= 0)
      {
        try {
          String sub = term.substring(start + 1, term.length() - 1);
          subType = Integer.parseInt(sub);
        }
        catch (NumberFormatException e) {
          // Deliberate: a non-numeric bracket suffix simply means "no
          // sub-type" (subType stays 0); the suffix is still stripped below.
        }
        term = term.substring(0, start).trim();
      }
    }

    // Now perform special subType processing depending on main type. The
    // numeric sub-types correspond to MARC field numbers (245 = title,
    // 100/700 = author, 035/9xx = identifiers).
    switch (type)
    {
      case TYPE_TITLE:
        if (subType == 0)
          subType = 245;
        else if (subType != 245)
          return -1;
        break;
      case TYPE_AUTHOR:
        if (subType == 0)
          subType = 100;
        else if (subType != 100 && subType != 700)
          return -1;
        break;
      case TYPE_ID:
        if (subType == 0)
          subType = 35;
        else if (subType != 35 && subType != 900 && subType != 901)
          return -1;

        // If there isn't any substance to this identifier, skip it.
        if (term.indexOf('(') < 6)
          return -1;
        break;
      case TYPE_DATE:
        break;
    }

    // Ready to add the tag.
    return tags.add(term, type, subType);
  } // addTag()

  /**
   * Calculate the type of a given field, based on substrings of its name.
   * Checks run in order, so e.g. a field named "title-id" classifies as a
   * title, not an identifier.
   *
   * @param field Field name to classify
   * @return one of the TYPE_xxx constants
   * @throws IOException if the name matches no known pattern
   */
  private int calcType(String field)
    throws IOException
  {
    if (field.indexOf("title") >= 0)
      return TYPE_TITLE;
    else if (field.indexOf("author") >= 0 || field.indexOf("creator") >= 0)
      return TYPE_AUTHOR;
    else if (field.indexOf("date") >= 0 || field.indexOf("year") >= 0)
      return TYPE_DATE;
    else if (field.indexOf("id") >= 0 || field.indexOf("ID") >= 0)
      return TYPE_ID;
    else
      throw new IOException("Unable to identify field type: '" + field + "'");
  }
} // class FRBRData