package org.cdlib.xtf.textEngine;
/*
* Copyright (c) 2005, Regents of the University of California
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the University of California nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Acknowledgements:
*
* A significant amount of new and/or modified code in this module
* was made possible by a grant from the Andrew W. Mellon Foundation,
* as part of the Melvyl Recommender Project.
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.WeakHashMap;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermPositions;
import org.cdlib.xtf.util.Trace;
/**
* Holds a set of boost factors to apply to individual documents in the
* document set.
*
* @author Martin Haye
*/
public class BoostSet
{
/** Cached data. If the reader goes away, our cache will too. */
private static WeakHashMap cache = new WeakHashMap();
/** Field to find document keys in */
private String field;
/** Number of warnings emitted so far. After 10, we suppress them. */
private int nWarnings = 0;
/** Set of boost values, one per document ID */
private float[] boostByDoc;
/** Marker for the default value */
private static float DEFAULT_MARKER = -99.0f;
/**
* Retrieves BoostSet for a given File from a given reader. Maintains a cache
* so that if the same File is requested again for this reader, we don't have
* to re-read the boost data.
*
* @param indexReader Index to correlate the data to
* @param inFile Which file to read
* @param field Field used to key boost values
* @return Group data for the specified field
*/
public static BoostSet getCachedSet(IndexReader indexReader, File inFile,
String field)
throws IOException
{
// See if we have a cache for this reader.
HashMap readerCache = (HashMap)cache.get(indexReader);
if (readerCache == null) {
readerCache = new HashMap();
cache.put(indexReader, readerCache);
}
// Now see if we've already read data for this field.
BoostSet set = (BoostSet)readerCache.get(inFile);
if (set == null || !set.field.equals(field))
{
// Don't have cached data, so read and remember it.
set = new BoostSet(indexReader, inFile, field);
readerCache.put(inFile, set);
}
return set;
} // getCachedSet()
/** Get the boost factor associated with the given document, or the default
* boost value if not found.
*
* @param docId Document ID to look up
* @param defaultBoost What to return if not found
* @return Boost factor, or defaultBoost if not found.
*/
public final float getBoost(int docId, float defaultBoost) {
if (docId < 0 || docId >= boostByDoc.length)
return defaultBoost;
float ret = boostByDoc[docId];
if (ret == DEFAULT_MARKER)
return defaultBoost;
return ret;
} // getBoost()
/** Do not construct directly; use
* {@link #getCachedSet(IndexReader, File, String)}
* instead. Constructs a BoostSet by reading a file containing document
* key -> boost factor mappings, and correlating it with the keys in the
* given index reader.
*/
private BoostSet(IndexReader indexReader, File inFile, String field)
throws IOException
{
this.field = field;
Trace.debug("Loading boost set '" + inFile + "'...");
// Figure out the max doc ID, make an array that big, and fill it with
// a marker for the default value.
//
int maxDoc = indexReader.maxDoc();
boostByDoc = new float[maxDoc + 1];
Arrays.fill(boostByDoc, DEFAULT_MARKER);
// Iterate all the keys in the index.
DocIter docIter = null;
LineIter lineIter = null;
try
{
docIter = new DocIter(indexReader, field);
lineIter = new LineIter(new BufferedReader(new FileReader(inFile)));
// Process all matches
while (!docIter.done() && !lineIter.done())
{
String docKey = docIter.key();
String lineKey = lineIter.key();
int diff = docKey.compareTo(lineKey);
if (diff < 0)
{
//System.out.println( "Skipping doc " + docKey );
docIter.next();
continue;
}
else if (diff > 0) {
warn("Boost document key '" + lineKey + "' not found in index");
lineIter.next();
continue;
}
// Found a match.
int docId = docIter.docId();
if (docId >= 0 && docId < boostByDoc.length)
boostByDoc[docId] = lineIter.boost();
docIter.next();
lineIter.next();
}
// Warn about any leftover docs
while (!docIter.done())
{
//System.out.println( "Skipping doc " + docIter.key() );
docIter.next();
}
// Warn about any leftover lines
while (!lineIter.done()) {
warn("Boost document key '" + lineIter.key() + "' not found in index");
lineIter.next();
}
Trace.debug("... done loading boost set");
}
finally {
if (docIter != null)
docIter.close();
if (lineIter != null)
lineIter.close();
}
} // constructor
/**
* If less than 10 warnings have been emitted, we print this one out.
* Otherwise, we suppress it.
*
* @param msg The message to emit
*/
private void warn(String msg) {
++nWarnings;
if (nWarnings < 10)
Trace.warning(msg);
else if (nWarnings == 10)
Trace.warning("Further warnings suppressed.");
} // warn()
/**
* Iterates all the document keys in an index
*/
private class DocIter
{
boolean done = false;
String docKey;
String field;
TermPositions termPositions;
TermEnum termEnum;
/** Construct from an index reader */
DocIter(IndexReader indexReader, String field)
throws IOException
{
this.field = field;
termPositions = indexReader.termPositions();
termEnum = indexReader.terms(new Term(field, ""));
readDocKey();
}
/** Return true if there are no more documents to read */
boolean done() {
return done;
}
/** Gets the key of the current document */
String key() {
return docKey;
}
/** Gets the Lucene document ID of the current document */
int docId()
throws IOException
{
termPositions.seek(termEnum);
if (termPositions.next())
return termPositions.doc();
assert false : "error reading term positions";
return -1;
}
/** Advances to the next document in the index */
void next()
throws IOException
{
if (!termEnum.next())
done = true;
readDocKey();
}
/** Clean up */
void close()
throws IOException
{
termPositions.close();
termEnum.close();
}
/** Fetch the current document key; update done */
private void readDocKey()
{
Term term = termEnum.term();
if (!term.field().equals(field)) {
done = true;
return;
}
docKey = term.text();
}
} // class DocIter
/**
* Iterates all the lines in a boost file
*/
private class LineIter
{
BufferedReader reader;
boolean done = false;
String prevLineKey = "";
String lineKey;
float lineBoost;
/** Construct from a reader */
LineIter(BufferedReader reader)
throws IOException
{
this.reader = reader;
readLine();
}
/** Returns true if no more lines to read */
boolean done() {
return done;
}
/** Get the document key of the current line */
String key() {
return lineKey;
}
/** Get the boost factor of the current line */
float boost() {
return lineBoost;
}
/** Advance to the next line */
void next()
throws IOException
{
readLine();
}
/** Clean up */
void close()
throws IOException
{
reader.close();
}
/** Read the next line in the file */
private void readLine()
throws IOException
{
while (!done)
{
String line = reader.readLine();
if (line == null) {
done = true;
break;
}
int sepPos = line.indexOf('|');
if (sepPos < 0) {
warn("Boost line missing separator: '" + line + "'");
continue;
}
lineKey = line.substring(0, sepPos);
lineBoost = Float.parseFloat(line.substring(sepPos + 1));
if (lineKey.compareTo(prevLineKey) <= 0) {
Trace.error(
"Error: Boost set lines out of order: '" + prevLineKey +
"' came before '" + lineKey + "', but should come after.");
done = true;
break;
}
prevLineKey = lineKey;
// Got a valid line.
break;
}
} // readLine()
} // class LineIter
} // class BoostSet