package org.cdlib.xtf.textEngine;
/**
* Copyright (c) 2006, Regents of the University of California
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the University of California nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Acknowledgements:
*
* A significant amount of new and/or modified code in this module
* was made possible by a grant from the Andrew W. Mellon Foundation,
* as part of the Melvyl Recommender Project.
*/
import java.io.IOException;
import java.util.HashMap;
import java.util.WeakHashMap;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.util.IntList;
import org.apache.lucene.util.LongList;
/**
* Holds numeric data for a field from a Lucene index. Data is cached for a
* given index reader, to speed access after the initial load.
*
* @author Martin Haye
*/
public class NumericFieldData
{
  /**
   * Cached data, keyed by index reader. Each value is a HashMap that maps a
   * field name to the NumericFieldData loaded for it. Because the outer map
   * holds its keys weakly, if the reader goes away, our cache will too.
   *
   * NOTE(review): neither this map nor {@link #getCachedData} is
   * synchronized; confirm that callers serialize access if readers are
   * shared between threads.
   */
  private static final WeakHashMap cache = new WeakHashMap();

  /** Document IDs containing values for the field */
  private final IntList docs = new IntList();

  /** Associated numeric value for each document, parallel to {@link #docs} */
  private final LongList values = new LongList();

  /**
   * Retrieves numeric data for a given field from a given reader. Maintains
   * a cache so that if the same field is requested again for this reader, we
   * don't have to re-read the index.
   *
   * @param reader  Where to read the data from
   * @param field   Which field to read
   * @return        Numeric data for the specified field
   * @throws IOException if reading the index fails, if the term enumeration
   *                     is already exhausted when positioned at the field,
   *                     or if a document holds more than one value
   */
  public static NumericFieldData getCachedData(IndexReader reader, String field)
    throws IOException
  {
    // See if we have a cache for this reader.
    HashMap readerCache = (HashMap)cache.get(reader);
    if (readerCache == null) {
      readerCache = new HashMap();
      cache.put(reader, readerCache);
    }

    // Now see if we've already read data for this field.
    NumericFieldData data = (NumericFieldData)readerCache.get(field);
    if (data == null)
    {
      // Don't have cached data, so read and remember it. If loading fails,
      // nothing is stored, so a later call will try again.
      data = new NumericFieldData(reader, field);
      readerCache.put(field, data);
    }
    return data;
  } // getCachedData()

  /**
   * Parse the numeric characters of a string, ignoring all non-digits.
   *
   * Every character for which {@link Character#digit(char, int)} (radix 10)
   * yields a digit contributes to the result; everything else, including
   * signs and decimal points, is skipped. Thus "1,234" gives 1234 and
   * "-12.5" gives 125. A string with no digits gives 0. Overflow is not
   * detected; the result silently wraps.
   *
   * @param str   Text to parse; must not be null
   * @return      The value formed by the decimal digits of the string
   */
  public static long parseVal(String str)
  {
    long ret = 0;
    for (int i = 0; i < str.length(); i++)
    {
      int digit = Character.digit(str.charAt(i), 10);
      if (digit >= 0) {
        ret = (ret * 10) + digit;
      }
    }
    return ret;
  }

  /**
   * Load data from the given field of the reader, and parse the values as
   * numbers.
   *
   * @param reader  Index to read from
   * @param field   Name of the field whose terms should be loaded
   * @throws IOException if reading fails, if the term enumeration is
   *                     exhausted at the starting position, or if a
   *                     document has more than one value for the field
   */
  private NumericFieldData(IndexReader reader, String field)
    throws IOException
  {
    // Acquire each resource immediately before the try that guards it, so
    // that a failure while opening the second one cannot leak the first,
    // and a failure closing one cannot prevent closing the other.
    TermDocs termDocs = reader.termDocs();
    try
    {
      TermEnum termEnum = reader.terms(new Term(field, ""));
      try {
        load(termEnum, termDocs, field);
      }
      finally {
        termEnum.close();
      }
    }
    finally {
      termDocs.close();
    }

    // Save space.
    docs.compact();
    values.compact();

    sortByDoc();
    checkSingleValued(field);
  } // constructor

  /**
   * Collect all the doc/value pairs for the terms of the given field,
   * starting at the enumeration's current position.
   */
  private void load(TermEnum termEnum, TermDocs termDocs, String field)
    throws IOException
  {
    if (termEnum.term() == null)
      throw new IOException("no terms in field " + field);

    do
    {
      Term term = termEnum.term();

      // Stop at the first term belonging to a different field. Compare by
      // value rather than identity, so the result does not depend on the
      // caller having passed an interned string.
      if (!term.field().equals(field))
        break;

      String termText = term.text();
      if (hasFieldMarker(termText))
        continue; // in a do-while this proceeds to termEnum.next()

      long value = parseVal(termText);
      termDocs.seek(termEnum);
      while (termDocs.next()) {
        docs.add(termDocs.doc());
        values.add(value);
      }
    } while (termEnum.next());
  } // load()

  /**
   * Tells whether a term carries one of the special XTF field markers (at
   * the start or at the end). Terms of a single character are never treated
   * as marked.
   */
  private static boolean hasFieldMarker(String termText)
  {
    if (termText.length() <= 1)
      return false;
    return termText.charAt(0) == Constants.FIELD_START_MARKER ||
           termText.charAt(termText.length() - 1) == Constants.FIELD_END_MARKER;
  }

  /**
   * Sort by document ID, and apply the same ordering to the values, to keep
   * them in sync.
   */
  private void sortByDoc()
  {
    int[] map = docs.calcSortMap();
    docs.remap(map);
    values.remap(map);
  }

  /**
   * Check to be sure no documents have multiple values. Relies on the
   * entries already being sorted by document ID, so that duplicates are
   * adjacent.
   */
  private void checkSingleValued(String field)
    throws IOException
  {
    for (int i = 1; i < docs.size(); i++)
    {
      if (docs.get(i - 1) == docs.get(i)) {
        throw new IOException(
          "A document contains more than one value in numeric field '" +
          field + "': values " + values.get(i - 1) + " and " + values.get(i));
      }
    } // for
  }

  /** Number of doc/value pairs held. */
  public final int size() {
    return docs.size();
  }

  /** Document ID of the entry at the given index (entries are sorted by ID). */
  public final int doc(int index) {
    return docs.get(index);
  }

  /** Numeric value of the entry at the given index. */
  public final long value(int index) {
    return values.get(index);
  }

  /**
   * Locate the entry for a document ID. Returns the index of the entry if
   * the binary search finds it; otherwise converts the negative search
   * result into a non-negative index.
   *
   * NOTE(review): assumes IntList.binarySearch() reports a miss as
   * (-insertionPoint - 1), as java.util.Arrays does; if so the value
   * returned for a miss is the insertion point, which may equal size().
   */
  public final int findDocIndex(int docId) {
    int idx = docs.binarySearch(docId);
    if (idx >= 0)
      return idx;
    else
      return -idx - 1; // from -ins - 1
  }

  /**
   * Raw binary-search result for a document ID: the entry's index when
   * found, otherwise whatever IntList.binarySearch() reports for a miss
   * (presumably a negative number; see findDocIndex()).
   */
  public final int docPos(int docId) {
    return docs.binarySearch(docId);
  }
} // class NumericFieldData