package org.cdlib.xtf.textEngine;
/**
* Copyright (c) 2004, Regents of the University of California
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the University of California nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
import java.io.IOException;
import java.util.List;
import java.util.Set;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.FieldSpanSource;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.spans.FieldSpans;
import org.cdlib.xtf.util.AttribList;
/**
* Represents a query hit at the document level. May contain {@link Snippet}s
* if those were requested.
*
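* <p>A rough usage sketch (hypothetical caller code; how the hit is obtained
* from the text engine is outside this class):</p>
* <pre>
*   DocHit hit = ...;                       // produced by the text engine
*   String path = hit.filePath();           // lazily loads the docInfo chunk
*   for (int i = 0; i &lt; hit.nSnippets(); i++) {
*     Snippet snip = hit.snippet(i, true);  // true = fetch snippet text too
*     // snip.text now holds the marked-up snippet text
*   }
* </pre>
*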
* @author Martin Haye
*/
public class DocHitImpl extends DocHit
{
/** Used to load and format snippets */
private SnippetMaker snippetMaker;
/** Source of spans. Only valid during collection. */
private FieldSpanSource fieldSpanSource;
/** Spans per field */
private FieldSpans fieldSpans;
/** Array of pre-built snippets */
private Snippet[] snippets;
/** Index key for this document */
private String docKey;
/** Date the original source XML document was last modified */
@SuppressWarnings("unused")
private long fileDate = -1;
/** Record number of this document within the main file */
private int recordNum = 0;
/** Name of the subdocument within the main file, if any */
private String subDocument = null;
/** Total number of chunks for this document */
private int chunkCount = -1;
/** Document's meta-data fields (copied from the docInfo chunk) */
private AttribList metaData;
/** Explanation of this document's score */
private Explanation explanation;
/**
* Construct a document hit. Package-private because these should only
* be constructed inside the text engine.
*
* @param docNum Lucene ID for the document info chunk
* @param score Score for this hit
*/
DocHitImpl(int docNum, float score) {
super(docNum, score);
}
/**
* Sets the source from which spans for this document are fetched (the
* source also handles span deduplication).
*/
void setSpanSource(FieldSpanSource src) {
this.fieldSpanSource = src;
}
/**
* Called after all hits have been gathered to normalize the scores and
* associate a snippetMaker for later use.
*
* @param snippetMaker Will be used later by snippet() to actually
* create the snippets.
* @param docScoreNorm Multiplied into the document's score
*/
void finish(SnippetMaker snippetMaker, float docScoreNorm)
{
// Don't do this twice.
if (this.snippetMaker != null)
return;
// Record the snippet maker... we'll use it later if loading is
// necessary.
//
this.snippetMaker = snippetMaker;
// Adjust our score.
score *= docScoreNorm;
} // finish()
/**
* Called after all hits have been gathered to normalize the scores and
* associate a snippetMaker for later use. Also calculates an explanation
* of the score.
*
* @param snippetMaker Will be used later by snippet() to actually
* create the snippets.
* @param docScoreNorm Multiplied into the document's score
* @param weight The query weight that will be used to calculate
* an explanation.
* @param boostSet The boost set used, or null if none
* @param boostParams Other boost set parameters (e.g. exponent)
*/
void finishWithExplain(SnippetMaker snippetMaker, float docScoreNorm,
Weight weight, BoostSet boostSet,
BoostSetParams boostParams)
throws IOException
{
// Don't do this twice.
if (this.snippetMaker != null)
return;
// Do the normal work first.
finish(snippetMaker, docScoreNorm);
// And figure out an explanation.
explanation = weight.explain(snippetMaker.reader, doc);
// Add any boost set stuff if necessary.
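// Illustration only (made-up numbers): with a boost factor of 0.8 and an
// exponent of 2.0, the tree built below renders roughly as:
//   0.423 = boosted, product of:
//     0.640 = exponentBoosted
//       0.800 = boostSetFactor
//       2.000 = boostSetExponent
//     0.661 = ...underlying explanation from weight.explain() above...
//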
if (boostSet != null)
{
Explanation result = new Explanation(0, "boosted, product of:");
Explanation boostExpl = new Explanation(boostSet.getBoost(
doc,
boostParams.defaultBoost),
"boostSetFactor");
if (boostParams.exponent != 1.0f) {
Explanation exponentExpl = new Explanation(
(float)Math.pow(boostExpl.getValue(), boostParams.exponent),
"exponentBoosted");
exponentExpl.addDetail(boostExpl);
exponentExpl.addDetail(new Explanation(boostParams.exponent,
"boostSetExponent"));
boostExpl = exponentExpl;
}
result.addDetail(boostExpl);
result.addDetail(explanation);
result.setValue(boostExpl.getValue() * explanation.getValue());
explanation = result;
}
} // finishWithExplain()
/**
* Read in the document info chunk and record the path, date, etc. that
* we find there.
*/
private void load()
{
// Read in our fields
Document docContents;
try {
assert !snippetMaker.reader.isDeleted(doc);
docContents = snippetMaker.reader.document(doc);
}
catch (IOException e) {
throw new HitLoadException(e);
}
// The query request may have specified a limited set of fields to return.
Set returnMetaFields = snippetMaker.returnMetaFields();
// Record the ones of interest.
metaData = new AttribList();
for (Field f : (List<Field>)docContents.getFields())
{
String name = f.name();
String value = f.stringValue();
if (name.equals("key"))
docKey = value;
else if (name.equals("fileDate")) {
try {
fileDate = DateTools.stringToTime(value);
}
catch (java.text.ParseException e1) {
// Ignore unparseable dates; fileDate simply stays -1.
}
}
else if (name.equals("chunkCount"))
chunkCount = Integer.parseInt(value);
else if (name.equals("recordNum"))
recordNum = Integer.parseInt(value);
else if (name.equals("subDocument"))
subDocument = value;
else if (name.equals("docInfo"))
; // skip the docInfo field since it's internal
else if (returnMetaFields == null || returnMetaFields.contains(name))
{
// Note: We cannot use f.isTokenized() below, because in the case of
// facet values we tokenize in Lucene-land but in XTF land
// consider them to be un-tokenized. Hence the use of
// snippetMaker.tokFields() instead.
//
loadMetaField(name, value, docContents, metaData,
snippetMaker.tokFields().contains(f.name()));
}
}
// We should have gotten at least the special fields.
assert docKey != null : "Incomplete data in index - missing 'key'";
assert chunkCount != -1 : "Incomplete data in index - missing 'chunkCount'";
} // load()
/**
* Performs all the manipulations and marking for a meta-data field.
*
* @param name Name of the field
* @param value Raw string value of the field
* @param docContents Document the field came from (used when marking hits)
* @param metaData Where to put the resulting data
* @param isTokenized true if the field was tokenized and should be
* marked.
*/
private void loadMetaField(String name, String value, Document docContents,
AttribList metaData, boolean isTokenized)
{
// First, mark up the value.
String markedValue;
if (isTokenized)
markedValue = snippetMaker.markField(docContents, fieldSpans, name, value);
else
markedValue = value;
// Now fix up the result. This involves three transformations:
// (1) Strip the special start-of-field and end-of-field tokens;
// (2) Insert proper <element>...</element> tags if they were left out
// to save index space; and
// (3) Lucene will fail subtly if we add two fields with the same
// name. Basically, the terms for each field are added at
// overlapping positions, causing a phrase search to easily
// span them. To counter this, the text indexer artificially
// introduces bump markers between them. And now, we reverse
// the process so it's invisible to the end-user.
//
// Note: the previous version of this code did not handle XML elements
// as the only content of a metadata field, and did not handle
// XML in multiple valued fields.
//
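// Worked example (hypothetical values): for a two-valued "creator" field,
// the marked-up value arrives roughly as
//   [START]Smith, John[BUMP]5[BUMP] Doe, Jane[END]
// (where [START]/[END]/[BUMP] stand for the Constants marker chars), and
// the loop below emits two metaData entries:
//   creator -> "<creator>Smith, John</creator>"
//   creator -> "<creator>Doe, Jane</creator>"
//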
StringBuilder buf = new StringBuilder(markedValue.length() * 2);
char[] chars = markedValue.toCharArray();
for (int i = 0; i < chars.length; i++)
{
char c = chars[i];
// Insert element start tag. There will be a placeholder if there were
// attributes, otherwise we have to fill in the whole thing.
//
if (c == '<' && i < markedValue.length()-2 && markedValue.charAt(i+1) == '$') {
buf.append('<');
buf.append(name);
i++; // skip $
continue;
}
else if (buf.length() == 0) {
buf.append('<');
buf.append(name);
buf.append(">");
}
// Copy normal characters.
if (c != Constants.FIELD_START_MARKER &&
c != Constants.FIELD_END_MARKER &&
c != Constants.BUMP_MARKER)
{
buf.append(c);
}
// At end of field (or subfield), insert element end tag and write the metadata.
if (i == chars.length-1 || c == Constants.BUMP_MARKER) {
buf.append("</");
buf.append(name);
buf.append('>');
String cleanedVal = buf.toString();
metaData.put(name, cleanedVal);
buf.delete(0, buf.length());
// Bump markers come in pairs; skip past the second one.
if (c == Constants.BUMP_MARKER) {
i++;
while (i < chars.length && chars[i] != Constants.BUMP_MARKER)
i++;
i++;
// Eat extra space after the bump marker
if (i < chars.length && Character.isWhitespace(chars[i]))
i++;
i--; // because loop above will increment it
}
}
} // for i
} // loadMetaField()
/**
* Fetch a set that can be used to check whether a given term is present
* in the original query that produced this hit.
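*
* <p>A hypothetical use, e.g. deciding whether to highlight a word (note
* the query's terms may be normalized, e.g. lowercased, by the analyzer):</p>
* <pre>
*   Set terms = docHit.textTerms();
*   boolean highlight = (terms != null) &amp;&amp; terms.contains("whale");
* </pre>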
*/
public Set textTerms() {
if (fieldSpans == null) {
if (fieldSpanSource != null)
fieldSpans = fieldSpanSource.getSpans(doc);
else
return null;
}
return fieldSpans.getTerms("text");
}
/**
* Retrieve the original file path as recorded in the index (if any).
*/
public final String filePath() {
if (docKey == null)
load();
return docKey;
} // filePath()
/**
* Retrieve the record number of this document within the main file, or
* zero if this is the only record.
*/
public final int recordNum() {
if (docKey == null)
load();
return recordNum;
} // recordNum()
/**
* Retrieve the subdocument name of this section within the main
* file, if any.
*/
public final String subDocument() {
if (docKey == null)
load();
return subDocument;
}
/**
* Retrieve a list of all meta-data name/value pairs associated with this
* document.
*/
public final AttribList metaData() {
if (docKey == null)
load();
return metaData;
}
/**
* Return the total number of snippets found for this document (not the
* number actually returned, which is limited by the max # of snippets
* specified in the query).
*/
public final int totalSnippets() {
if (fieldSpans == null) {
if (fieldSpanSource != null)
fieldSpans = fieldSpanSource.getSpans(doc);
else
return 0;
}
return fieldSpans.getSpanTotal("text");
}
/**
* Return the number of snippets available (limited by the max # specified
* in the original query).
*/
public final int nSnippets() {
if (fieldSpans == null) {
if (fieldSpanSource != null)
fieldSpans = fieldSpanSource.getSpans(doc);
else
return 0;
}
return fieldSpans.getSpanCount("text");
}
/**
* Retrieve the specified snippet.
*
* @param hitNum Which snippet to fetch, in the range 0..nSnippets()-1
* @param getText true to fetch the snippet text in context, false to
* only fetch the rank, score, etc.
*/
public final Snippet snippet(int hitNum, boolean getText)
{
// Fetch the spans if we haven't already (mirrors nSnippets() above, in
// case snippet() is called before any of the other span accessors).
//
if (fieldSpans == null && fieldSpanSource != null)
fieldSpans = fieldSpanSource.getSpans(doc);
// If we haven't built the snippets yet (or if we didn't get the
// text for them), do so now.
//
if (snippets == null || (getText && snippets[hitNum].text == null))
snippets = snippetMaker.makeSnippets(fieldSpans, doc, "text", getText);
// Return the pre-built snippet.
return snippets[hitNum];
} // snippet()
/** Retrieve an explanation of this document's score */
public Explanation explanation() {
return explanation;
}
} // class DocHitImpl