package com.bigdata.search;
import java.util.LinkedHashMap;
import java.util.Map;
import org.apache.lucene.analysis.Token;
/**
* Models the term-frequency data associated with a single field of some
* document.
*
* @param <V>
* The generic type of the document identifier.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
* @version $Id$
*/
public class TermFrequencyData<V extends Comparable<V>> {

    /**
     * The document identifier. This will be <code>null</code> when indexing a
     * query.
     */
    final public V docId;

    /** The field identifier. */
    final public int fieldId;

    /** The #of terms added (includes duplicates). */
    private int totalTermCount = 0;

    /**
     * The set of distinct tokens and their {@link ITermMetadata}. Tokens
     * recorded through {@link #add(String)} are kept in the order in which
     * they were first added.
     */
    final public LinkedHashMap<String, ITermMetadata> terms = new LinkedHashMap<String, ITermMetadata>();

    /**
     * Creates the term-frequency data for one field of a document and records
     * the first token for that field.
     *
     * @param docId
     *            The document identifier (<code>null</code> when indexing a
     *            query).
     * @param fieldId
     *            The field identifier.
     * @param token
     *            The first token to record; it is passed to
     *            {@link #add(String)}.
     */
    public TermFrequencyData(final V docId, final int fieldId,
            final String token) {

        // Note: docId is deliberately not validated since it will be null
        // when indexing a query.
        this.docId = docId;

        this.fieldId = fieldId;

        add(token);

    }

    /**
     * Record one occurrence of a token. The first occurrence of a token
     * creates its {@link TermMetadata} entry in {@link #terms}; every
     * occurrence invokes {@link ITermMetadata#add()} on that entry and
     * increments the total term count.
     *
     * @param token
     *            The text of the token.
     *
     * @return true iff the token did not previously exist for this
     *         {@link TermFrequencyData}.
     */
    public boolean add(final String token) {

        ITermMetadata termMetadata = terms.get(token);

        final boolean newTerm = (termMetadata == null);

        if (newTerm) {

            termMetadata = new TermMetadata();

            terms.put(token, termMetadata);

        }

        termMetadata.add();

        totalTermCount++;

        return newTerm;

    }

    /**
     * The #of distinct terms.
     */
    public int distinctTermCount() {

        return terms.size();

    }

    /**
     * The total #of terms, including duplicates.
     */
    public int totalTermCount() {

        return totalTermCount;

    }

    /**
     * Computes the normalized term-frequency vector. The magnitude of the
     * term frequency vector is computed from the term frequency values
     * reported by {@link ITermMetadata#termFreq()}. Each term frequency is
     * then divided by that magnitude and the result is set using
     * {@link ITermMetadata#setLocalTermWeight(double)}, which yields a unit
     * vector whenever the magnitude is non-zero.
     * <p>
     * If the magnitude is zero (every term frequency is zero) then all local
     * term weights are set to <code>0.0</code> rather than <code>NaN</code>.
     *
     * @return The magnitude of the un-normalized term frequency vector.
     */
    public double normalize() {

        /*
         * Compute magnitude (square root of the sum of squares).
         */
        double sumOfSquares = 0d;

        for (ITermMetadata md : terms.values()) {

            final double termFreq = md.termFreq();

            sumOfSquares += (termFreq * termFreq);

        }

        final double magnitude = Math.sqrt(sumOfSquares);

        /*
         * normalizedWeight = termFreq / magnitude for each term.
         */
        for (ITermMetadata md : terms.values()) {

            final double termFreq = md.termFreq();

            // A zero vector can not be normalized: 0/0 would be NaN, so the
            // weight is reported as zero instead.
            final double weight = (magnitude == 0d) ? 0d : (termFreq / magnitude);

            md.setLocalTermWeight(weight);

        }

        return magnitude;

    }

    /**
     * Return the only entry in {@link #terms}.
     *
     * @return The sole (token, metadata) entry.
     *
     * @throws IllegalStateException
     *             unless {@link #terms} contains exactly one entry.
     */
    public Map.Entry<String, ITermMetadata> getSingletonEntry() {

        final int size = terms.size();

        if (size != 1) {

            throw new IllegalStateException("not a singleton: size=" + size);

        }

        return terms.entrySet().iterator().next();

    }

}