/*
* #!
* Ontopia Classify
* #-
* Copyright (C) 2001 - 2013 The Ontopia Project
* #-
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* !#
*/
package net.ontopia.topicmaps.classify;
import java.util.Arrays;
import java.util.Comparator;
import net.ontopia.utils.ObjectUtils;
import gnu.trove.map.hash.TObjectIntHashMap;
import gnu.trove.iterator.TObjectIntIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* PUBLIC: Represents a concept which occurs in the classified
* content. A term can have many variants, all of which can be found
* from this object. It also has a score, indicating the importance of
* the term within the content.
*/
public class Term {

  // Define a logging category.
  static Logger log = LoggerFactory.getLogger(Term.class.getName());

  /** The stem shared by all variants of this term. */
  protected String stem;

  /** The current score of the term; starts out at 1.0. */
  protected double score = 1.0d;

  /** Sum of the occurrence counts registered for all variants. */
  protected int totalOccurrences;

  /** Maps each variant of the term to the number of times it occurred. */
  protected TObjectIntHashMap<Variant> variants = new TObjectIntHashMap<Variant>();

  /**
   * INTERNAL: Creates a term with the given stem, no variants and the
   * default score.
   *
   * @param stem the stem common to all variants of the term
   */
  Term(String stem) {
    this.stem = stem;
  }

  /**
   * PUBLIC: Returns the stem common to all variants of the term.
   * Often, the stem does not actually occur in the content.
   */
  public String getStem() {
    return stem;
  }

  /**
   * PUBLIC: Returns the term's score, a number in the range 0-1,
   * indicating its importance within the content.
   */
  public double getScore() {
    return score;
  }

  /**
   * PUBLIC: Returns all variant spellings of this term within the
   * content.
   */
  public Variant[] getVariants() {
    return variants.keys(new Variant[] {});
  }

  /**
   * PUBLIC: Returns all variant spellings of this term within the
   * content, with the most important first.
   */
  public Variant[] getVariantsByRank() {
    Variant[] ranked = getVariants();
    Arrays.sort(ranked, new VariantComparator());
    return ranked;
  }

  /**
   * PUBLIC: Returns the number of times the term occurred within the
   * classified content.
   */
  public int getOccurrences() {
    return totalOccurrences;
  }

  /**
   * PUBLIC: Returns the preferred variant of the term. This is a form
   * of the term which actually occurred in the classified content.
   * If the term has no variants, the stem is returned instead.
   */
  public String getPreferredName() {
    if (variants.isEmpty()) {
      return getStem();
    }

    Variant maxKey = null;
    int maxValue = -1;
    TObjectIntIterator<Variant> iter = variants.iterator();
    while (iter.hasNext()) {
      iter.advance();
      int thisValue = iter.value();
      Variant thisKey = iter.key();
      // Select the variant with most occurrences, or the lowest lexical
      // value if equal, for predictability. The first entry is always
      // accepted so that maxKey is never dereferenced (or returned) while
      // still null, whatever counts are stored in the map.
      if (maxKey == null
          || thisValue > maxValue
          || (thisValue == maxValue
              && thisKey.getValue().compareTo(maxKey.getValue()) < 0)) {
        maxValue = thisValue;
        maxKey = thisKey;
      }
    }
    return maxKey.getValue();
  }

  /**
   * INTERNAL: Returns the share of the term's total occurrences that
   * is accounted for by the given variant.
   *
   * NOTE: if no occurrences have been registered at all
   * (totalOccurrences is zero) the division yields NaN.
   *
   * @param v the variant to compute the share for
   * @return occurrences of the variant divided by the total occurrences
   */
  protected double getScore(Variant v) {
    return (1.0d * getOccurrences(v)) / totalOccurrences;
  }

  /**
   * INTERNAL: Returns the occurrence count stored for the given variant.
   *
   * @param variant the variant to look up
   * @return the value the variants map holds for the variant
   */
  protected int getOccurrences(Variant variant) {
    return variants.get(variant);
  }

  /**
   * INTERNAL: Replaces the term's score.
   *
   * @param score the new score; must be greater than zero
   * @param reason explanation of the change, used for debug logging only
   * @throws IllegalArgumentException if the score is zero or negative
   */
  protected void setScore(double score, String reason) {
    if (score <= 0.0d) {
      throw new IllegalArgumentException("Score is not nillable: " + score + " term: " + this);
    }
    log.debug(">{}< ={}, {}", stem, score, reason);
    this.score = score;
  }

  /**
   * INTERNAL: Adds the given amount to the term's score.
   *
   * @param ascore the amount to add
   * @param reason explanation of the change, used for debug logging only
   */
  protected void addScore(double ascore, String reason) {
    this.score += ascore;
    log.debug(">{}< +{}={}, {}", stem, ascore, score, reason);
  }

  /**
   * INTERNAL: Multiplies the term's score by the given factor.
   *
   * @param factor the factor to multiply the score by
   * @param reason explanation of the change, used for debug logging only
   */
  protected void multiplyScore(double factor, String reason) {
    this.score = score * factor;
    log.debug(">{}< *{}={}, {}", stem, factor, score, reason);
  }

  /**
   * INTERNAL: Divides the term's score by the given factor.
   *
   * @param factor the factor to divide the score by
   * @param reason explanation of the change, used for debug logging only
   */
  protected void divideScore(double factor, String reason) {
    this.score = score / factor;
    log.debug(">{}< /{}={}, {}", stem, factor, score, reason);
  }

  /**
   * INTERNAL: Registers a single occurrence of the given variant.
   *
   * @param variant the variant that occurred
   */
  protected void addVariant(Variant variant) {
    addVariant(variant, 1);
  }

  /**
   * INTERNAL: Registers the given number of occurrences of the variant,
   * updating both the per-variant count and the term's total.
   *
   * @param variant the variant that occurred
   * @param occurrences the number of occurrences to register
   */
  protected void addVariant(Variant variant, int occurrences) {
    // Add the full occurrence count to an existing entry (or create the
    // entry), so the per-variant counts always sum to totalOccurrences.
    variants.adjustOrPutValue(variant, occurrences, occurrences);
    totalOccurrences += occurrences;
  }

  /**
   * INTERNAL: Merges the other term into this one: the scores and total
   * occurrence counts are summed, the other term's variant counts are
   * added to this term's, and each of the other term's variants is
   * told to replace its term with this one. Merging a term with itself
   * does nothing.
   *
   * @param other the term to merge into this term
   */
  protected void merge(Term other) {
    if (other == this) {
      return;
    }

    this.score = this.score + other.score;
    this.totalOccurrences = this.totalOccurrences + other.totalOccurrences;

    TObjectIntIterator<Variant> iter = other.variants.iterator();
    while (iter.hasNext()) {
      iter.advance();
      Variant key = iter.key();
      int value = iter.value();
      // add to the existing count, or create the entry if it is new
      this.variants.adjustOrPutValue(key, value, value);
      key.replaceTerm(this);
    }
  }

  /**
   * Returns a debug representation consisting of the quoted stem, the
   * score and, if there are any, the list of variants.
   */
  @Override
  public String toString() {
    return "'" + getStem() + "'" + getScore() + ":"
        + (variants.isEmpty() ? "" : Arrays.asList(variants.keys()).toString());
  }

  /**
   * INTERNAL: Orders terms by score, highest score first.
   */
  protected static Comparator<Term> SCORE_COMPARATOR =
    new Comparator<Term>() {
      @Override
      public int compare(Term t1, Term t2) {
        return ObjectUtils.compare(t2.getScore(), t1.getScore()); // NOTE: reverse order
      }
    };

  /**
   * INTERNAL: Orders the variants of the enclosing term by occurrence
   * count, highest first; ties are broken by the variants' string
   * values in ascending order.
   */
  private class VariantComparator implements Comparator<Variant> {
    @Override
    public int compare(Variant v1, Variant v2) {
      int c = ObjectUtils.compare(getOccurrences(v2), getOccurrences(v1)); // NOTE: reverse order
      if (c != 0) {
        return c;
      }
      return v1.getValue().compareTo(v2.getValue());
    }
  }
}