package at.lux.retrieval.suffixtreemodel;
import java.util.*;
import java.util.logging.Logger;
/**
 * A single node of the suffix tree used for suffix tree based document similarity
 * calculation. Each node stores its outgoing edges together with per-document
 * traversal counts and, after training on a corpus, per-edge document frequencies.
 * <p/>
 * Date: 15.02.2006 <br>
 * Time: 20:31:57 <br>
 * Know-Center Graz, Inffeldgasse 21a, 8010 Graz, AUSTRIA <br>
 *
 * @author Mathias Lux, mlux@know-center.at
 */
public class SuffixTreeNode {
private Logger log = Logger.getLogger(SuffixTreeNode.class.getName());
private String token;
/**
 * Stores how often each document has traversed a given outgoing edge.
 * Used for similarity calculation.
 */
private HashMap<String, HashMap<Integer, Integer>> edge2document = new HashMap<String, HashMap<Integer, Integer>>();
/**
 * Stores which corpus documents have traversed each outgoing edge.
 * This is only needed to allow calculation of the IDF value
 * based on a corpus.
 */
private HashMap<String, HashSet<Integer>> idfCounter = new HashMap<String, HashSet<Integer>>();
private HashMap<String, SuffixTreeNode> edges = new HashMap<String, SuffixTreeNode>();
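/*
 * Illustration of the data structures (document ids are hypothetical): after adding
 * the suffix ["cat", "sat"] for document 1 and ["cat"] for document 2, this node holds
 *   edges:         {"cat" -> child node for "cat"}
 *   edge2document: {"cat" -> {1 -> 1, 2 -> 1}}
 * and, after preparing document frequencies on a corpus containing documents 1 and 2,
 *   idfCounter:    {"cat" -> {1, 2}}
 */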
public SuffixTreeNode(String token) {
this.token = token;
}
/**
 * Adds a suffix to the tree, creating edges and child nodes as needed and
 * counting the traversal of each edge for the given document.
 *
 * @param tokens the suffix as a list of tokens, must not be empty
 * @param id     the id of the document the suffix belongs to
 */
public void addSuffix(List<String> tokens, int id) {
// System.out.println("tokens = " + tokens);
String t = tokens.get(0);
// add edge if not here
if (!edges.containsKey(t)) {
edges.put(t, new SuffixTreeNode(t));
}
if (!edge2document.containsKey(t)) {
edge2document.put(t, new HashMap<Integer, Integer>(3));
}
// count how often the document with the given id has traversed this edge:
HashMap<Integer, Integer> docs = edge2document.get(t);
Integer count = docs.get(id);
docs.put(id, (count == null) ? 1 : count + 1);
// recurse with the remaining suffix if there is more than one token left:
if (tokens.size() > 1) {
edges.get(t).addSuffix(new ArrayList<String>(tokens.subList(1, tokens.size())), id);
}
}
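/*
 * Usage sketch (assumption: the surrounding SuffixTree class inserts every suffix
 * of a tokenized document like this; the root token "root" and the document id 1
 * are illustrative):
 *
 *   SuffixTreeNode root = new SuffixTreeNode("root");
 *   List<String> tokens = Arrays.asList("the", "cat", "sat");
 *   for (int start = 0; start < tokens.size(); start++) {
 *       root.addSuffix(new ArrayList<String>(tokens.subList(start, tokens.size())), 1);
 *   }
 */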
public boolean isLeaf() {
return (edges.isEmpty());
}
/**
 * Counts how many edges have been created or traversed by each document.
 * Edges that were traversed by more than one document are additionally counted
 * under the pseudo document id -1.
 *
 * @param document2edgeCount maps document ids to edge counts; it has to be
 *                           pre-initialized with a 0 entry for every document id
 *                           involved as well as for the pseudo id -1
 */
public void getEdgesTraversed(HashMap<Integer, Integer> document2edgeCount) {
// go through all local edges ...
for (String edge : edge2document.keySet()) {
Set<Integer> documents = edge2document.get(edge).keySet();
for (Integer docID : documents) {
// increment for document
document2edgeCount.put(docID, document2edgeCount.get(docID) + 1);
}
if (documents.size() > 1) {
// increment count for doc with id -1 (the edge count).
document2edgeCount.put(-1, document2edgeCount.get(-1) + 1);
}
}
// recurse into all children that are not leaves ...
for (String token : edges.keySet()) {
SuffixTreeNode node = edges.get(token);
if (!node.isLeaf() && edge2document.containsKey(token)) {
node.getEdgesTraversed(document2edgeCount);
}
}
}
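/*
 * Usage sketch (assumption: documents 1 and 2 have been added to the node "root"
 * via addSuffix; the counter map has to be pre-initialized as shown, including
 * the pseudo id -1):
 *
 *   HashMap<Integer, Integer> counts = new HashMap<Integer, Integer>();
 *   counts.put(1, 0);
 *   counts.put(2, 0);
 *   counts.put(-1, 0);
 *   root.getEdgesTraversed(counts);
 *   // counts.get(1)  - edges traversed by document 1
 *   // counts.get(-1) - edges traversed by more than one document
 */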
public void traverseEdges(TermFrequencyWalker walker, SuffixTree.SimilarityType similarityType) {
// go through all local edges ...
for (String s : edge2document.keySet()) {
Set<Integer> documents = edge2document.get(s).keySet();
int min = Integer.MAX_VALUE;
int max = 0;
if (edge2document.get(s).size() > 1) {
for (Integer docID : documents) {
Integer travCount = edge2document.get(s).get(docID);
min = Math.min(min, travCount);
max = Math.max(max, travCount);
}
} else {
min = 0;
max = 1;
}
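// example: if document 1 traversed this edge 2 times and document 2 traversed it
// 4 times, then min = 2 and max = 4, yielding a term frequency factor of 0.5 below;
// an edge traversed by only one document yields min = 0 and max = 1, i.e. factor 0.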
if (similarityType == SuffixTree.SimilarityType.TermFrequency) {
// term frequency: ratio of minimum to maximum traversal count
walker.addToSum((double) min / (double) max);
} else if (similarityType == SuffixTree.SimilarityType.IDF) {
int documentFrequency;
if (!idfCounter.containsKey(s)) {
// Note: the documents to check against were obviously not in the corpus!!
documentFrequency = edge2document.get(s).keySet().size();
log.warning("Note: the documents for similarity calculation were obviously not in the corpus!!");
} else {
documentFrequency = idfCounter.get(s).size();
}
double idf = Math.log((double) walker.getCountCorpusDocuments() / (double) documentFrequency);
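// example: with 100 corpus documents and a document frequency of 10 the
// IDF is ln(100/10), roughly 2.3.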
// the IDF is only added for edges traversed by more than one of the compared documents (min > 0):
if (min > 0)
walker.addToSum(idf);
} else if (similarityType == SuffixTree.SimilarityType.TFIDF) {
// TF*IDF: weight the term frequency factor with the IDF
double factor = (double) min / (double) max;
int documentFrequency;
if (!idfCounter.containsKey(s)) {
// Note: the documents to check against were obviously not in the corpus!!
documentFrequency = edge2document.get(s).keySet().size();
log.warning("Note: the documents for similarity calculation were obviously not in the corpus!!");
} else {
documentFrequency = idfCounter.get(s).size();
}
// double maxIdf = Math.log((double) walker.getCountCorpusDocuments());
double idf = Math.log((double) walker.getCountCorpusDocuments() / (double) documentFrequency);
// assert(factor <= 1.0);
// assert(idf / maxIdf <= 1.0);
// note: normalization of the IDF to the [0,1] range (division by maxIdf) is currently disabled:
walker.addToSum(factor * idf);
// walker.addToSum(factor * (idf / maxIdf));
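// example: a term frequency factor of 0.5 and an IDF of roughly 2.3 contribute about 1.15 to the sum.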
} else
throw new UnsupportedOperationException("Not implemented yet!");
walker.incrementCountEdges();
}
// recurse into all children that are not leaves and whose edge was traversed by the given documents ...
for (String token : edges.keySet()) {
SuffixTreeNode node = edges.get(token);
if (!node.isLeaf() && edge2document.containsKey(token)) {
node.traverseEdges(walker, similarityType);
}
}
}
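/*
 * Usage sketch (assumption: the TermFrequencyWalker instance is created and evaluated
 * by the surrounding SuffixTree class; only the walker methods called above are known here):
 *
 *   root.traverseEdges(walker, SuffixTree.SimilarityType.TFIDF);
 *   // the walker has now accumulated one summand per matching edge via addToSum(..)
 *   // and counted the visited edges via incrementCountEdges().
 */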
/**
 * Trains the tree on a document corpus by recording which corpus documents
 * traverse each edge, so that document frequencies are available for later
 * IDF calculation.
 *
 * @param tokens the suffix as a list of tokens, must not be empty
 * @param id     the id of the corpus document
 */
public void prepareDocumentFrequency(List<String> tokens, int id) {
String t = tokens.get(0);
// add edge if not here
if (!edges.containsKey(t)) {
edges.put(t, new SuffixTreeNode(t));
}
if (!idfCounter.containsKey(t)) {
idfCounter.put(t, new HashSet<Integer>());
}
idfCounter.get(t).add(id);
// recurse with the remaining suffix if there is more than one token left:
if (tokens.size() > 1) {
edges.get(t).prepareDocumentFrequency(new ArrayList<String>(tokens.subList(1, tokens.size())), id);
}
}
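/*
 * Usage sketch (assumption: the surrounding SuffixTree class feeds every suffix of
 * every corpus document into the tree; the document id 42 is illustrative):
 *
 *   List<String> tokens = Arrays.asList("the", "cat", "sat");
 *   for (int start = 0; start < tokens.size(); start++) {
 *       root.prepareDocumentFrequency(new ArrayList<String>(tokens.subList(start, tokens.size())), 42);
 *   }
 */
/**
 * Resets the per-document traversal counts (edge2document) in this node and all
 * non-leaf children so the tree can be reused for another similarity calculation.
 * The corpus document frequencies (idfCounter) are kept.
 */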
public void resetSimilarity() {
// reset edge2document
edge2document = new HashMap<String, HashMap<Integer, Integer>>();
// reset all children:
for (String edge : edges.keySet()) {
SuffixTreeNode node = edges.get(edge);
if (!node.isLeaf()) node.resetSimilarity();
}
}
}