/*******************************************************************************
* Copyright (c) 2012 György Orosz, Attila Novák.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the GNU Lesser Public License v3
* which accompanies this distribution, and is available at
* http://www.gnu.org/licenses/
*
* This file is part of PurePos.
*
* PurePos is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* PurePos is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser Public License for more details.
*
* Contributors:
* György Orosz - initial API and implementation
******************************************************************************/
package hu.ppke.itk.nlpg.purepos.model;
import java.io.Serializable;
import java.util.Map;
/**
* Implementors should implement a representation of a tree with word suffixes
* combined with suffix counts.
*
* @author György Orosz
*
* @param <W>
* Word type
* @param <T>
* Tag type
*/
public abstract class SuffixTree<W, T> implements Serializable {

    private static final long serialVersionUID = -684858638817631397L;

    /** Maximum length of the suffixes which are stored. */
    protected final int maxSuffixLength;

    /**
     * Stores the maximum suffix length for use by implementors.
     *
     * @param maxSuffixLength
     *            maximum length of the suffixes which are stored
     */
    public SuffixTree(int maxSuffixLength) {
        this.maxSuffixLength = maxSuffixLength;
    }

    /**
     * Adds a word with a specific tag and count to the representation.
     *
     * @param word
     *            word added
     * @param tag
     *            tag added
     * @param count
     *            tag count added
     */
    public abstract void addWord(W word, T tag, int count);

    /**
     * Adds a word with a specific tag, count and minimum length to the
     * representation.
     *
     * @param word
     *            word added
     * @param tag
     *            tag added
     * @param count
     *            tag count added
     * @param minLen
     *            minimum number of suffixes stored in the tree
     */
    public abstract void addWord(W word, T tag, int count, int minLen);

    /**
     * Creates the guesser object using the supplied theta.
     *
     * @param theta
     *            the theta value handed to the guesser
     * @return a suffix guesser
     */
    public abstract ISuffixGuesser<W, T> createGuesser(double theta);

    /**
     * Calculates theta from the apriori probabilities.
     *
     * The result is the square root of the sum, over all probabilities p, of
     * p * (p - E)^2, where E is the sum of the squared probabilities (the
     * weighted average E_{P_t}(P_t())). For details see libmoot.
     *
     * @param <T>
     *            tag type used as the map key
     * @param aprioriProbs
     *            apriori probability of each tag; only the values are read
     * @return the value of theta
     */
    public static <T> double calculateTheta(Map<T, Double> aprioriProbs) {
        // TODO: RESEARCH: understand how it really works -> weighted average of
        // stddev
        // TODO: it can be moved to some util class as a static method
        final double expectedProb = sumOfSquares(aprioriProbs);
        double weightedDeviation = 0;
        for (double prob : aprioriProbs.values()) {
            final double distance = prob - expectedProb;
            weightedDeviation += prob * Math.pow(distance, 2);
        }
        return Math.sqrt(weightedDeviation);
    }

    /**
     * Sums the squares of all values of the given map.
     */
    private static <T> double sumOfSquares(Map<T, Double> probs) {
        double total = 0;
        for (double prob : probs.values()) {
            total += Math.pow(prob, 2);
        }
        return total;
    }
}