package edu.hawaii.jmotif.performance;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;
import java.util.concurrent.Callable;
import org.hackystat.utilities.stacktrace.StackTrace;
import edu.hawaii.jmotif.sax.SAXFactory;
import edu.hawaii.jmotif.sax.alphabet.Alphabet;
import edu.hawaii.jmotif.sax.alphabet.NormalAlphabet;
import edu.hawaii.jmotif.text.SAXCollectionStrategy;
import edu.hawaii.jmotif.text.TextUtils;
import edu.hawaii.jmotif.text.WordBag;
import edu.hawaii.jmotif.timeseries.TSException;
/**
 * A {@link Callable} job that evaluates a single combination of SAX parameters (sliding window
 * size, PAA size, alphabet size and collection strategy) on a labeled train/test split.
 *
 * <p>
 * The job converts the training series into word bags, computes cosine-normalized TF*IDF weight
 * vectors for every class, classifies each test series by the highest cosine similarity and
 * reports the resulting accuracy and error as a comma-separated log line.
 *
 * <p>
 * Several helpers (z-normalization, PAA, SAX conversion) are implemented locally in this class
 * rather than delegated to library utilities.
 */
public class UCRClassificationSeriesJob implements Callable<String> {

  /** The latin alphabet, lower case letters a-z. */
  private static final char[] ALPHABET = { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
      'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z' };

  /** Field separator used in the log line. */
  private static final String COMMA = ",";

  private final SAXCollectionStrategy strategy;
  private final int windowSize;
  private final int paaSize;
  private final int alphabetSize;
  private final Map<String, List<double[]>> trainData;
  private final Map<String, List<double[]>> testData;

  /**
   * Constructor.
   *
   * <p>
   * Mind the argument order: the TEST data comes before the TRAIN data.
   *
   * @param strategy The SAX words collection strategy.
   * @param windowSize The sliding window size.
   * @param paaSize The PAA size, i.e. the SAX word length.
   * @param alphabetSize The SAX alphabet size.
   * @param testData The test series keyed by their class label.
   * @param trainData The training series keyed by their class label.
   */
  public UCRClassificationSeriesJob(SAXCollectionStrategy strategy, int windowSize, int paaSize,
      int alphabetSize, Map<String, List<double[]>> testData,
      Map<String, List<double[]>> trainData) {
    this.strategy = strategy;
    this.windowSize = windowSize;
    this.paaSize = paaSize;
    this.alphabetSize = alphabetSize;
    this.trainData = trainData;
    this.testData = testData;
  }

  /**
   * Runs the experiment.
   *
   * @return On success the string "ok_" followed by the log line built by
   * {@link #toLogStr(int[], SAXCollectionStrategy, double, double)}; on any exception the stack
   * trace of that exception rendered as a string (the exception is not propagated). If there are
   * no test series at all, accuracy and error are reported as NaN.
   */
  @Override
  public String call() throws Exception {
    try {
      // window, PAA, alphabet sizes and the strategy index, in the order the helpers expect
      int[] params = { windowSize, paaSize, alphabetSize, strategy.index() };

      // training bags -> TF*IDF -> unit-length class vectors
      List<WordBag> bags = TextUtils.labeledSeries2WordBags(trainData, params);
      HashMap<String, HashMap<String, Double>> tfidf = normalizeToUnitVectors(computeTFIDF(bags));

      int totalTestSample = 0;
      int totalPositiveTests = 0;
      for (Entry<String, List<double[]>> testClass : testData.entrySet()) {
        String currentClassUnderTest = testClass.getKey();
        for (double[] series : testClass.getValue()) {
          totalPositiveTests += classify(currentClassUnderTest, series, tfidf, params, strategy);
          totalTestSample++;
        }
      }

      double accuracy = (double) totalPositiveTests / (double) totalTestSample;
      double error = 1.0d - accuracy;
      // log the accuracy itself: recomputing it as (1 - error) adds floating point noise,
      // e.g. 0.3 would be reported as 0.30000000000000004
      return "ok_" + toLogStr(params, strategy, accuracy, error);
    }
    catch (Exception e) {
      // the failure is reported through the returned string instead of being thrown
      return StackTrace.toString(e);
    }
  }

  /**
   * Computes TF*IDF values.
   *
   * <p>
   * TF is computed as {@code 1 + ln(frequency)} and IDF as
   * {@code log10(totalDocs / documentsContainingTheWord)}. Words absent from a document get the
   * weight of zero, so every resulting vector covers the whole observed vocabulary.
   *
   * @param texts The collection of text documents for which the statistics need to be computed.
   * @return The map of source documents names to the word - tf*idf weight collections.
   */
  private HashMap<String, HashMap<String, Double>> computeTFIDF(Collection<WordBag> texts) {

    // the number of docs
    double totalDocs = texts.size();

    // the result. map of document names to the pairs word - tfidf weight
    HashMap<String, HashMap<String, Double>> res = new HashMap<String, HashMap<String, Double>>();

    // all observed words along with the number of documents each of them occurs in
    TreeMap<String, Integer> documentFrequency = new TreeMap<String, Integer>();
    for (WordBag bag : texts) {
      // here populate result map with empty entries
      res.put(bag.getLabel(), new HashMap<String, Double>());
      for (String word : bag.getWords().keySet()) {
        Integer seen = documentFrequency.get(word);
        documentFrequency.put(word, null == seen ? 1 : seen + 1);
      }
    }

    // outer loop - iterating over documents
    for (WordBag bag : texts) {
      HashMap<String, Integer> bagWords = bag.getWords();
      HashMap<String, Double> weights = res.get(bag.getLabel());

      // TF*IDF is computed for all WORDS ever seen in the set
      for (Entry<String, Integer> word : documentFrequency.entrySet()) {
        // by default it is zero
        double tfidf = 0;
        Integer frequency = bagWords.get(word.getKey());
        if (null != frequency) {
          // TF: natural logarithm, corrected for log(1) == 0 by adding 1
          double tfValue = 1.0D + Math.log(frequency.doubleValue());
          // IDF: decimal logarithm
          double idfValue = Math.log10(totalDocs / word.getValue().doubleValue());
          tfidf = tfValue * idfValue;
        }
        weights.put(word.getKey(), tfidf);
      }
    }
    return res;
  }

  /**
   * Computes a cosine normalization of TFIDF statistics: every weight of a document is divided by
   * the Euclidean norm of that document's weight vector. Zero weights stay zero, so an all-zero
   * vector is returned unchanged rather than turned into NaNs.
   *
   * @param data The data.
   * @return The normalized tfidf statistics (a new map, the input is not modified).
   */
  private HashMap<String, HashMap<String, Double>> normalizeToUnitVectors(
      HashMap<String, HashMap<String, Double>> data) {

    HashMap<String, HashMap<String, Double>> res = new HashMap<String, HashMap<String, Double>>();

    for (Entry<String, HashMap<String, Double>> doc : data.entrySet()) {
      HashMap<String, Double> weights = doc.getValue();

      // the normalization coefficient - all the elements must be divided by its value
      double sumOfSquares = 0D;
      for (double weight : weights.values()) {
        sumOfSquares += weight * weight;
      }
      double norm = Math.sqrt(sumOfSquares);

      HashMap<String, Double> unit = new HashMap<String, Double>(weights.size());
      for (Entry<String, Double> weight : weights.entrySet()) {
        double value = weight.getValue();
        // primitive comparison treats -0.0 as zero too, so zero is never divided by a zero norm
        unit.put(weight.getKey(), value == 0D ? 0D : value / norm);
      }
      res.put(doc.getKey(), unit);
    }
    return res;
  }

  /**
   * Translates the strategy into its lower case name. Currently not called from within this
   * class.
   *
   * @param strategy The strategy.
   * @return "classic", "exact", or "noreduction" for anything else (including null).
   */
  private String getStrategy(SAXCollectionStrategy strategy) {
    if (SAXCollectionStrategy.CLASSIC.equals(strategy)) {
      return "classic";
    }
    if (SAXCollectionStrategy.EXACT.equals(strategy)) {
      return "exact";
    }
    return "noreduction";
  }

  /**
   * Classifies a single series and tells whether the outcome matches the expected class.
   *
   * @param classKey The true class label of the series.
   * @param series The series to classify.
   * @param tfidf The class weight vectors keyed by the class label.
   * @param params The window size, PAA size and alphabet size at indexes 0, 1 and 2.
   * @param strategy The SAX words collection strategy.
   * @return 1 if the class with the highest cosine similarity equals (ignoring case) the true
   * class; 0 otherwise, and also 0 when there are no class vectors or when all similarities are
   * equal, i.e. when the series cannot be told apart.
   * @throws IndexOutOfBoundsException if the subseries extraction fails.
   * @throws TSException if error occurs.
   */
  private int classify(String classKey, double[] series,
      HashMap<String, HashMap<String, Double>> tfidf, int[] params, SAXCollectionStrategy strategy)
      throws IndexOutOfBoundsException, TSException {

    if (tfidf.isEmpty()) {
      // nothing to compare with - the series can not be classified
      return 0;
    }

    WordBag test = seriesToWordBag("test", series, params, strategy);

    double bestSimilarity = -1.0d;
    String bestClass = "";

    boolean first = true;
    double firstSimilarity = 0d;
    boolean allEqual = true;

    for (Entry<String, HashMap<String, Double>> e : tfidf.entrySet()) {
      double similarity = TextUtils.cosineSimilarity(test, e.getValue());
      if (first) {
        firstSimilarity = similarity;
        first = false;
      }
      else if (!(similarity == firstSimilarity)) {
        // written as a negated equality on purpose: a NaN counts as "different"
        allEqual = false;
      }
      if (similarity > bestSimilarity) {
        bestClass = e.getKey();
        bestSimilarity = similarity;
      }
    }

    if (!allEqual && bestClass.equalsIgnoreCase(classKey)) {
      return 1;
    }
    return 0;
  }

  /**
   * Computes the cosine similarity between the word bag and the weight vector: the dot product
   * over the words they share, divided by the product of both magnitudes. Currently not called
   * from within this class - {@code classify} uses {@code TextUtils.cosineSimilarity}.
   *
   * @param testSample The word bag.
   * @param weightVector The weight vector.
   * @return The cosine similarity.
   */
  private double cosineSimilarity(WordBag testSample, HashMap<String, Double> weightVector) {
    double dotProduct = 0;
    for (Entry<String, Integer> entry : testSample.getWords().entrySet()) {
      Double weight = weightVector.get(entry.getKey());
      if (null != weight) {
        dotProduct = dotProduct + entry.getValue().doubleValue() * weight.doubleValue();
      }
    }
    double m1 = magnitude(testSample.getWordsAsDoubles().values());
    double m2 = magnitude(weightVector.values());
    return dotProduct / (m1 * m2);
  }

  /**
   * Computes the Euclidean norm of the values.
   *
   * @param values The values.
   * @return The square root of the sum of squares.
   */
  private double magnitude(Collection<Double> values) {
    // a primitive accumulator - no boxing on every iteration
    double sumOfSquares = 0.0D;
    for (double v : values) {
      sumOfSquares = sumOfSquares + v * v;
    }
    return Math.sqrt(sumOfSquares);
  }

  /**
   * Converts the series into a bag of SAX words by sliding a window along it: each window is
   * z-normalized, PAA-approximated and mapped to letters. With the CLASSIC strategy a word is
   * skipped when its {@code SAXFactory.strDistance} to the previously kept word is zero; with
   * the EXACT strategy when it equals the previously kept word (ignoring case); any other
   * strategy keeps every word.
   *
   * @param label The label of the resulting bag.
   * @param series The series to convert.
   * @param params The window size, PAA size and alphabet size at indexes 0, 1 and 2.
   * @param strategy The SAX words collection strategy.
   * @return The bag of words; empty if the series is shorter than the window.
   * @throws IndexOutOfBoundsException if the subseries extraction fails.
   * @throws TSException if error occurs.
   */
  private WordBag seriesToWordBag(String label, double[] series, int[] params,
      SAXCollectionStrategy strategy) throws IndexOutOfBoundsException, TSException {

    WordBag resultBag = new WordBag(label);

    int window = params[0];
    int wordLength = params[1];
    int letters = params[2];

    int lastStart = series.length - window;
    if (lastStart < 0) {
      return resultBag;
    }

    // fetched once for all windows; assumes getCuts yields the same cut points for the same
    // alphabet size on every call - TODO confirm against the Alphabet implementation
    Alphabet normalAlphabet = new NormalAlphabet();
    double[] cuts = normalAlphabet.getCuts(letters);

    String previousWord = "";
    for (int i = 0; i <= lastStart; i++) {

      double[] paa = paa(zNormalize(subseries(series, i, window)), wordLength);
      char[] sax = ts2String(paa, cuts);
      String word = String.valueOf(sax);

      if (SAXCollectionStrategy.CLASSIC.equals(strategy)) {
        if (previousWord.length() > 0
            && SAXFactory.strDistance(sax, previousWord.toCharArray()) == 0) {
          continue;
        }
      }
      else if (SAXCollectionStrategy.EXACT.equals(strategy)) {
        if (previousWord.equalsIgnoreCase(word)) {
          continue;
        }
      }

      previousWord = word;
      resultBag.addWord(word);
    }
    return resultBag;
  }

  /**
   * Approximate the timeseries using PAA. If the timeseries has some NaN's (or infinite values)
   * they are handled as follows: 1) if all values of the piece are NaNs - the piece is
   * approximated as NaN, 2) if there are some (more or equal one) regular values in the piece -
   * the mean of those values is taken.
   *
   * <p>
   * When the series length is a multiple of the PAA size, every piece is the mean of
   * {@code length / paaSize} consecutive points. Otherwise every point is conceptually repeated
   * {@code paaSize} times and the expanded series is cut into {@code paaSize} pieces of
   * {@code length} elements each. The pieces are computed by index arithmetic, no intermediate
   * matrix is allocated; the summation order is the one of a column-wise reshape followed by
   * column means.
   *
   * @param ts The timeseries to approximate.
   * @param paaSize The desired length of approximated timeseries.
   * @return PAA-approximated timeseries.
   * @throws TSException if error occurs.
   * @throws IllegalArgumentException if the PAA size is not positive (and differs from the
   * series length).
   */
  private double[] paa(double[] ts, int paaSize) throws TSException {

    int len = ts.length;

    // check for the trivial case
    if (len == paaSize) {
      return Arrays.copyOf(ts, ts.length);
    }
    if (paaSize <= 0) {
      throw new IllegalArgumentException("PAA size must be positive, got: " + paaSize);
    }

    boolean evenSplit = (len % paaSize == 0);
    // the number of (possibly repeated) points contributing to each piece
    int pieceLength = evenSplit ? len / paaSize : len;

    double[] res = new double[paaSize];
    for (int piece = 0; piece < paaSize; piece++) {
      double sum = 0;
      int counter = 0;
      for (int i = 0; i < pieceLength; i++) {
        // position within the (virtually expanded) series; long guards against int overflow
        long position = (long) piece * pieceLength + i;
        double value = ts[(int) (evenSplit ? position : position / paaSize)];
        if (Double.isNaN(value) || Double.isInfinite(value)) {
          continue;
        }
        sum += value;
        counter += 1;
      }
      res[piece] = (counter == 0) ? Double.NaN : sum / counter;
    }
    return res;
  }

  /**
   * Z-Normalize timeseries to the mean zero and standard deviation of one.
   *
   * <p>
   * If the standard deviation does not exceed 0.001, the series is considered flat and every
   * regular point is replaced with the constant 0.1, while NaNs and infinities are kept as they
   * are.
   *
   * @param series The timeseries.
   * @return Z-normalized time-series.
   * @throws TSException if error occurs.
   */
  private double[] zNormalize(double[] series) throws TSException {

    double[] res = new double[series.length];

    // get mean and sdev, NaN's will be handled
    double mean = mean(series);
    double sd = stDev(series);

    if (sd <= 0.001D) {
      // special case: SD is close to zero, i.e. all the points are (almost) the same.
      // 0.001 is the flatness threshold, 0.1 is the magic fill value - presumably chosen to
      // fall into a middle band of the normal alphabet
      for (int i = 0; i < res.length; i++) {
        double value = series[i];
        res[i] = (Double.isInfinite(value) || Double.isNaN(value)) ? value : 0.1D;
      }
    }
    else {
      // normal case; note that a NaN standard deviation ends up here as well
      for (int i = 0; i < res.length; i++) {
        res[i] = (series[i] - mean) / sd;
      }
    }
    return res;
  }

  /**
   * Computes the mean value of timeseries, ignoring NaNs and infinite values.
   *
   * @param series The timeseries.
   * @return The mean value, or NaN if the series has no regular values.
   */
  private double mean(double[] series) {
    double sum = 0D;
    int count = 0;
    for (double tp : series) {
      if (!(Double.isNaN(tp) || Double.isInfinite(tp))) {
        sum += tp;
        count += 1;
      }
    }
    return (count > 0) ? sum / count : Double.NaN;
  }

  /**
   * Computes the (sample, n-1 denominator) standard deviation of timeseries, ignoring NaNs and
   * infinite values.
   *
   * <p>
   * NOTE(review): the result is NaN for a single regular value (0/0) and may be NaN for a nearly
   * constant series if rounding makes the numerator slightly negative. This is left as is on
   * purpose: the training words are produced outside of this class, and changing the arithmetic
   * here alone could make the test words differ from the training ones - verify against the
   * library implementation before changing.
   *
   * @param series The timeseries.
   * @return the standard deviation, or NaN if the series has no regular values.
   */
  private double stDev(double[] series) {
    double sumOfSquares = 0D;
    double sum = 0D;
    int count = 0;
    for (double tp : series) {
      if (!(Double.isNaN(tp) || Double.isInfinite(tp))) {
        sumOfSquares = sumOfSquares + tp * tp;
        sum = sum + tp;
        count += 1;
      }
    }
    if (count > 0) {
      double len = count;
      return Math.sqrt((len * sumOfSquares - sum * sum) / (len * (len - 1)));
    }
    return Double.NaN;
  }

  /**
   * Extract subseries out of series.
   *
   * @param series The series array.
   * @param start Start position
   * @param length Length of subseries to extract.
   * @return The subseries (a copy).
   * @throws IndexOutOfBoundsException If the requested range runs past the end of the series.
   */
  private double[] subseries(double[] series, int start, int length)
      throws IndexOutOfBoundsException {
    if (start + length > series.length) {
      throw new IndexOutOfBoundsException("Unable to extract subseries, series length: "
          + series.length + ", start: " + start + ", subseries length: " + length);
    }
    double[] res = new double[length];
    System.arraycopy(series, start, res, 0, length);
    return res;
  }

  /**
   * Converts the timeseries into string using given cuts intervals. Useful for not-normal
   * distribution cuts.
   *
   * @param vals The timeseries.
   * @param cuts The cut intervals.
   * @return The timeseries SAX representation.
   */
  private char[] ts2String(double[] vals, double[] cuts) {
    char[] res = new char[vals.length];
    for (int i = 0; i < vals.length; i++) {
      res[i] = num2char(vals[i], cuts);
    }
    return res;
  }

  /**
   * Get mapping of a number to char: the letter index is the number of leading cuts that are
   * less than or equal to the value. A NaN value compares false with every cut and is therefore
   * mapped to 'a'.
   *
   * @param value the value to map.
   * @param cuts the array of intervals; must have fewer than 26 elements, otherwise the index
   * may run past 'z'.
   * @return character corresponding to numeric value.
   */
  private char num2char(double value, double[] cuts) {
    int count = 0;
    while ((count < cuts.length) && (cuts[count] <= value)) {
      count++;
    }
    return ALPHABET[count];
  }

  /**
   * Builds the comma-separated log line: strategy name (for CLASSIC, EXACT and NOREDUCTION; any
   * other strategy contributes nothing), window size, PAA size, alphabet size, accuracy, error.
   *
   * @param params The window size, PAA size and alphabet size at indexes 0, 1 and 2.
   * @param strategy The SAX words collection strategy, must not be null.
   * @param accuracy The classification accuracy.
   * @param error The classification error.
   * @return The log line.
   */
  protected static String toLogStr(int[] params, SAXCollectionStrategy strategy, double accuracy,
      double error) {

    // a local, single-threaded buffer - no need for the synchronized StringBuffer
    StringBuilder sb = new StringBuilder();
    if (strategy.equals(SAXCollectionStrategy.CLASSIC)) {
      sb.append("CLASSIC,");
    }
    else if (strategy.equals(SAXCollectionStrategy.EXACT)) {
      sb.append("EXACT,");
    }
    else if (strategy.equals(SAXCollectionStrategy.NOREDUCTION)) {
      sb.append("NOREDUCTION,");
    }
    sb.append(params[0]).append(COMMA);
    sb.append(params[1]).append(COMMA);
    sb.append(params[2]).append(COMMA);
    sb.append(accuracy).append(COMMA);
    sb.append(error);
    return sb.toString();
  }

  /**
   * Mimics Matlab function for reshape: returns the n-by-m matrix whose elements are taken
   * column-wise from a. For a rectangular source holding fewer than n*m elements the method
   * fails with an {@link ArrayIndexOutOfBoundsException}; elements beyond the first n*m are
   * ignored.
   *
   * @param a the source matrix.
   * @param n number of rows in the new matrix.
   * @param m number of columns in the new matrix.
   *
   * @return reshaped matrix.
   */
  public static double[][] reshape(double[][] a, int n, int m) {
    int sourceRows = a.length;
    double[][] res = new double[n][m];
    // running column-major index shared by the source and the target
    int element = 0;
    for (int col = 0; col < m; col++) {
      for (int row = 0; row < n; row++) {
        res[row][col] = a[element % sourceRows][element / sourceRows];
        element++;
      }
    }
    return res;
  }

  /**
   * Computes column means for the matrix, ignoring NaNs and infinite values. The number of
   * columns is taken from the first row.
   *
   * @param a the input matrix.
   * @return the mean of every column, NaN for a column without regular values; an empty array
   * if the matrix has no rows.
   */
  public static double[] colMeans(double[][] a) {
    if (a.length == 0) {
      return new double[0];
    }
    int columns = a[0].length;
    double[] res = new double[columns];
    for (int col = 0; col < columns; col++) {
      double sum = 0;
      int counter = 0;
      for (int row = 0; row < a.length; row++) {
        double value = a[row][col];
        if (Double.isNaN(value) || Double.isInfinite(value)) {
          continue;
        }
        sum += value;
        counter += 1;
      }
      res[col] = (counter == 0) ? Double.NaN : sum / counter;
    }
    return res;
  }
}