package edu.hawaii.jmotif.performance;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.Stack;
import java.util.TreeSet;
import java.util.concurrent.Callable;
import java.util.concurrent.atomic.AtomicInteger;
import org.hackystat.utilities.stacktrace.StackTrace;
import edu.hawaii.jmotif.algorithm.MatrixFactory;
import edu.hawaii.jmotif.sax.SAXFactory;
import edu.hawaii.jmotif.sax.alphabet.Alphabet;
import edu.hawaii.jmotif.sax.alphabet.NormalAlphabet;
import edu.hawaii.jmotif.text.SAXCollectionStrategy;
import edu.hawaii.jmotif.text.WordBag;
import edu.hawaii.jmotif.timeseries.TSException;

/**
 * A cross-validation job for a single (window, PAA, alphabet, strategy) parameter combination.
 *
 * <p>
 * Every training series is held out in turn (in groups of at most {@code validationSampleSize}
 * series of the same class). For each group, one word bag per class is built from the remaining
 * series, TF*IDF weights are computed over those bags, and each held-out series is assigned to the
 * class whose weight vector has the largest cosine similarity with the series' own word bag.
 */
public class UCRKNNloocvJob implements Callable<String> {

  /** The latin alphabet, lower case letters a-z. */
  private static final char[] ALPHABET = { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
      'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z' };

  private static final String COMMA = ",";

  private final Map<String, List<double[]>> trainData;
  private final int validationSampleSize;
  private final int windowSize;
  private final int paaSize;
  private final int alphabetSize;
  private final SAXCollectionStrategy strategy;

  /**
   * Constructor.
   *
   * @param trainData The training data, class label to the list of series of that class.
   * @param validationSampleSize The maximal number of series held out at once.
   * @param windowSize The sliding window size.
   * @param paaSize The PAA size.
   * @param alphabetSize The SAX alphabet size.
   * @param strategy The SAX words collection strategy.
   */
  public UCRKNNloocvJob(Map<String, List<double[]>> trainData, int validationSampleSize,
      int windowSize, int paaSize, int alphabetSize, SAXCollectionStrategy strategy) {
    this.trainData = trainData;
    this.validationSampleSize = validationSampleSize;
    this.windowSize = windowSize;
    this.paaSize = paaSize;
    this.alphabetSize = alphabetSize;
    this.strategy = strategy;
  }

  /**
   * Runs the cross-validation.
   *
   * @return On success the string {@code "ok_"} followed by the comma-separated strategy name,
   * window size, PAA size, alphabet size, accuracy and error. If any exception is thrown while
   * running, its stack trace text is returned instead (and also printed to {@code System.err}).
   */
  @Override
  public String call() throws Exception {
    try {

      // parameters
      int[][] params = new int[1][4];
      params[0][0] = windowSize;
      params[0][1] = paaSize;
      params[0][2] = alphabetSize;
      params[0][3] = strategy.index();

      // push into stack all the samples we are going to validate for
      Stack<KNNOptimizedStackEntry> samples2go = new Stack<KNNOptimizedStackEntry>();
      for (Entry<String, List<double[]>> e : trainData.entrySet()) {
        String key = e.getKey();
        int index = 0;
        for (double[] sample : e.getValue()) {
          samples2go.push(new KNNOptimizedStackEntry(key, sample, index));
          index++;
        }
      }

      int totalSamples = samples2go.size();
      int misclassifiedSamples = 0;

      // cache of per-class word bags, reused between folds
      HashMap<String, WordBag> cache = new HashMap<String, WordBag>();

      // the key whose cached bag was built WITHOUT the samples held out in the previous fold;
      // that bag is incomplete and must not be reused once validation moves to another class
      String heldOutKey = null;

      while (!samples2go.isEmpty()) {

        // extracting validation samples
        List<KNNOptimizedStackEntry> currentValidationSample = new ArrayList<KNNOptimizedStackEntry>();
        Set<Integer> currentValidationIndexes = new TreeSet<Integer>();
        for (int i = 0; i < this.validationSampleSize; i++) {
          if (samples2go.isEmpty()) {
            break;
          }
          KNNOptimizedStackEntry sample = samples2go.pop();
          // this is to avoid getting into the validation sample an item of a different class
          if (i > 0
              && !sample.getKey().equalsIgnoreCase(currentValidationSample.get(i - 1).getKey())) {
            samples2go.push(sample);
            break;
          }
          currentValidationSample.add(sample);
          currentValidationIndexes.add(sample.getIndex());
        }

        // check if something is in the validation sample
        if (currentValidationSample.isEmpty()) {
          throw new RuntimeException("An empty validation sample in LOOCV job!");
        }
        String validationKey = currentValidationSample.get(0).getKey();

        // re-build bags if there is a need or re-use them from the cache
        for (Entry<String, List<double[]>> e : trainData.entrySet()) {
          String classKey = e.getKey();
          if (classKey.equalsIgnoreCase(validationKey)) {
            // the class under validation: its bag must exclude the held-out series
            cache.put(validationKey,
                buildClassBag(validationKey, e.getValue(), currentValidationIndexes, params));
          }
          else if (!cache.containsKey(classKey) || classKey.equals(heldOutKey)) {
            // either never built, or left over from the previous fold with series missing:
            // build the complete bag for this class
            cache.put(classKey,
                buildClassBag(classKey, e.getValue(), new TreeSet<Integer>(), params));
          }
        }
        heldOutKey = validationKey;

        // compute TFIDF statistics for the training set
        HashMap<String, HashMap<String, Double>> tfidf = computeTFIDF(cache.values());

        // classify held-out samples and count the misses
        for (KNNOptimizedStackEntry e : currentValidationSample) {
          String res = classify(e.getKey(), e.getValue(), tfidf, params, strategy);
          if (!"ok".equalsIgnoreCase(res)) {
            misclassifiedSamples++;
          }
        }
      }

      double error = (double) misclassifiedSamples / (double) totalSamples;

      return "ok_" + toLogStr(params, strategy, 1.0D - error, error);
    }
    catch (Exception e) {
      System.err.println("Exception caught: " + StackTrace.toString(e));
      return StackTrace.toString(e);
    }
  }

  /**
   * Builds a single word bag out of all the series of a class, skipping excluded ones.
   *
   * @param label The label of the resulting bag.
   * @param classSeries The series of the class, in their original order.
   * @param excludedIndexes Positions within classSeries that must not contribute to the bag.
   * @param params The discretization parameters.
   * @return The merged word bag.
   * @throws IndexOutOfBoundsException if error occurs.
   * @throws TSException if error occurs.
   */
  private WordBag buildClassBag(String label, List<double[]> classSeries,
      Set<Integer> excludedIndexes, int[][] params) throws IndexOutOfBoundsException, TSException {
    WordBag bag = new WordBag(label);
    int index = 0;
    for (double[] series : classSeries) {
      if (!excludedIndexes.contains(index)) {
        bag.mergeWith(seriesToWordBag("tmp", series, params, strategy));
      }
      index++;
    }
    return bag;
  }

  /**
   * Formats the run parameters and results as a comma-separated string.
   *
   * @param params The parameters; window, PAA and alphabet sizes are taken from the first row.
   * @param strategy The strategy; a name is emitted only for CLASSIC, EXACT and NOREDUCTION.
   * @param accuracy The accuracy value.
   * @param error The error value.
   * @return The formatted string.
   */
  private String toLogStr(int[][] params, SAXCollectionStrategy strategy, double accuracy,
      double error) {
    StringBuilder sb = new StringBuilder();
    if (strategy.equals(SAXCollectionStrategy.CLASSIC)) {
      sb.append("CLASSIC,");
    }
    else if (strategy.equals(SAXCollectionStrategy.EXACT)) {
      sb.append("EXACT,");
    }
    else if (strategy.equals(SAXCollectionStrategy.NOREDUCTION)) {
      sb.append("NOREDUCTION,");
    }
    sb.append(params[0][0]).append(COMMA);
    sb.append(params[0][1]).append(COMMA);
    sb.append(params[0][2]).append(COMMA);
    sb.append(accuracy).append(COMMA);
    sb.append(error);

    return sb.toString();
  }

  /**
   * Computes TF*IDF values.
   *
   * @param texts The collection of text documents for which the statistics need to be computed.
   * @return The map of source documents names to the word - tf*idf weight collections.
   */
  private HashMap<String, HashMap<String, Double>> computeTFIDF(Collection<WordBag> texts) {

    // the number of docs
    int totalDocs = texts.size();

    // the result. map of document names to the pairs word - tfidf weight
    HashMap<String, HashMap<String, Double>> res = new HashMap<String, HashMap<String, Double>>();

    // build a collection of all observed words and the number of documents each one occurs in
    HashMap<String, AtomicInteger> allWords = new HashMap<String, AtomicInteger>();
    for (WordBag bag : texts) {

      // here populate result map with empty entries
      res.put(bag.getLabel(), new HashMap<String, Double>());

      // and get those words
      for (Entry<String, AtomicInteger> e : bag.getInternalWords().entrySet()) {
        AtomicInteger docCount = allWords.get(e.getKey());
        if (docCount == null) {
          allWords.put(e.getKey(), new AtomicInteger(1));
        }
        else {
          docCount.incrementAndGet();
        }
      }
    }

    // outer loop - iterating over documents
    for (WordBag bag : texts) {

      String bagName = bag.getLabel();
      HashMap<String, AtomicInteger> bagWords = bag.getInternalWords();
      HashMap<String, Double> bagWeights = res.get(bagName);

      // TF*IDF is computed for all WORDS ever seen in the set
      for (Entry<String, AtomicInteger> word : allWords.entrySet()) {

        // by default it is zero; it also stays zero for words seen in every document
        double tfidf = 0;

        if (bagWords.containsKey(word.getKey()) && totalDocs != word.getValue().intValue()) {
          double tfValue = logAveTF(bag, word.getKey());
          double idfLOGValue = Math.log10((double) totalDocs / word.getValue().doubleValue());
          tfidf = tfValue * idfLOGValue;
        }

        bagWeights.put(word.getKey(), tfidf);
      }
    }
    return res;
  }

  /**
   * Computes a cosine normalization of TFIDF statistics.
   *
   * @param data The data.
   * @return The normalized tfidf statistics.
   */
  private HashMap<String, HashMap<String, Double>> normalizeToUnitVectors(
      HashMap<String, HashMap<String, Double>> data) {

    HashMap<String, HashMap<String, Double>> res = new HashMap<String, HashMap<String, Double>>();

    // cosine normalize these rows corresponding to docs TFIDF
    for (Entry<String, HashMap<String, Double>> e : data.entrySet()) {

      double sum = 0D;
      for (double el : e.getValue().values()) {
        if (!(0. == el)) {
          sum = sum + el * el;
        }
      }
      // the normalization coefficient - all the elements must be divided by its value
      double sqRoot = Math.sqrt(sum);

      HashMap<String, Double> newEntry = new HashMap<String, Double>(e.getValue().size());
      for (Entry<String, Double> val : e.getValue().entrySet()) {
        if (val.getValue().equals(0D)) {
          newEntry.put(val.getKey(), 0D);
        }
        else {
          newEntry.put(val.getKey(), val.getValue() / sqRoot);
        }
      }
      res.put(e.getKey(), newEntry);
    }
    return res;
  }

  /**
   * Classifies a series by the largest cosine similarity between its word bag and the class
   * weight vectors.
   *
   * @param classKey The true class label of the series.
   * @param series The series to classify.
   * @param tfidf The class label to weight vector map.
   * @param params The discretization parameters.
   * @param strategy The SAX words collection strategy.
   * @return The string "ok" if the best matching class equals classKey (ignoring case) and the
   * similarities are not all equal; otherwise the label of the best matching class, which is an
   * empty string if no similarity exceeded -1.
   * @throws IndexOutOfBoundsException if error occurs.
   * @throws TSException if error occurs.
   */
  private String classify(String classKey, double[] series,
      HashMap<String, HashMap<String, Double>> tfidf, int[][] params,
      SAXCollectionStrategy strategy) throws IndexOutOfBoundsException, TSException {

    WordBag test = seriesToWordBag("test", series, params, strategy);

    double minDist = -1.0d;
    String className = "";
    double[] cosines = new double[tfidf.entrySet().size()];

    int index = 0;
    for (Entry<String, HashMap<String, Double>> e : tfidf.entrySet()) {
      double dist = cosineSimilarity(test, e.getValue());
      cosines[index] = dist;
      index++;
      if (dist > minDist) {
        className = e.getKey();
        minDist = dist;
      }
    }

    // a tie between all classes carries no information and is not counted as a hit
    boolean allEqual = true;
    double cosine = cosines[0];
    for (int i = 1; i < cosines.length; i++) {
      if (!(cosines[i] == cosine)) {
        allEqual = false;
      }
    }

    if (!(allEqual) && className.equalsIgnoreCase(classKey)) {
      return "ok";
    }
    return className;
  }

  /**
   * Converts a series into a bag of SAX words using a sliding window.
   *
   * @param label The label for the resulting bag.
   * @param series The series to convert.
   * @param params Rows of {window size, PAA size, alphabet size, ...}; words produced for all
   * rows go into the same bag.
   * @param strategy The collection strategy: for CLASSIC a word is skipped when its
   * SAXFactory.strDistance to the previously kept word is zero, for EXACT when it equals the
   * previously kept word ignoring case; for other strategies every word is kept.
   * @return The bag of words.
   * @throws IndexOutOfBoundsException if error occurs.
   * @throws TSException if error occurs.
   */
  private WordBag seriesToWordBag(String label, double[] series, int[][] params,
      SAXCollectionStrategy strategy) throws IndexOutOfBoundsException, TSException {

    Alphabet a = new NormalAlphabet();
    WordBag resultBag = new WordBag(label);

    for (int[] p : params) {

      int windowSize = p[0];
      int paaSize = p[1];
      int alphabetSize = p[2];

      String oldStr = "";
      for (int i = 0; i <= series.length - windowSize; i++) {

        double[] paa = paa(zNormalize(subseries(series, i, windowSize)), paaSize);
        char[] sax = ts2String(paa, a.getCuts(alphabetSize));

        if (SAXCollectionStrategy.CLASSIC.equals(strategy)) {
          if (oldStr.length() > 0 && SAXFactory.strDistance(sax, oldStr.toCharArray()) == 0) {
            continue;
          }
        }
        else if (SAXCollectionStrategy.EXACT.equals(strategy)) {
          if (oldStr.equalsIgnoreCase(String.valueOf(sax))) {
            continue;
          }
        }

        oldStr = String.valueOf(sax);
        resultBag.addWord(oldStr);
      }
    }
    return resultBag;
  }

  /**
   * Computes the cosine similarity between a word bag and a weight vector.
   *
   * @param testSample The word bag.
   * @param weightVector The word to weight map.
   * @return The dot product over shared words divided by the product of both magnitudes; the
   * result is NaN when the dot product and one of the magnitudes are both zero.
   */
  private double cosineSimilarity(WordBag testSample, HashMap<String, Double> weightVector) {
    double res = 0;
    for (Entry<String, Integer> entry : testSample.getWords().entrySet()) {
      Double weight = weightVector.get(entry.getKey());
      if (weightVector.containsKey(entry.getKey())) {
        res = res + entry.getValue().doubleValue() * weight.doubleValue();
      }
    }
    double m1 = magnitude(testSample.getWordsAsDoubles().values());
    double m2 = magnitude(weightVector.values());
    return res / (m1 * m2);
  }

  /**
   * Computes the Euclidean norm of a collection of values.
   *
   * @param values The values.
   * @return The square root of the sum of squares.
   */
  private double magnitude(Collection<Double> values) {
    double res = 0.0D;
    for (Double v : values) {
      res = res + v * v;
    }
    return Math.sqrt(res);
  }

  /**
   * Approximate the timeseries using PAA.
   *
   * @param ts The timeseries to approximate.
   * @param paaSize The desired length of approximated timeseries.
   * @return PAA-approximated timeseries.
   * @throws TSException if error occurs.
   */
  private double[] paa(double[] ts, int paaSize) throws TSException {
    int len = ts.length;
    // the trivial case - nothing to reduce
    if (len == paaSize) {
      return Arrays.copyOf(ts, ts.length);
    }
    if (len % paaSize == 0) {
      return MatrixFactory.colMeans(MatrixFactory.reshape(asMatrix(ts), len / paaSize, paaSize));
    }
    // the length is not a multiple of paaSize: every point is counted paaSize times and the
    // resulting len * paaSize values are split evenly, len values per segment
    double[] paa = new double[paaSize];
    for (int i = 0; i < len * paaSize; i++) {
      int idx = i / len; // the segment
      int pos = i / paaSize; // the source point
      paa[idx] = paa[idx] + ts[pos];
    }
    for (int i = 0; i < paaSize; i++) {
      paa[i] = paa[i] / (double) len;
    }
    return paa;
  }

  /**
   * Mimics Matlab function for reshape: returns the n-by-m matrix whose elements are taken
   * column-wise from a.
   *
   * @param a the source matrix.
   * @param n number of rows in the new matrix.
   * @param m number of columns in the new matrix.
   *
   * @return reshaped matrix.
   */
  private double[][] reshape(double[][] a, int n, int m) {
    int cEl = 0;
    int aRows = a.length;
    double[][] res = new double[n][m];
    for (int j = 0; j < m; j++) {
      for (int i = 0; i < n; i++) {
        res[i][j] = a[cEl % aRows][cEl / aRows];
        cEl++;
      }
    }
    return res;
  }

  /**
   * Computes column means for the matrix.
   *
   * @param a the input matrix.
   * @return result.
   */
  private double[] colMeans(double[][] a) {
    double[] res = new double[a[0].length];
    for (int j = 0; j < a[0].length; j++) {
      double sum = 0;
      for (int i = 0; i < a.length; i++) {
        sum += a[i][j];
      }
      res[j] = sum / ((double) a.length);
    }
    return res;
  }

  /**
   * Converts the vector into one-row matrix.
   *
   * @param vector The vector.
   * @return The matrix.
   */
  private double[][] asMatrix(double[] vector) {
    double[][] res = new double[1][vector.length];
    System.arraycopy(vector, 0, res[0], 0, vector.length);
    return res;
  }

  /**
   * Z-Normalize timeseries to the mean zero and standard deviation of one.
   *
   * @param series The timeseries.
   * @return Z-normalized time-series.
   * @throws TSException if error occurs.
   */
  private double[] zNormalize(double[] series) throws TSException {

    double[] res = new double[series.length];

    // get mean and sdev, NaN's and infinities are skipped by both
    double mean = mean(series);
    double sd = stDev(series);

    if (sd <= 0.001D) {
      // special case: SD is (close to) zero, i.e. the series is flat. Dividing by it would only
      // amplify noise, so every finite point gets the same fixed value of 0.1 while NaN's and
      // infinities are passed through.
      for (int i = 0; i < res.length; i++) {
        if (Double.isInfinite(series[i]) || Double.isNaN(series[i])) {
          res[i] = series[i];
        }
        else {
          res[i] = 0.1D;
        }
      }
    }
    else {
      // normal case
      for (int i = 0; i < res.length; i++) {
        res[i] = (series[i] - mean) / sd;
      }
    }
    return res;
  }

  /**
   * Computes the mean value of timeseries.
   *
   * @param series The timeseries.
   * @return The mean value of all finite points, NaN if there are none.
   */
  private double mean(double[] series) {
    double res = 0D;
    int count = 0;
    for (double tp : series) {
      if (Double.isNaN(tp) || Double.isInfinite(tp)) {
        continue;
      }
      res += tp;
      count += 1;
    }
    if (count > 0) {
      return res / (double) count;
    }
    return Double.NaN;
  }

  /**
   * Computes the standard deviation of timeseries.
   *
   * @param series The timeseries.
   * @return the sample standard deviation of all finite points, NaN if there are less than two.
   */
  private double stDev(double[] series) {
    double num0 = 0D;
    double sum = 0D;
    int count = 0;
    for (double tp : series) {
      if (Double.isNaN(tp) || Double.isInfinite(tp)) {
        continue;
      }
      num0 = num0 + tp * tp;
      sum = sum + tp;
      count += 1;
    }
    if (count > 0) {
      double len = (double) count;
      double variance = (len * num0 - sum * sum) / (len * (len - 1));
      // for (almost) constant data the subtraction above may come out slightly negative because
      // of rounding; clamp it so the square root yields zero instead of NaN. A NaN variance
      // (single point, 0/0) is left as is.
      if (variance < 0D) {
        variance = 0D;
      }
      return Math.sqrt(variance);
    }
    return Double.NaN;
  }

  /**
   * Extract subseries out of series.
   *
   * @param series The series array.
   * @param start Start position
   * @param length Length of subseries to extract.
   * @return The subseries.
   * @throws IndexOutOfBoundsException If the requested range runs past the end of the series.
   */
  private double[] subseries(double[] series, int start, int length)
      throws IndexOutOfBoundsException {
    if (start + length > series.length) {
      throw new IndexOutOfBoundsException("Unable to extract subseries, series length: "
          + series.length + ", start: " + start + ", subseries length: " + length);
    }
    double[] res = new double[length];
    System.arraycopy(series, start, res, 0, length);
    return res;
  }

  /**
   * Converts the timeseries into string using given cuts intervals. Useful for not-normal
   * distribution cuts.
   *
   * @param vals The timeseries.
   * @param cuts The cut intervals.
   * @return The timeseries SAX representation.
   */
  private char[] ts2String(double[] vals, double[] cuts) {
    char[] res = new char[vals.length];
    for (int i = 0; i < vals.length; i++) {
      res[i] = num2char(vals[i], cuts);
    }
    return res;
  }

  /**
   * Get mapping of a number to char.
   *
   * @param value the value to map.
   * @param cuts the array of intervals.
   * @return character corresponding to numeric value: the letter whose position equals the
   * number of leading cuts that are less than or equal to the value.
   */
  private char num2char(double value, double[] cuts) {
    int count = 0;
    while ((count < cuts.length) && (cuts[count] <= value)) {
      count++;
    }
    return ALPHABET[count];
  }

  /**
   * Compute TF (term frequency) metrics. This is the term frequency divided by the maximal
   * frequency reported by the bag.
   *
   * @param bag The words bag.
   * @param term The term.
   * @return The term frequency value, zero if the bag does not contain the term.
   */
  private double normalizedTF(WordBag bag, String term) {
    if (bag.contains(term)) {
      return (double) bag.getWordFrequency(term) / (double) bag.getMaxFrequency();
    }
    return 0;
  }

  /**
   * Compute TF (term frequency) metrics. This is the augmented variant: 0.5 plus the term
   * frequency divided by twice the maximal frequency reported by the bag.
   *
   * @param bag The words bag.
   * @param term The term.
   * @return The term frequency value, zero if the bag does not contain the term.
   */
  private double augmentedTF(WordBag bag, String term) {
    if (bag.contains(term)) {
      return 0.5D + ((double) bag.getWordFrequency(term))
          / (2.0D * (double) bag.getMaxFrequency());
    }
    return 0;
  }

  /**
   * Compute TF (term frequency) metrics. This is the log-average variant: (1 + ln(term
   * frequency)) divided by (1 + ln(average frequency reported by the bag)).
   *
   * @param bag The words bag.
   * @param term The term.
   * @return The term frequency value, zero if the bag does not contain the term.
   */
  private double logAveTF(WordBag bag, String term) {
    if (bag.contains(term)) {
      return (1D + Math.log((double) bag.getWordFrequency(term)))
          / (1D + Math.log(bag.getAverageFrequency()));
    }
    return 0;
  }
}