package edu.hawaii.jmotif.timeseries; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.text.NumberFormat; import java.util.Arrays; import edu.hawaii.jmotif.algorithm.MatrixFactory; import edu.hawaii.jmotif.sax.alphabet.Alphabet; /** * Implements algorithms for low-level data manipulation. * * @author Pavel Senin * */ public final class TSUtils { /** The latin alphabet, lower case letters a-z. */ static final char[] ALPHABET = { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z' }; /** * Constructor. */ private TSUtils() { super(); } /** * Reads timeseries from a file. Assumes that file has a single double value on every line. * Assigned timestamps are the line numbers. * * @param filename The file to read from. * @param sizeLimit The number of lines to read. * @return Timeseries. * @throws NumberFormatException if error occurs. * @throws IOException if error occurs. * @throws TSException if error occurs. */ public static Timeseries readTS(String filename, int sizeLimit) throws NumberFormatException, IOException, TSException { BufferedReader br = new BufferedReader(new FileReader(new File(filename))); String line = null; double[] values = new double[sizeLimit]; long[] tstamps = new long[sizeLimit]; int i = 0; while ((line = br.readLine()) != null) { values[i] = Double.valueOf(line); tstamps[i] = (long) i; i++; } br.close(); return new Timeseries(values, tstamps); } /** * Finds the maximal value in timeseries. * * @param series The timeseries. * @return The max value. */ public static double max(Timeseries series) { if (countNaN(series) == series.size()) { return Double.NaN; } double[] values = series.values(); double max = Double.MIN_VALUE; for (int i = 0; i < values.length; i++) { if (max < values[i]) { max = values[i]; } } return max; } /** * Finds the maximal value in timeseries. * * @param series The timeseries. * @return The max value. */ public static double max(double[] series) { if (countNaN(series) == series.length) { return Double.NaN; } double max = Double.MIN_VALUE; for (int i = 0; i < series.length; i++) { if (max < series[i]) { max = series[i]; } } return max; } /** * Finds the minimal value in timeseries. * * @param series The timeseries. * @return The min value. */ public static double min(Timeseries series) { if (countNaN(series) == series.size()) { return Double.NaN; } double[] values = series.values(); double min = Double.MAX_VALUE; for (int i = 0; i < values.length; i++) { if (min > values[i]) { min = values[i]; } } return min; } /** * Finds the minimal value in timeseries. * * @param series The timeseries. * @return The min value. */ public static double min(double[] series) { if (countNaN(series) == series.length) { return Double.NaN; } double min = Double.MAX_VALUE; for (int i = 0; i < series.length; i++) { if (min > series[i]) { min = series[i]; } } return min; } /** * Computes the mean value of timeseries. * * @param series The timeseries. * @return The mean value. */ public static double mean(Timeseries series) { double res = 0D; int count = 0; for (TPoint tp : series) { if (Double.isNaN(tp.value()) || Double.isInfinite(tp.value())) { continue; } else { res += tp.value(); count += 1; } } if (count > 0) { return res / ((Integer) count).doubleValue(); } return Double.NaN; } /** * Computes the mean value of timeseries. * * @param series The timeseries. * @return The mean value. */ public static double mean(double[] series) { double res = 0D; int count = 0; for (double tp : series) { if (Double.isNaN(tp) || Double.isInfinite(tp)) { continue; } else { res += tp; count += 1; } } if (count > 0) { return res / ((Integer) count).doubleValue(); } return Double.NaN; } /** * Compute the variance of timeseries. * * @param series The timeseries. * @return The variance. */ public static double var(Timeseries series) { double res = 0D; double mean = mean(series); if (Double.isNaN(mean) || Double.isInfinite(mean)) { return Double.NaN; } int count = 0; for (TPoint tp : series) { if (Double.isNaN(tp.value()) || Double.isInfinite(tp.value())) { continue; } else { res += (tp.value() - mean) * (tp.value() - mean); count += 1; } } if (count > 0) { return res / ((Integer) (count - 1)).doubleValue(); } return Double.NaN; } /** * Compute the variance of timeseries. * * @param series The timeseries. * @return The variance. */ public static double var(double[] series) { double res = 0D; double mean = mean(series); if (Double.isNaN(mean) || Double.isInfinite(mean)) { return Double.NaN; } int count = 0; for (double tp : series) { if (Double.isNaN(tp) || Double.isInfinite(tp)) { continue; } else { res += (tp - mean) * (tp - mean); count += 1; } } if (count > 0) { return res / ((Integer) (count - 1)).doubleValue(); } return Double.NaN; } /** * Computes the standard deviation of timeseries. * * @param series The timeseries. * @return the standard deviation. */ public static double stDev(Timeseries series) { double num0 = 0D; double sum = 0D; int count = 0; for (TPoint tp : series) { if (Double.isNaN(tp.value()) || Double.isInfinite(tp.value())) { continue; } else { num0 = num0 + tp.value() * tp.value(); sum = sum + tp.value(); count += 1; } } if (count > 0) { double len = ((Integer) count).doubleValue(); return Math.sqrt((len * num0 - sum * sum) / (len * (len - 1))); } return Double.NaN; } /** * Computes the standard deviation of timeseries. * * @param series The timeseries. * @return the standard deviation. */ public static double stDev(double[] series) { double num0 = 0D; double sum = 0D; int count = 0; for (double tp : series) { if (Double.isNaN(tp) || Double.isInfinite(tp)) { continue; } else { num0 = num0 + tp * tp; sum = sum + tp; count += 1; } } if (count > 0) { double len = ((Integer) count).doubleValue(); return Math.sqrt((len * num0 - sum * sum) / (len * (len - 1))); } return Double.NaN; } /** * Z-Normalize timeseries to the mean zero and standard deviation of one. * * @param series The timeseries. * @return Z-normalized time-series. * @throws TSException if error occurs. * @throws CloneNotSupportedException */ public static Timeseries zNormalize(Timeseries series) throws TSException, CloneNotSupportedException { // resulting series // Timeseries res = series.clone(); // here I will extract doubles and normailize those, joining together later // double seriesValues[] = new double[series.size()]; int idx = 0; for (TPoint p : series) { seriesValues[idx] = p.value(); idx++; } double[] normalValues = zNormalize(seriesValues); // get things together // idx = 0; for (TPoint p : res) { p.setValue(normalValues[idx]); idx++; } return res; } /** * Z-Normalize timeseries to the mean zero and standard deviation of one. * * @param series The timeseries. * @return Z-normalized time-series. * @throws TSException if error occurs. */ public static double[] zNormalize(double[] series) throws TSException { // this is the resulting normalization // double[] res = new double[series.length]; // get mean and sdev, NaN's will be handled // double mean = mean(series); double sd = stDev(series); // check if we hit special case, where something got NaN // if (Double.isInfinite(mean) || Double.isNaN(mean) || Double.isInfinite(sd) || Double.isNaN(sd)) { // case[1] single value within the timeseries, normalize this value to 1.0 - magic number // int nanNum = countNaN(series); if ((series.length - nanNum) == 1) { for (int i = 0; i < res.length; i++) { if (Double.isInfinite(series[i]) || Double.isNaN(series[i])) { res[i] = Double.NaN; } else { res[i] = 1.0D; } } } // case[2] all values are NaN's // else if (series.length == nanNum) { for (int i = 0; i < res.length; i++) { res[i] = Double.NaN; } } } // another special case, where SD happens to be close to a zero, i.e. they all are the same for // example // else if (sd <= 0.001D) { // here I assign another magic value - 0.001D which makes to middle band of the normal // Alphabet // for (int i = 0; i < res.length; i++) { if (Double.isInfinite(series[i]) || Double.isNaN(series[i])) { res[i] = series[i]; } else { res[i] = 0.1D; } } } // normal case, everything seems to be fine // else { // sd and mean here, - go-go-go for (int i = 0; i < res.length; i++) { res[i] = (series[i] - mean) / sd; } } return res; } /** * Z-Normalize timeseries to the mean zero and standard deviation of one. * * @param series The timeseries. * @param mean The mean values. * @param sd The standard deviation. * @return Z-normalized time-series. * @throws TSException if error occurs. */ public static double[] zNormalize(double[] series, double mean, double sd) throws TSException { // this is the resulting normalization // double[] res = new double[series.length]; // check if we hit special case, where something got NaN // if (Double.isInfinite(mean) || Double.isNaN(mean) || Double.isInfinite(sd) || Double.isNaN(sd)) { // case[1] single value within the timeseries, normalize this value to 1.0 - magic number // int nanNum = countNaN(series); if ((series.length - nanNum) == 1) { for (int i = 0; i < res.length; i++) { if (Double.isInfinite(series[i]) || Double.isNaN(series[i])) { res[i] = Double.NaN; } else { res[i] = 1.0D; } } } // case[2] all values are NaN's // else if (series.length == nanNum) { for (int i = 0; i < res.length; i++) { res[i] = Double.NaN; } } } // another special case, where SD happens to be close to a zero, i.e. they all are the same for // example // else if (sd <= 0.001D) { // here I assign another magic value - 0.001D which makes to middle band of the normal // Alphabet // for (int i = 0; i < res.length; i++) { if (Double.isInfinite(series[i]) || Double.isNaN(series[i])) { res[i] = series[i]; } else { res[i] = 0.1D; } } } // normal case, everything seems to be fine // else { // sd and mean here, - go-go-go for (int i = 0; i < res.length; i++) { res[i] = (series[i] - mean) / sd; } } return res; } /** * Approximate the timeseries using PAA. If the timeseries has some NaN's they are handled as * follows: 1) if all values of the piece are NaNs - the piece is approximated as NaN, 2) if there * are some (more or equal one) values happened to be in the piece - algorithm will handle it as * usual - getting the mean. * * @param ts The timeseries to approximate. * @param paaSize The desired length of approximated timeseries. * @return PAA-approximated timeseries. * @throws TSException if error occurs. * @throws CloneNotSupportedException if error occurs. */ public static Timeseries paa(Timeseries ts, int paaSize) throws TSException, CloneNotSupportedException { // fix the length int len = ts.size(); // check for the trivial case if (len == paaSize) { return ts.clone(); } else { // get values and timestamps double[][] vals = ts.valuesAsMatrix(); long[] tStamps = ts.tstamps(); // work out PAA by reshaping arrays double[][] res; if (len % paaSize == 0) { res = MatrixFactory.reshape(vals, len / paaSize, paaSize); } else { double[][] tmp = new double[paaSize][len]; // System.out.println(Matrix.toString(tmp)); for (int i = 0; i < paaSize; i++) { for (int j = 0; j < len; j++) { tmp[i][j] = vals[0][j]; } } // System.out.println(Matrix.toString(tmp)); double[][] expandedSS = MatrixFactory.reshape(tmp, 1, len * paaSize); // System.out.println(Matrix.toString(expandedSS)); res = MatrixFactory.reshape(expandedSS, len, paaSize); // System.out.println(Matrix.toString(res)); } // // now, here is a new trick comes in game - because we have so many // "lost" values // PAA game rules will change - we will omit NaN values and put NaNs // back to PAA series // // // this is the old line of code here: // double[] newVals = MatrixFactory.colMeans(res); // // i will need to test this though // // double[] newVals = MatrixFactory.colMeans(res); // work out timestamps long start = tStamps[0]; long interval = tStamps[len - 1] - start; double increment = Long.valueOf(interval).doubleValue() / Long.valueOf(paaSize).doubleValue(); long[] newTstamps = new long[paaSize]; for (int i = 0; i < paaSize; i++) { newTstamps[i] = start + Double.valueOf(increment / 2.0D + i * increment).longValue(); } return new Timeseries(newVals, newTstamps); } } /** * Approximate the timeseries using PAA. If the timeseries has some NaN's they are handled as * follows: 1) if all values of the piece are NaNs - the piece is approximated as NaN, 2) if there * are some (more or equal one) values happened to be in the piece - algorithm will handle it as * usual - getting the mean. * * @param ts The timeseries to approximate. * @param paaSize The desired length of approximated timeseries. * @return PAA-approximated timeseries. * @throws TSException if error occurs. */ public static double[] paa(double[] ts, int paaSize) throws TSException { // fix the length int len = ts.length; // check for the trivial case if (len == paaSize) { return Arrays.copyOf(ts, ts.length); } else { // get values and timestamps double[][] vals = asMatrix(ts); // work out PAA by reshaping arrays double[][] res; if (len % paaSize == 0) { res = MatrixFactory.reshape(vals, len / paaSize, paaSize); } else { double[][] tmp = new double[paaSize][len]; for (int i = 0; i < paaSize; i++) { for (int j = 0; j < len; j++) { tmp[i][j] = vals[0][j]; } } double[][] expandedSS = MatrixFactory.reshape(tmp, 1, len * paaSize); res = MatrixFactory.reshape(expandedSS, len, paaSize); } double[] newVals = MatrixFactory.colMeans(res); return newVals; } } /** * Approximate the timeseries using PAA. If the timeseries has some NaN's they are handled as * follows: 1) if all values of the piece are NaNs - the piece is approximated as NaN, 2) if there * are some (more or equal one) values happened to be in the piece - algorithm will handle it as * usual - getting the mean. * * @param ts The timeseries to approximate. * @param paaSize The desired length of approximated timeseries. * @return PAA-approximated timeseries. * @throws TSException if error occurs. */ public static double[] optimizedPaa(double[] ts, int paaSize) throws TSException { // fix the length int len = ts.length; // check for the trivial case if (len == paaSize) { return Arrays.copyOf(ts, ts.length); } else { if (len % paaSize == 0) { return MatrixFactory.colMeans(MatrixFactory.reshape(asMatrix(ts), len / paaSize, paaSize)); } else { // res = new double[len][paaSize]; // for (int j = 0; j < len; j++) { // for (int i = 0; i < paaSize; i++) { // int idx = j * paaSize + i; // int row = idx % len; // int col = idx / len; // res[row][col] = ts[j]; // } // } double[] paa = new double[paaSize]; for (int i = 0; i < len * paaSize; i++) { int idx = i / len; // the spot int pos = i / paaSize; // the col spot paa[idx] = paa[idx] + ts[pos]; } for (int i = 0; i < paaSize; i++) { paa[i] = paa[i] / (double) len; } return paa; } } } public static double[] paaZeroNA(double[] ts, int paaSize) { for (int i = 0; i < ts.length; i++) { if (ts[i] == 0) { ts[i] = Double.NaN; } } // fix the length int len = ts.length; // check for the trivial case if (len == paaSize) { return Arrays.copyOf(ts, ts.length); } else { if (len % paaSize == 0) { return MatrixFactory.colMeans(MatrixFactory.reshape(asMatrix(ts), len / paaSize, paaSize)); } else { // res = new double[len][paaSize]; // for (int j = 0; j < len; j++) { // for (int i = 0; i < paaSize; i++) { // int idx = j * paaSize + i; // int row = idx % len; // int col = idx / len; // res[row][col] = ts[j]; // } // } double[] paa = new double[paaSize]; short[] paaLen = new short[paaSize]; for (int i = 0; i < len * paaSize; i++) { int idx = i / len; // the spot int pos = i / paaSize; // the col spot if (0.0D == ts[pos]) { paa[idx] = paa[idx]; } else { paa[idx] = paa[idx] + ts[pos]; paaLen[idx]++; } } for (int i = 0; i < paaSize; i++) { if (paa[i] == 0 && paaLen[i] == 0) { paa[i] = Double.NaN; } paa[i] = paa[i] / (double) len; } return paa; } } } /** * Converts a timeseries into the string using alphabet cuts. * * @param series The timeseries to convert. * @param alphabet The alphabet to use. * @param alphabetSize The alphabet size. * @return Symbolic (SAX) representation of timeseries. * @throws TSException if error occurs. */ public static char[] ts2String(Timeseries series, Alphabet alphabet, int alphabetSize) throws TSException { double[] cuts = alphabet.getCuts(alphabetSize); char[] res = new char[series.size()]; for (int i = 0; i < series.size(); i++) { res[i] = num2char(series.elementAt(i).value(), cuts); } return res; } /** * Converts the timeseries into string using given cuts intervals. Useful for not-normal * distribution cuts. * * @param vals The timeseries. * @param cuts The cut intervals. * @return The timeseries SAX representation. */ public static char[] ts2String(double[] vals, double[] cuts) { char[] res = new char[vals.length]; for (int i = 0; i < vals.length; i++) { res[i] = num2char(vals[i], cuts); } return res; } /** * Converts a timeseries into the string paying attention to NaN values. * * @param series The timeseries to convert. * @param alphabet The alphabet to use. * @param alphabetSize The alphabet size in use. * @return SAX representation of timeseries. * @throws TSException if error occurs. */ public static char[] ts2StringWithNaN(Timeseries series, Alphabet alphabet, int alphabetSize) throws TSException { double[] cuts = alphabet.getCuts(alphabetSize); return ts2StringWithNaNByCuts(series, cuts); } /** * Converts a timeseries into the string paying attention to NaN values. * * @param series The timeseries to convert. * @param cuts The cuts for alphabet. * @return SAX representation of timeseries. * @throws TSException if error occurs. */ public static char[] ts2StringWithNaNByCuts(Timeseries series, double[] cuts) throws TSException { char[] res = new char[series.size()]; for (int i = 0; i < series.size(); i++) { if (Double.isNaN(series.elementAt(i).value()) || Double.isInfinite(series.elementAt(i).value())) { res[i] = '_'; } else { res[i] = num2char(series.elementAt(i).value(), cuts); } } return res; } /** * Get mapping of a number to char. * * @param value the value to map. * @param cuts the array of intervals. * @return character corresponding to numeric value. */ public static char num2char(double value, double[] cuts) { int count = 0; while ((count < cuts.length) && (cuts[count] <= value)) { count++; } return ALPHABET[count]; } /** * Converts index into char. * * @param idx The index value. * @return The char by index. */ public static char num2char(int idx) { return ALPHABET[idx]; } /** * Convert the timeseries into the index using SAX cuts. * * @param series The timeseries to convert. * @param alphabet The alphabet to use. * @param alphabetSize The alphabet size in use. * @return SAX representation of timeseries. * @throws TSException if error occurs. */ public static int[] ts2Index(Timeseries series, Alphabet alphabet, int alphabetSize) throws TSException { double[] cuts = alphabet.getCuts(alphabetSize); int[] res = new int[series.size()]; for (int i = 0; i < series.size(); i++) { res[i] = num2index(series.elementAt(i).value(), cuts); } return res; } /** * Get mapping of number to cut index. * * @param value the value to map. * @param cuts the array of intervals. * @return character corresponding to numeric value. */ public static int num2index(double value, double[] cuts) { int count = 0; while ((count < cuts.length) && (cuts[count] <= value)) { count++; } return count; } /** * Counts the number of NaNs' in the timeseries. * * @param series The timeseries. * @return The count of NaN values. */ public static int countNaN(double[] series) { int res = 0; for (double d : series) { if (Double.isInfinite(d) || Double.isNaN(d)) { res += 1; } } return res; } /** * Counts the number of NaNs' in the timeseries. * * @param series The timeseries. * @return The count of NaN values. */ private static int countNaN(Timeseries series) { int res = 0; for (TPoint tp : series) { if (Double.isInfinite(tp.value()) || Double.isNaN(tp.value())) { res += 1; } } return res; } /** * Converts the vector into one-row matrix. * * @param vector The vector. * @return The matrix. */ public static double[][] asMatrix(double[] vector) { double[][] res = new double[1][vector.length]; for (int i = 0; i < vector.length; i++) { res[0][i] = vector[i]; } return res; } /** * Extract subseries out of series. * * @param series The series array. * @param start Start position * @param length Length of subseries to extract. * @return The subseries. * @throws IndexOutOfBoundsException If error occurs. */ public static double[] subseries(double[] series, int start, int length) throws IndexOutOfBoundsException { if (start + length > series.length) { throw new IndexOutOfBoundsException("Unable to extract subseries, series length: " + series.length + ", start: " + start + ", subseries length: " + length); } double[] res = new double[length]; for (int i = 0; i < length; i++) { res[i] = series[start + i]; } return res; } /** * Extract subseries out of series. * * @param series The series array. * @param start Start position * @param length Length of subseries to extract. * @return The subseries. * @throws IndexOutOfBoundsException If error occurs. */ public static double[] subseries(Double[] series, int start, int length) throws IndexOutOfBoundsException { if (start + length > series.length) { throw new IndexOutOfBoundsException("Unable to extract subseries, series length: " + series.length + ", start: " + start + ", subseries length: " + length); } double[] res = new double[length]; for (int i = 0; i < length; i++) { res[i] = series[start + i]; } return res; } /** * Implements Gaussian smoothing. * * @param series Data to process. * @param filterWidth The filter width. * @return smoothed series. * @throws TSException if error occurs. */ public static double[] gaussFilter(double[] series, double filterWidth) throws TSException { double[] smoothedSignal = new double[series.length]; double sigma = filterWidth / 2D; int maxShift = (int) Math.floor(4D * sigma); // Gaussian curve is reasonably > 0 if (maxShift < 1) { throw new TSException("NOT smoothing: filter width too small - " + filterWidth); } for (int i = 0; i < smoothedSignal.length; i++) { smoothedSignal[i] = series[i]; if (maxShift < 1) { continue; } for (int j = 1; j <= maxShift; j++) { double gaussFilter = Math.exp(-(j * j) / (2. * sigma * sigma)); double leftAmpl, rightAmpl; // go left if ((i - j) >= 0) { leftAmpl = series[i - j]; } else { leftAmpl = series[i]; } // go right if ((i + j) <= smoothedSignal.length - 1) { rightAmpl = series[i + j]; } else { rightAmpl = series[i]; } smoothedSignal[i] += gaussFilter * (leftAmpl + rightAmpl); } double normalizingCoef = Math.sqrt(2. * Math.PI) * sigma; smoothedSignal[i] /= normalizingCoef; } return smoothedSignal; } public double gaussian(double x, double filterWidth) { double sigma = filterWidth / 2.; return Math.exp(-(x * x) / (2. * sigma * sigma)); } public static String seriesToString(double[] series, NumberFormat df) { StringBuffer sb = new StringBuffer(); sb.append('['); for (double d : series) { sb.append(df.format(d)).append(','); } sb.delete(sb.length() - 2, sb.length() - 1).append("]"); return sb.toString(); } }