TSProcessor.java example

Explorer

SAX-master
- src
  - main
    - java
      - net
        seninp
        jmotif
        cbf
        CBFGenerator.java
        distance
        EuclideanDistance.java
        sax
        NumerosityReductionStrategy.java
        SAXCLIConverter.java
        SAXCLIParameters.java
        SAXException.java
        SAXProcessor.java
        TSProcessor.java
        alphabet
        Alphabet.java
        NormalAlphabet.java
        bitmap
        BitmapParameters.java
        Shingles.java
        TSBitmapPrinter.java
        UCRdataBitmapPrinter.java
        datastructure
        DoublyLinkedSortedList.java
        FrequencyTableEntry.java
        SAXRecord.java
        SAXRecords.java
        discord
        BruteForceDiscordImplementation.java
        DiscordRecord.java
        DiscordRecords.java
        HOTSAXImplementation.java
        parallel
        ParallelSAXImplementation.java
        SAXWorker.java
        registry
        LargeWindowAlgorithm.java
        MagicArrayEntry.java
        SlidingWindowMarkerAlgorithm.java
        VisitRegistry.java
        tinker
        DiscordDiscoveryPerformanceDoubleLong.java
        DiscordDiscoveryPerformanceNew.java
        DiscordDiscoveryPerformanceOld.java
        JmotifRSAXTestHelper.java
        KalpakisConverter.java
        MoviePrinter.java
        ParallelPerformanceEvaluation.java
        PrintSAXProcess.java
        util
        HeatChart.java
        JmotifMapEntry.java
        SortedArrayList.java
        StackTrace.java
        StdRandom.java
        UCRUtils.java
  - test
    - java
      - net
        seninp
        jmotif
        cbf
        TestCBFGenerator.java
        distance
        TestEuclideanDistance.java
        issues
        TestIssue11.java
        sax
        TestApproxDistance.java
        TestNRStrategy.java
        TestNormalAlphabet.java
        TestSAXException.java
        TestSAXProcessor.java
        TestShingling.java
        TestTSProcessor.java
        datastructures
        TestDoublyLinkedSortedList.java
        TestFrequencyTableEntry.java
        TestMagicArrayEntry.java
        TestMotifDiscovery.java
        TestSAXRecord.java
        TestSAXRecords.java
        discord
        TestDiscordDiscoveryEXACT.java
        TestDiscordDiscoveryMINDIST.java
        TestDiscordDiscoveryNONE.java
        TestDiscordRecord.java
        TestDiscordRecords.java
        parallel
        TestParallelInterruption.java
        TestParallelSAXImplementation.java
        registry
        TestVisitRegistry.java
        util
        TestMapEntry.java
        TestStackTrace.java
        TestUCRUtils.java

package net.seninp.jmotif.sax;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Arrays;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import net.seninp.jmotif.sax.alphabet.Alphabet;

/**
 * Implements algorithms for low-level timeseries manipulation: reading values from text files,
 * basic descriptive statistics, z-normalization, PAA approximation, and conversion of numeric
 * values into letters or cut indexes.
 * 
 * @author Pavel Senin
 * 
 */
public class TSProcessor {

  /** The charset used for reading the data files. */
  private static final Charset DEFAULT_CHARSET = StandardCharsets.UTF_8;

  /** The latin alphabet, lower case letters a-z. */
  public static final char[] ALPHABET = { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
      'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z' };

  // the class-wide logger
  //
  private static final Logger LOGGER = LoggerFactory.getLogger(TSProcessor.class);

  /**
   * Constructor.
   */
  public TSProcessor() {
    super();
  }

  /**
   * Reads a timeseries from a single column of a whitespace-separated text file. Rows whose value
   * in the requested column can not be parsed as a double are skipped.
   * 
   * @param filename The file to read from.
   * @param columnIdx The zero-based index of the column to read.
   * @param sizeLimit The maximal number of values to read, 0 == all.
   * @return the values read.
   * @throws IOException if an I/O error occurs.
   * @throws SAXException if the file does not exist or a row has no column with the given index.
   */
  public static double[] readFileColumn(String filename, int columnIdx, int sizeLimit)
      throws IOException, SAXException {

    // make sure the path exists
    Path path = Paths.get(filename);
    if (!(Files.exists(path))) {
      throw new SAXException("unable to load data - data source not found.");
    }

    // the reader is closed by readTS
    BufferedReader br = new BufferedReader(
        new InputStreamReader(new FileInputStream(filename), DEFAULT_CHARSET));

    return readTS(br, columnIdx, sizeLimit);
  }

  /**
   * Reads a timeseries from a reader. Every line is trimmed and split on whitespace, and the token
   * at the requested column index is parsed as a double. Rows whose token can not be parsed are
   * logged and skipped; they do not count towards the size limit. The reader is always closed
   * before this method returns or throws.
   * 
   * @param br The reader to use.
   * @param columnIdx The zero-based index of the column to read.
   * @param sizeLimit The maximal number of values to read, 0 == all.
   * @return the values read.
   * @throws IOException if an I/O error occurs.
   * @throws SAXException if the column index is negative or a row has too few columns.
   */
  public static double[] readTS(BufferedReader br, int columnIdx, int sizeLimit)
      throws IOException, SAXException {

    ArrayList<Double> values = new ArrayList<Double>();

    // try-with-resources guarantees the reader is closed on every exit path
    try (BufferedReader reader = br) {
      String line;
      for (int rowIdx = 0; (line = reader.readLine()) != null; rowIdx++) {
        String[] tokens = line.trim().split("\\s+");
        // the index is valid only if it is strictly less than the number of tokens
        if ((columnIdx < 0) || (tokens.length <= columnIdx)) {
          throw new SAXException("Unable to read data from column " + columnIdx);
        }
        String str = tokens[columnIdx];
        double num;
        try {
          num = Double.parseDouble(str);
        }
        catch (NumberFormatException e) {
          LOGGER.info("Skipping the row {} with value \"{}\"", rowIdx, str);
          continue;
        }
        values.add(num);
        if ((0 != sizeLimit) && (values.size() >= sizeLimit)) {
          break;
        }
      }
    }

    double[] res = new double[values.size()];
    for (int i = 0; i < res.length; i++) {
      res[i] = values.get(i);
    }
    return res;

  }

  /**
   * Reads at most N values from the first column of a file.
   * 
   * @param dataFileName the file name.
   * @param loadLimit the maximal number of values to load, 0 == all.
   * @return the read data or empty array if nothing to load.
   * @throws SAXException if the file does not exist or can not be parsed.
   * @throws IOException if an I/O error occurs.
   */
  public double[] readTS(String dataFileName, int loadLimit) throws SAXException, IOException {

    Path path = Paths.get(dataFileName);
    if (!(Files.exists(path))) {
      throw new SAXException("unable to load data - data source not found.");
    }

    // the reader is closed by the delegate
    BufferedReader reader = Files.newBufferedReader(path, DEFAULT_CHARSET);

    return readTS(reader, 0, loadLimit);

  }

  /**
   * Finds the maximal value in timeseries. NaN values never compare as greater and are therefore
   * ignored.
   * 
   * @param series The timeseries.
   * @return The max value, or {@link Double#NEGATIVE_INFINITY} if the series has no value that
   * compares greater than it (for example, when it is empty).
   */
  public double max(double[] series) {
    // NOT Double.MIN_VALUE: that is the smallest positive double, which breaks all-negative input
    double max = Double.NEGATIVE_INFINITY;
    for (double value : series) {
      if (max < value) {
        max = value;
      }
    }
    return max;
  }

  /**
   * Finds the minimal value in timeseries. NaN values never compare as smaller and are therefore
   * ignored.
   * 
   * @param series The timeseries.
   * @return The min value, or {@link Double#MAX_VALUE} if the series has no smaller value (for
   * example, when it is empty).
   */
  public double min(double[] series) {
    double min = Double.MAX_VALUE;
    for (double value : series) {
      if (min > value) {
        min = value;
      }
    }
    return min;
  }

  /**
   * Computes the mean value of timeseries.
   * 
   * @param series The timeseries.
   * @return The mean value, or NaN if the series is empty.
   */
  public double mean(double[] series) {
    if (series.length == 0) {
      return Double.NaN;
    }
    double sum = 0D;
    for (double tp : series) {
      sum += tp;
    }
    return sum / series.length;
  }

  /**
   * Computes the mean value of timeseries.
   * 
   * @param series The timeseries.
   * @return The mean value, or NaN if the series is empty.
   */
  public double mean(int[] series) {
    if (series.length == 0) {
      return Double.NaN;
    }
    double sum = 0D;
    for (int tp : series) {
      sum += (double) tp;
    }
    return sum / series.length;
  }

  /**
   * Computes the median value of timeseries. The input array is not modified; a sorted copy is
   * used.
   * 
   * @param series The timeseries.
   * @return The median value, or NaN if the series is empty.
   */
  public double median(double[] series) {
    if (series.length == 0) {
      return Double.NaN;
    }
    double[] sorted = series.clone();
    Arrays.sort(sorted);

    int mid = sorted.length / 2;
    if (sorted.length % 2 == 0) {
      // even length - average the two central values
      return (sorted[mid] + sorted[mid - 1]) / 2;
    }
    return sorted[mid];
  }

  /**
   * Compute the sample variance of timeseries, i.e. the sum of squared deviations from the mean
   * divided by (N - 1).
   * 
   * @param series The timeseries.
   * @return The variance, or NaN if the series has fewer than two values.
   */
  public double var(double[] series) {
    int count = series.length;
    if (count == 0) {
      return Double.NaN;
    }
    double mean = mean(series);
    double sumOfSquares = 0D;
    for (double tp : series) {
      sumOfSquares += (tp - mean) * (tp - mean);
    }
    // for a single value this is 0/0, i.e. NaN
    return sumOfSquares / (double) (count - 1);
  }

  /**
   * Computes the sample standard deviation in a single pass over the data, using the sum and the
   * sum of squares.
   * 
   * @param series The timeseries.
   * @return the standard deviation, or NaN if the series has fewer than two values.
   */
  public double stDev(double[] series) {
    double sumOfSquares = 0D;
    double sum = 0D;
    for (double tp : series) {
      sumOfSquares = sumOfSquares + tp * tp;
      sum = sum + tp;
    }
    double len = series.length;
    double numerator = len * sumOfSquares - sum * sum;
    // rounding errors may push the numerator slightly below zero for (nearly) constant series;
    // clamp it so that the square root yields 0 instead of NaN
    if (numerator < 0D) {
      numerator = 0D;
    }
    return Math.sqrt(numerator / (len * (len - 1)));
  }

  /**
   * Z-Normalize routine: subtracts the mean and divides by the standard deviation. If the standard
   * deviation is below the threshold, an array of zeros is returned instead.
   * 
   * @param series the input timeseries.
   * @param normalizationThreshold the zNormalization threshold value.
   * @return Z-normalized time-series (a new array, the input is not modified).
   */
  public double[] znorm(double[] series, double normalizationThreshold) {
    double[] res = new double[series.length];
    double sd = stDev(series);
    if (sd < normalizationThreshold) {
      // the series is considered flat - return the array of zeros
      return res;
    }
    double mean = mean(series);
    for (int i = 0; i < res.length; i++) {
      res[i] = (series[i] - mean) / sd;
    }
    return res;
  }

  /**
   * Approximate the timeseries using PAA. The series is divided into paaSize segments of equal
   * (possibly fractional) length; a point that straddles a segment border contributes to each of
   * the two segments proportionally to its overlap. NaN values are not treated specially: a NaN
   * within a segment makes the mean of that segment NaN.
   * 
   * @param ts The timeseries to approximate.
   * @param paaSize The desired length of approximated timeseries.
   * @return PAA-approximated timeseries; a copy of the input if its length equals paaSize.
   * @throws SAXException if paaSize is greater than the timeseries length.
   * 
   */
  public double[] paa(double[] ts, int paaSize) throws SAXException {
    int len = ts.length;
    if (len < paaSize) {
      throw new SAXException("PAA size can't be greater than the timeseries size.");
    }
    // check for the trivial case
    if (len == paaSize) {
      return Arrays.copyOf(ts, ts.length);
    }

    double[] paa = new double[paaSize];
    double pointsPerSegment = (double) len / (double) paaSize;

    for (int i = 0; i < paaSize; i++) {
      // the fractional borders of the i-th segment
      double segStart = i * pointsPerSegment;
      double segEnd = (i + 1) * pointsPerSegment;

      // the weights of the partially covered first and last points
      double fractionStart = Math.ceil(segStart) - segStart;
      double fractionEnd = segEnd - Math.floor(segEnd);

      // all points touched by the segment, including partially covered ones
      int fullStart = (int) Math.floor(segStart);
      int fullEnd = (int) Math.ceil(segEnd);

      double[] segment = Arrays.copyOfRange(ts, fullStart, fullEnd);

      if (fractionStart > 0) {
        segment[0] = segment[0] * fractionStart;
      }

      if (fractionEnd > 0) {
        segment[segment.length - 1] = segment[segment.length - 1] * fractionEnd;
      }

      double elementsSum = 0.0;
      for (double e : segment) {
        elementsSum = elementsSum + e;
      }

      paa[i] = elementsSum / pointsPerSegment;
    }
    return paa;
  }

  /**
   * Converts the timeseries into string using given cuts intervals. Useful for not-normal
   * distribution cuts.
   * 
   * @param vals The timeseries.
   * @param cuts The cut intervals.
   * @return The timeseries SAX representation.
   */
  public char[] ts2String(double[] vals, double[] cuts) {
    char[] res = new char[vals.length];
    for (int i = 0; i < vals.length; i++) {
      res[i] = num2char(vals[i], cuts);
    }
    return res;
  }

  /**
   * Convert the timeseries into the index using SAX cuts.
   * 
   * @param series The timeseries to convert.
   * @param alphabet The alphabet to use.
   * @param alphabetSize The alphabet size in use.
   * @return SAX representation of timeseries.
   * @throws Exception if error occurs.
   */
  public int[] ts2Index(double[] series, Alphabet alphabet, int alphabetSize) throws Exception {
    double[] cuts = alphabet.getCuts(alphabetSize);
    int[] res = new int[series.length];
    for (int i = 0; i < series.length; i++) {
      res[i] = num2index(series[i], cuts);
    }
    return res;
  }

  /**
   * Get mapping of a number to char: the letter of {@link #ALPHABET} at the position returned by
   * {@link #num2index(double, double[])}. Since the alphabet has 26 letters, at most 25 cuts are
   * supported.
   * 
   * @param value the value to map.
   * @param cuts the array of intervals.
   * @return character corresponding to numeric value.
   */
  public char num2char(double value, double[] cuts) {
    return ALPHABET[num2index(value, cuts)];
  }

  /**
   * Converts index into char.
   * 
   * @param idx The index value, must be within the bounds of {@link #ALPHABET}.
   * @return The char by index.
   */
  public char num2char(int idx) {
    return ALPHABET[idx];
  }

  /**
   * Get mapping of number to cut index: the number of leading cuts which are less than or equal to
   * the value.
   * 
   * @param value the value to map.
   * @param cuts the array of intervals.
   * @return the cut index corresponding to numeric value, between 0 and cuts.length inclusive.
   */
  public int num2index(double value, double[] cuts) {
    int count = 0;
    while ((count < cuts.length) && (cuts[count] <= value)) {
      count++;
    }
    return count;
  }

  /**
   * Extract subseries out of series.
   * 
   * @param series The series array.
   * @param start the fragment start (inclusive).
   * @param end the fragment end (exclusive).
   * @return The subseries, a new array.
   * @throws IndexOutOfBoundsException if start is negative, start is greater than end, or end is
   * greater than the series length.
   */
  public double[] subseriesByCopy(double[] series, int start, int end)
      throws IndexOutOfBoundsException {
    if ((start > end) || (start < 0) || (end > series.length)) {
      throw new IndexOutOfBoundsException("Unable to extract subseries, series length: "
          + series.length + ", start: " + start + ", end: " + end);
    }
    return Arrays.copyOfRange(series, start, end);
  }

  /**
   * Prettyfies the timeseries for screen output: the formatted values are separated by commas and
   * enclosed into square brackets. An empty series yields "[]".
   * 
   * @param series the data.
   * @param df the number format to use.
   * 
   * @return The timeseries formatted for screen output.
   */
  public String seriesToString(double[] series, NumberFormat df) {
    StringBuilder sb = new StringBuilder();
    sb.append('[');
    for (int i = 0; i < series.length; i++) {
      // the separator goes between the values only, so there is nothing to trim afterwards
      if (i > 0) {
        sb.append(',');
      }
      sb.append(df.format(series[i]));
    }
    sb.append(']');
    return sb.toString();
  }

  /**
   * Scales the data by dividing each value by the maximal value of the dataset. The result lies
   * within the interval 0-1 only if all values are non-negative and the maximum is positive; no
   * check is made for a zero maximum, in which case the division yields NaN or infinite values.
   * 
   * @param data the dataset.
   * @return normalized dataset (a new array, the input is not modified).
   */
  public double[] normOne(double[] data) {
    double[] res = new double[data.length];
    double max = max(data);
    for (int i = 0; i < data.length; i++) {
      res[i] = data[i] / max;
    }
    return res;
  }

}