package net.seninp.jmotif.sax;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Arrays;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import net.seninp.jmotif.sax.alphabet.Alphabet;

/**
 * Implements algorithms for low-level data manipulation.
 *
 * @author Pavel Senin
 *
 */
public class TSProcessor {

  /** The charset used for decoding all input files. */
  private static final Charset DEFAULT_CHARSET = StandardCharsets.UTF_8;

  /** The latin alphabet, lower case letters a-z. */
  public static final char[] ALPHABET = { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
      'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z' };

  // the class logger
  //
  private static final Logger LOGGER = LoggerFactory.getLogger(TSProcessor.class);

  /**
   * Constructor.
   */
  public TSProcessor() {
    super();
  }

  /**
   * Reads a timeseries from a column of a whitespace-separated text file. Rows whose value in the
   * requested column can not be parsed as a double are skipped.
   *
   * @param filename The file to read from.
   * @param columnIdx The zero-based column index.
   * @param sizeLimit The number of values to read, 0 == all.
   * @return data.
   * @throws IOException if error occurs.
   * @throws SAXException if the file does not exist or a row has no such column.
   */
  public static double[] readFileColumn(String filename, int columnIdx, int sizeLimit)
      throws IOException, SAXException {

    // make sure the path exists
    Path path = Paths.get(filename);
    if (!(Files.exists(path))) {
      throw new SAXException("unable to load data - data source not found.");
    }

    // try-with-resources guarantees the stream is released even if reading fails;
    // readTS closes the reader as well, closing it twice is harmless
    try (BufferedReader br = new BufferedReader(
        new InputStreamReader(new FileInputStream(filename), DEFAULT_CHARSET))) {
      return readTS(br, columnIdx, sizeLimit);
    }
  }

  /**
   * Reads a timeseries from a reader. Every line is trimmed and split on whitespace, the value at
   * the requested column is parsed as a double. Rows whose value can not be parsed are logged and
   * skipped. The reader is always closed before this method returns or throws.
   *
   * @param br The reader to use.
   * @param columnIdx The zero-based column index.
   * @param sizeLimit The number of values to read, 0 == all.
   * @return data.
   * @throws IOException if error occurs.
   * @throws SAXException if a row has no column with the requested index.
   */
  public static double[] readTS(BufferedReader br, int columnIdx, int sizeLimit)
      throws IOException, SAXException {

    ArrayList<Double> preRes = new ArrayList<Double>();
    // counts the successfully parsed values, not the physical lines of the input
    int lineCounter = 0;

    try (BufferedReader reader = br) {
      String line;
      while ((line = reader.readLine()) != null) {
        String[] split = line.trim().split("\\s+");
        // the index must be strictly less than the number of columns
        if (columnIdx < 0 || split.length <= columnIdx) {
          throw new SAXException("Unable to read data from column " + columnIdx);
        }
        String str = split[columnIdx];
        double num;
        try {
          num = Double.parseDouble(str);
        }
        catch (NumberFormatException e) {
          LOGGER.info("Skipping the row {} with value \"{}\"", lineCounter, str);
          continue;
        }
        preRes.add(num);
        lineCounter++;
        if ((0 != sizeLimit) && (lineCounter >= sizeLimit)) {
          break;
        }
      }
    }

    double[] res = new double[preRes.size()];
    for (int i = 0; i < res.length; i++) {
      res[i] = preRes.get(i);
    }
    return res;
  }

  /**
   * Reads at most N values from the first column of a file.
   *
   * @param dataFileName the file name.
   * @param loadLimit the load limit, 0 == all.
   * @return the read data or empty array if nothing to load.
   * @throws SAXException if the file does not exist or can not be parsed.
   * @throws IOException if error occurs.
   */
  public double[] readTS(String dataFileName, int loadLimit) throws SAXException, IOException {

    Path path = Paths.get(dataFileName);
    if (!(Files.exists(path))) {
      throw new SAXException("unable to load data - data source not found.");
    }

    // the reader is closed by readTS
    BufferedReader reader = Files.newBufferedReader(path, DEFAULT_CHARSET);
    return readTS(reader, 0, loadLimit);
  }

  /**
   * Finds the maximal value in timeseries. NaN values never win a comparison and are therefore
   * ignored.
   *
   * @param series The timeseries.
   * @return The max value, or {@link Double#NEGATIVE_INFINITY} if the series is empty.
   */
  public double max(double[] series) {
    // Double.MIN_VALUE is the smallest POSITIVE double and is not a valid seed here
    double max = Double.NEGATIVE_INFINITY;
    for (double value : series) {
      if (max < value) {
        max = value;
      }
    }
    return max;
  }

  /**
   * Finds the minimal value in timeseries. NaN values never win a comparison and are therefore
   * ignored.
   *
   * @param series The timeseries.
   * @return The min value, or {@link Double#POSITIVE_INFINITY} if the series is empty.
   */
  public double min(double[] series) {
    double min = Double.POSITIVE_INFINITY;
    for (double value : series) {
      if (min > value) {
        min = value;
      }
    }
    return min;
  }

  /**
   * Computes the mean value of timeseries.
   *
   * @param series The timeseries.
   * @return The mean value, or NaN if the series is empty.
   */
  public double mean(double[] series) {
    if (series.length == 0) {
      return Double.NaN;
    }
    double sum = 0D;
    for (double tp : series) {
      sum += tp;
    }
    return sum / series.length;
  }

  /**
   * Computes the mean value of timeseries.
   *
   * @param series The timeseries.
   * @return The mean value, or NaN if the series is empty.
   */
  public double mean(int[] series) {
    if (series.length == 0) {
      return Double.NaN;
    }
    double sum = 0D;
    for (int tp : series) {
      sum += tp;
    }
    return sum / series.length;
  }

  /**
   * Computes the median value of timeseries. The input array is not modified.
   *
   * @param series The timeseries.
   * @return The median value, or NaN if the series is empty.
   */
  public double median(double[] series) {
    if (series.length == 0) {
      return Double.NaN;
    }
    double[] sorted = series.clone();
    Arrays.sort(sorted);
    int mid = sorted.length / 2;
    if (sorted.length % 2 == 0) {
      return (sorted[mid] + sorted[mid - 1]) / 2;
    }
    return sorted[mid];
  }

  /**
   * Compute the variance of timeseries, the sum of squared deviations from the mean is divided by
   * (n - 1).
   *
   * @param series The timeseries.
   * @return The variance, or NaN if the series has less than two values.
   */
  public double var(double[] series) {
    if (series.length == 0) {
      return Double.NaN;
    }
    double mean = mean(series);
    double res = 0D;
    for (double tp : series) {
      res += (tp - mean) * (tp - mean);
    }
    // for a single value this is 0/0, i.e. NaN
    return res / (double) (series.length - 1);
  }

  /**
   * Speed-optimized, single pass implementation of the standard deviation (n - 1 in the
   * denominator).
   *
   * @param series The timeseries.
   * @return the standard deviation, or NaN if the series has less than two values.
   */
  public double stDev(double[] series) {
    double sumOfSquares = 0D;
    double sum = 0D;
    for (double tp : series) {
      sumOfSquares += tp * tp;
      sum += tp;
    }
    double len = series.length;
    double variance = (len * sumOfSquares - sum * sum) / (len * (len - 1));
    // rounding may push the variance of a (near) constant series slightly below zero, clamp it
    // so the result is 0 and not NaN; Math.max keeps NaN for the degenerate 0/0 case
    return Math.sqrt(Math.max(0D, variance));
  }

  /**
   * Z-Normalize routine.
   *
   * @param series the input timeseries.
   * @param normalizationThreshold the zNormalization threshold value.
   * @return Z-normalized time-series, or an array of zeros if the standard deviation is below the
   * threshold.
   */
  public double[] znorm(double[] series, double normalizationThreshold) {
    double[] res = new double[series.length];
    double mean = mean(series);
    double sd = stDev(series);
    if (sd < normalizationThreshold) {
      // too flat to normalize - return array of zeros
      return res;
    }
    for (int i = 0; i < res.length; i++) {
      res[i] = (series[i] - mean) / sd;
    }
    return res;
  }

  /**
   * Approximate the timeseries using PAA. The series is cut into paaSize segments of equal,
   * possibly fractional, length; a point lying on a segment border contributes to both segments
   * proportionally to its overlap with each. No special NaN handling is performed: a NaN inside a
   * segment propagates into that segment's value through the arithmetic.
   *
   * @param ts The timeseries to approximate.
   * @param paaSize The desired length of approximated timeseries.
   * @return PAA-approximated timeseries.
   * @throws SAXException if the PAA size is greater than the timeseries length.
   *
   */
  public double[] paa(double[] ts, int paaSize) throws SAXException {
    int len = ts.length;
    if (len < paaSize) {
      throw new SAXException("PAA size can't be greater than the timeseries size.");
    }
    // check for the trivial case
    if (len == paaSize) {
      return Arrays.copyOf(ts, ts.length);
    }

    double[] paa = new double[paaSize];
    double pointsPerSegment = (double) len / (double) paaSize;

    // segment borders expressed in (fractional) point indexes
    double[] breaks = new double[paaSize + 1];
    for (int i = 0; i < paaSize + 1; i++) {
      breaks[i] = i * pointsPerSegment;
    }

    for (int i = 0; i < paaSize; i++) {
      double segStart = breaks[i];
      double segEnd = breaks[i + 1];

      // the share of the first and of the last point that falls into this segment
      double fractionStart = Math.ceil(segStart) - segStart;
      double fractionEnd = segEnd - Math.floor(segEnd);

      int fullStart = (int) Math.floor(segStart);
      int fullEnd = (int) Math.ceil(segEnd);

      // a copy, so down-weighting the border points does not touch the input
      double[] segment = Arrays.copyOfRange(ts, fullStart, fullEnd);

      if (fractionStart > 0) {
        segment[0] = segment[0] * fractionStart;
      }
      if (fractionEnd > 0) {
        segment[segment.length - 1] = segment[segment.length - 1] * fractionEnd;
      }

      double elementsSum = 0.0;
      for (double e : segment) {
        elementsSum = elementsSum + e;
      }
      paa[i] = elementsSum / pointsPerSegment;
    }
    return paa;
  }

  /**
   * Converts the timeseries into string using given cuts intervals. Useful for not-normal
   * distribution cuts.
   *
   * @param vals The timeseries.
   * @param cuts The cut intervals.
   * @return The timeseries SAX representation.
   */
  public char[] ts2String(double[] vals, double[] cuts) {
    char[] res = new char[vals.length];
    for (int i = 0; i < vals.length; i++) {
      res[i] = num2char(vals[i], cuts);
    }
    return res;
  }

  /**
   * Convert the timeseries into the index using SAX cuts.
   *
   * @param series The timeseries to convert.
   * @param alphabet The alphabet to use.
   * @param alphabetSize The alphabet size in use.
   * @return SAX representation of timeseries.
   * @throws Exception if error occurs.
   */
  public int[] ts2Index(double[] series, Alphabet alphabet, int alphabetSize) throws Exception {
    double[] cuts = alphabet.getCuts(alphabetSize);
    int[] res = new int[series.length];
    for (int i = 0; i < series.length; i++) {
      res[i] = num2index(series[i], cuts);
    }
    return res;
  }

  /**
   * Get mapping of a number to char. The char is the letter of {@link #ALPHABET} whose position
   * equals the number of leading cuts that are less than or equal to the value, so the cuts array
   * must be shorter than the alphabet.
   *
   * @param value the value to map.
   * @param cuts the array of intervals.
   * @return character corresponding to numeric value.
   */
  public char num2char(double value, double[] cuts) {
    int count = 0;
    while ((count < cuts.length) && (cuts[count] <= value)) {
      count++;
    }
    return ALPHABET[count];
  }

  /**
   * Converts index into char.
   *
   * @param idx The index value, a valid position within {@link #ALPHABET}.
   * @return The char by index.
   */
  public char num2char(int idx) {
    return ALPHABET[idx];
  }

  /**
   * Get mapping of number to cut index, i.e. the number of leading cuts that are less than or equal
   * to the value.
   *
   * @param value the value to map.
   * @param cuts the array of intervals.
   * @return the index corresponding to numeric value.
   */
  public int num2index(double value, double[] cuts) {
    int count = 0;
    while ((count < cuts.length) && (cuts[count] <= value)) {
      count++;
    }
    return count;
  }

  /**
   * Extract subseries out of series.
   *
   * @param series The series array.
   * @param start the fragment start, inclusive.
   * @param end the fragment end, exclusive.
   * @return The subseries.
   * @throws IndexOutOfBoundsException if the requested interval is not within the series.
   */
  public double[] subseriesByCopy(double[] series, int start, int end)
      throws IndexOutOfBoundsException {
    if ((start > end) || (start < 0) || (end > series.length)) {
      throw new IndexOutOfBoundsException("Unable to extract subseries, series length: "
          + series.length + ", start: " + start + ", end: " + end);
    }
    return Arrays.copyOfRange(series, start, end);
  }

  /**
   * Prettyfies the timeseries for screen output: the formatted values are separated by commas and
   * enclosed in square brackets.
   *
   * @param series the data.
   * @param df the number format to use.
   *
   * @return The timeseries formatted for screen output, "[]" for an empty series.
   */
  public String seriesToString(double[] series, NumberFormat df) {
    StringBuilder sb = new StringBuilder();
    sb.append('[');
    for (int i = 0; i < series.length; i++) {
      // the separator goes in front of every value but the first, so none is left dangling
      if (i > 0) {
        sb.append(',');
      }
      sb.append(df.format(series[i]));
    }
    return sb.append(']').toString();
  }

  /**
   * Scales the data by dividing every value by the maximal value of the dataset. No shift is
   * applied, so the result is within 0-1 only for non-negative data with a positive maximum.
   *
   * @param data the dataset.
   * @return normalized dataset.
   */
  public double[] normOne(double[] data) {
    double[] res = new double[data.length];
    double max = max(data);
    for (int i = 0; i < data.length; i++) {
      res[i] = data[i] / max;
    }
    return res;
  }

}