package com.linkedin.thirdeye.dataframe;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import org.apache.commons.lang.ArrayUtils;


/**
 * Container for a one-dimensional series of elements with a common primitive type.
 * Supports transparent conversion between different primitive types and implements
 * common logic for element management, transformation and aggregation.
 *
 * Series are designed to be immutable (albeit with some limitations due to Java's
 * primitive array model). Operations return new Series instances without modifying
 * the underlying data structures.
 */
public abstract class Series {
  public static final String GROUP_KEY = "key";
  public static final String GROUP_VALUE = "value";
  public static final String TOSTRING_NULL = "null";

  public enum SeriesType {
    DOUBLE,
    LONG,
    STRING,
    BOOLEAN
  }

  enum JoinType {
    INNER,
    OUTER,
    LEFT,
    RIGHT
  }

  /**
   * Top-level interface to denote a function that may be applied to one (or multiple) series.
   * Functions may be applied either row-by-row across multiple series or to all values within
   * a single series.
   * <br/><b>NOTE:</b> Functions MAY NOT receive a {@code null} value as an input. Rather, if
   * any one of the input values is {@code null}, the result is set to {@code null} by the
   * Series framework.
   * <br/><b>NOTE:</b> Function MAY return {@code null} as a result, however.
   */
  public interface Function {
    // left blank
  }

  /**
   * Marker interface for functions that evaluate to a boolean result.
   */
  public interface Conditional extends Function {
    // left blank
  }

  // @FunctionalInterface
  public interface DoubleConditional extends Conditional {
    boolean apply(double... values);
  }

  // @FunctionalInterface
  public interface LongConditional extends Conditional {
    boolean apply(long... values);
  }

  // @FunctionalInterface
  public interface StringConditional extends Conditional {
    boolean apply(String... values);
  }

  // @FunctionalInterface
  public interface BooleanConditional extends Conditional {
    boolean apply(boolean... values);
  }

  // @FunctionalInterface
  public interface DoubleFunction extends Function {
    double NULL = DoubleSeries.NULL;
    double apply(double... values);
  }

  // @FunctionalInterface
  public interface LongFunction extends Function {
    long NULL = LongSeries.NULL;
    long apply(long... values);
  }

  // @FunctionalInterface
  public interface StringFunction extends Function {
    String NULL = StringSeries.NULL;
    String apply(String... values);
  }

  // @FunctionalInterface
  public interface BooleanFunction extends Function {
    boolean apply(boolean... values);
  }

  // @FunctionalInterface
  public interface BooleanFunctionEx extends Function {
    byte TRUE = BooleanSeries.TRUE;
    byte FALSE = BooleanSeries.FALSE;
    byte NULL = BooleanSeries.NULL;
    byte apply(byte... values);
  }

  /**
   * Helper container for references generated by grouping
   */
  public static final class Bucket {
    final int[] fromIndex;

    Bucket(int[] fromIndex) {
      this.fromIndex = fromIndex;
    }

    public int size() {
      return this.fromIndex.length;
    }
  }

  /**
   * Base class for specialized Series builders
   */
  public static abstract class Builder {
    public abstract Series build();

    public abstract Builder addSeries(Collection<Series> series);

    public Builder addSeries(Series... series) {
      return this.addSeries(Arrays.asList(series));
    }
  }

  /**
   * Grouping container referencing a single series. Holds group keys and the indices of group
   * elements in the source series. Enables aggregation with custom user functions.
   */
  public static final class SeriesGrouping {
    final Series keys;
    final Series source;
    final List<Bucket> buckets;

    SeriesGrouping(Series keys, Series source, List<Bucket> buckets) {
      if (keys.size() != buckets.size()) {
        throw new IllegalArgumentException("key series and bucket count must be equal");
      }
      this.keys = keys;
      this.source = source;
      this.buckets = buckets;
    }

    /**
     * Creates an empty grouping (no keys, no buckets) over {@code source}.
     */
    SeriesGrouping(Series source) {
      this.keys = LongSeries.buildFrom();
      this.source = source;
      this.buckets = Collections.emptyList();
    }

    /**
     * Applies index-based groups to a different series. Used by DataFrame for grouping across
     * multiple series.
     *
     * @param s other series
     * @return SeriesGrouping with the same keys and buckets, but {@code s} as source
     */
    SeriesGrouping applyTo(Series s) {
      return new SeriesGrouping(this.keys, s, this.buckets);
    }

    /**
     * Returns the number of groups
     *
     * @return group count
     */
    public int size() {
      return this.keys.size();
    }

    /**
     * Returns the keys of each group in the container as series.
     *
     * @return key series
     */
    public Series keys() {
      return this.keys;
    }

    /**
     * Returns the source series this grouping applies to.
     *
     * @return source series
     */
    public Series source() {
      return this.source;
    }

    /**
     * Returns {@code true} if the grouping container does not hold any groups.
     *
     * @return {@code true} if empty, {@code false} otherwise.
     */
    public boolean isEmpty() {
      return this.keys.isEmpty();
    }

    /**
     * Applies {@code function} as aggregation function to all values per group and
     * returns the result as a new DataFrame with the number of elements equal to the size
     * of the key series.
     * If the series' native types do not match the required input type of {@code function},
     * the series are converted transparently. The native type of the aggregated series is
     * determined by {@code function}'s output type.
     *
     * <br/><b>NOTE:</b> if the grouping holds no buckets, the output type cannot be derived
     * from an aggregation result and the (empty) value series uses the source's native type.
     *
     * @param function aggregation function to map to each grouped series
     * @return grouped aggregation series
     */
    public DataFrame aggregate(Function function) {
      // The builder is taken from the first aggregation result rather than from the source,
      // so that the value series keeps the function's output type (e.g. a DoubleFunction
      // applied to a LongSeries yields doubles instead of being converted back to longs).
      Builder builder = null;
      for (Bucket b : this.buckets) {
        Series aggregated = this.source.project(b.fromIndex).aggregate(function);
        if (builder == null) {
          builder = aggregated.getBuilder();
        }
        builder.addSeries(aggregated);
      }

      if (builder == null) {
        // no buckets, fall back to the source's native type
        builder = this.source.getBuilder();
      }

      return makeAggregate(this.keys, builder.build());
    }

    /**
     * Counts the number of elements in each group and returns the result as a new DataFrame
     * with the number of elements equal to the size of the key series.
     *
     * @return grouped aggregation series
     */
    public DataFrame count() {
      LongSeries.Builder builder = LongSeries.builder();
      for (Bucket b : this.buckets) {
        builder.addValues(b.size());
      }
      return makeAggregate(this.keys, builder.build());
    }

    /**
     * Wraps group keys and aggregated values into a DataFrame with the columns
     * {@code GROUP_KEY} and {@code GROUP_VALUE}.
     */
    static DataFrame makeAggregate(Series keys, Series values) {
      DataFrame df = new DataFrame();
      df.addSeries(GROUP_KEY, keys);
      df.addSeries(GROUP_VALUE, values);
      return df;
    }
  }

  /**
   * Helper container for index-pairs generated by join logic
   */
  static final class JoinPair {
    final int left;
    final int right;

    public JoinPair(int left, int right) {
      this.left = left;
      this.right = right;
    }

    @Override
    public boolean equals(Object o) {
      if (this == o) {
        return true;
      }
      if (o == null || getClass() != o.getClass()) {
        return false;
      }
      JoinPair joinPair = (JoinPair) o;
      return (left == joinPair.left) && (right == joinPair.right);
    }

    @Override
    public int hashCode() {
      int result = left;
      result = 31 * result + right;
      return result;
    }
  }

  /* *************************************************************************
   * Public abstract interface
   * *************************************************************************/

  /**
   * Returns the number of elements contained in the series.
   *
   * <br/><b>NOTE:</b> {@code null} values count as elements.
   *
   * @return series size
   */
  public abstract int size();

  /**
   * Returns the series' native type.
   *
   * @return series type
   */
  public abstract SeriesType type();

  /**
   * Slices the series from index {@code from} (inclusive) to index {@code to}
   * (exclusive) and returns the result as a series of the same native type.
   *
   * @param from start index (inclusive), must be >= 0
   * @param to end index (exclusive), must be <= size
   * @return sliced series copy
   */
  public abstract Series slice(int from, int to);

  /**
   * Returns the value referenced by {@code index} as double. The value is converted
   * transparently if the native type of the underlying series is different. The
   * {@code index} must be between {@code 0} and the size of the series.
   *
   * @param index index of value
   * @throws IndexOutOfBoundsException if index is outside the series bounds
   * @return double value
   */
  public abstract double getDouble(int index);

  /**
   * Returns the value referenced by {@code index} as long. The value is converted
   * transparently if the native type of the underlying series is different. The
   * {@code index} must be between {@code 0} and the size of the series.
   *
   * @param index index of value
   * @throws IndexOutOfBoundsException if index is outside the series bounds
   * @return long value
   */
  public abstract long getLong(int index);

  /**
   * Returns the value referenced by {@code index} as byte (tri-state boolean).
   * The value is converted transparently if the native type of the underlying
   * series is different. The {@code index} must be between {@code 0} and the
   * size of the series.
   *
   * @param index index of value
   * @throws IndexOutOfBoundsException if index is outside the series bounds
   * @return byte value
   */
  public abstract byte getBoolean(int index);

  /**
   * Returns the value referenced by {@code index} as String. The value is converted
   * transparently if the native type of the underlying series is different. The
   * {@code index} must be between {@code 0} and the size of the series.
   *
   * @param index index of value
   * @throws IndexOutOfBoundsException if index is outside the series bounds
   * @return string value
   */
  public abstract String getString(int index);

  /**
   * Returns {@code true} if the value referenced by {@code index} is null. Otherwise,
   * returns {@code false}.
   *
   * @param index index of value
   * @throws IndexOutOfBoundsException if index is outside the series bounds
   * @return {@code true} if value is null, otherwise {@code false}
   */
  public abstract boolean isNull(int index);

  /**
   * Returns a human-readable String representation of the value referenced by {@code index}.
   *
   * @param index index of value
   * @throws IndexOutOfBoundsException if index is outside the series bounds
   * @return human-readable string representation
   */
  public abstract String toString(int index);

  /**
   * Returns a copy of the series with values ordered in ascending order.
   *
   * <br/><b>NOTE:</b> BooleanSeries interprets {@code false} as smaller than {@code true}.
   *
   * @return sorted series copy
   */
  public abstract Series sorted();

  /**
   * Returns a copy of the series with {@code null} values replaced by the series' default
   * value.
   *
   * @return series copy with filled nulls
   */
  public abstract Series fillNull();

  /**
   * Returns a new builder instance for the native type encapsulated by this series.
   *
   * @return series builder
   */
  public abstract Builder getBuilder();

  /**
   * Returns a copy of the series with values replaced by {@code null} for every row in
   * {@code mask} that is not {@code true}.
   *
   * @param mask series to filter by
   * @return filtered series copy
   */
  public abstract Series filter(BooleanSeries mask);

  /* *************************************************************************
   * Internal abstract interface
   * *************************************************************************/

  /**
   * Returns projection of the series.
   *
   * <br/><b>NOTE:</b> fromIndex <= -1 is filled with {@code null}.
   * <br/><b>NOTE:</b> array with length 0 returns empty series.
   * <br/><b>NOTE:</b> could replace {@code slice(int, int)}, but low performance
   *
   * @param fromIndex array with indices to project from (must be <= series size)
   * @return series projection
   */
  abstract Series project(int[] fromIndex);

  /**
   * Compares values across two series with potentially different native types based on index.
   * If the types are different the values in {@code that} are transparently converted to the
   * native type of this series.
   *
   * <br/><b>Note:</b> the transparent conversion may cause different behavior between
   * {@code this.compare(that)} and {@code that.compare(this)}.
   *
   * @param that other series to compare against (may reference itself)
   * @param indexThis index in this series
   * @param indexThat index in the other series
   * @return 0 if the referenced values are equal, -1 if {@code this} is less than {@code that}, 1 otherwise
   */
  abstract int compare(Series that, int indexThis, int indexThat);

  /**
   * Returns an array of indices with a size equal to the series size, such that the values
   * referenced by the indices are sorted in ascending order.
   *
   * <br/><b>NOTE:</b> output can be used directly by {@code project()} to create a sorted copy of the series.
   *
   * @return indices of sorted values
   */
  abstract int[] sortedIndex();

  /* *************************************************************************
   * Public interface
   * *************************************************************************/

  /**
   * Returns this series converted to type {@code type}.
   *
   * @param type target type
   * @throws IllegalArgumentException if {@code type} is not a known series type
   * @return converted series
   */
  public final Series get(Series.SeriesType type) {
    switch (type) {
      case DOUBLE:
        return this.getDoubles();
      case LONG:
        return this.getLongs();
      case BOOLEAN:
        return this.getBooleans();
      case STRING:
        return this.getStrings();
      default:
        throw new IllegalArgumentException(String.format("Unknown series type '%s'", type));
    }
  }

  /**
   * Returns the series as DoubleSeries. The underlying series is converted
   * transparently if the series' native type is different.
   *
   * @return DoubleSeries equivalent
   */
  public DoubleSeries getDoubles() {
    double[] values = new double[this.size()];
    for (int i = 0; i < values.length; i++) {
      values[i] = this.getDouble(i);
    }
    return DoubleSeries.buildFrom(values);
  }

  /**
   * Returns the series as LongSeries. The underlying series is converted
   * transparently if the series' native type is different.
   *
   * @return LongSeries equivalent
   */
  public LongSeries getLongs() {
    long[] values = new long[this.size()];
    for (int i = 0; i < values.length; i++) {
      values[i] = this.getLong(i);
    }
    return LongSeries.buildFrom(values);
  }

  /**
   * Returns the series as BooleanSeries. The underlying series is converted
   * transparently if the series' native type is different.
   *
   * @return BooleanSeries equivalent
   */
  public BooleanSeries getBooleans() {
    byte[] values = new byte[this.size()];
    for (int i = 0; i < values.length; i++) {
      values[i] = this.getBoolean(i);
    }
    return BooleanSeries.buildFrom(values);
  }

  /**
   * Returns the series as StringSeries. The underlying series is converted
   * transparently if the series' native type is different.
   *
   * @return StringSeries equivalent
   */
  public StringSeries getStrings() {
    String[] values = new String[this.size()];
    for (int i = 0; i < values.length; i++) {
      values[i] = this.getString(i);
    }
    return StringSeries.buildFrom(values);
  }

  /**
   * Returns a copy of the series with the same native type.
   *
   * @return series copy
   */
  public Series copy() {
    return this.slice(0, this.size());
  }

  /**
   * Returns a copy of the series with values from {@code other}
   * appended at the end. If {@code other} has different native types they are
   * converted transparently.
   *
   * <br/><b>NOTE:</b> newSize = oldSize + otherSize
   *
   * @param other other series to append at the end
   * @return concatenated series
   */
  public Series append(Series... other) {
    return this.getBuilder().addSeries(this).addSeries(other).build();
  }

  /**
   * Fills {@code null} values in the series with a copy of the last valid value. The index
   * is traversed in ascending order. If the last valid value does not exist (such as for the
   * first element in a series) it is left at {@code null}.
   *
   * @return forward filled series
   */
  public Series fillNullForward() {
    // -1 projects to null, i.e. no valid value encountered yet
    int lastValueIndex = -1;
    int[] fromIndex = new int[this.size()];
    for (int i = 0; i < fromIndex.length; i++) {
      if (!isNull(i)) {
        lastValueIndex = i;
      }
      fromIndex[i] = lastValueIndex;
    }
    return this.project(fromIndex);
  }

  /**
   * Fills {@code null} values in the series with a copy of the last valid value. The index
   * is traversed in descending order. If the last valid value does not exist (such as for the
   * last element in a series) it is left at {@code null}.
   *
   * @return backward filled series
   */
  public Series fillNullBackward() {
    // -1 projects to null, i.e. no valid value encountered yet
    int lastValueIndex = -1;
    int[] fromIndex = new int[this.size()];
    for (int i = fromIndex.length - 1; i >= 0; i--) {
      if (!isNull(i)) {
        lastValueIndex = i;
      }
      fromIndex[i] = lastValueIndex;
    }
    return this.project(fromIndex);
  }

  /**
   * Returns a copy of the series with all values' indices
   * shifted by {@code offset} positions while
   * leaving the series size unchanged. Values shifted outside to upper (or lower)
   * bounds of the series are dropped. Vacated positions are padded with {@code null}.
   *
   * <br/><b>NOTE:</b> for each value, newIndex = oldIndex + offset
   *
   * @param offset offset to shift values by. Can be positive or negative.
   * @return shifted series copy
   */
  // NOTE: override for performance
  public Series shift(int offset) {
    int[] fromIndex = new int[this.size()];
    int from = 0;

    // leading nulls (positive offset only)
    for (int i = 0; i < Math.min(offset, this.size()); i++) {
      fromIndex[from++] = -1;
    }

    // surviving values
    for (int i = Math.max(offset, 0); i < Math.max(Math.min(this.size() + offset, this.size()), 0); i++) {
      fromIndex[from++] = i - offset;
    }

    // trailing nulls (negative offset only)
    for (int i = Math.max(this.size() + offset, 0); i < this.size(); i++) {
      fromIndex[from++] = -1;
    }

    return this.project(fromIndex);
  }

  /**
   * Returns {@code true} if there are no values in the series. Otherwise returns {@code false}.
   *
   * <br/><b>NOTE:</b> {@code null} values count as elements.
   *
   * @return {@code true} if empty, {@code false} otherwise
   */
  public final boolean isEmpty() {
    return this.size() <= 0;
  }

  /**
   * Returns {@code true} if the series contains at least one {@code null}. Otherwise
   * returns {@code false}.
   *
   * @return {@code true} if at least one value is {@code null}, {@code false} otherwise
   */
  public final boolean hasNull() {
    // stop at the first null rather than counting all non-null values
    for (int i = 0; i < this.size(); i++) {
      if (this.isNull(i)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Returns the number of non-null values in the series.
   *
   * @return count of non-null values
   */
  public final int count() {
    int countNotNull = 0;
    for (int i = 0; i < this.size(); i++) {
      if (!this.isNull(i)) {
        countNotNull++;
      }
    }
    return countNotNull;
  }

  /**
   * Returns a copy of the series containing at maximum the first {@code n} elements of the series.
   * If {@code n} is larger than the series size, the entire series is returned. Additional values
   * to make up the difference between {@code n} and the size are not padded. If {@code n} is
   * negative, an empty series is returned.
   *
   * @param n number of elements
   * @return series copy with at most the first {@code n} elements
   */
  public Series head(int n) {
    // clamp to [0, size] to honor the bounds contract of slice()
    return this.slice(0, Math.max(0, Math.min(n, this.size())));
  }

  /**
   * Returns a copy of the series containing at maximum the last {@code n} elements of the series.
   * If {@code n} is larger than the series size, the entire series is returned. Additional values
   * to make up the difference between {@code n} and the size are not padded. If {@code n} is
   * negative, an empty series is returned.
   *
   * @param n number of elements
   * @return series copy with at most the last {@code n} elements
   */
  public Series tail(int n) {
    int len = this.size();
    // clamp to [0, size] to honor the bounds contract of slice()
    int count = Math.max(0, Math.min(n, len));
    return this.slice(len - count, len);
  }

  /**
   * Returns a copy of the series omitting any elements before index {@code from}.
   * If {@code from} is {@code 0} (or negative), the entire series is returned. If {@code from}
   * is equal to or greater than the series size, an empty series is returned.
   *
   * @param from start index of copy (inclusive)
   * @return series copy with elements from index {@code from}.
   */
  public Series sliceFrom(int from) {
    int size = this.size();
    // clamp to [0, size] so that slice() never receives from > to
    return this.slice(Math.min(Math.max(from, 0), size), size);
  }

  /**
   * Returns a copy of the series omitting any elements equal to or after index {@code to}.
   * If {@code to} is equal or greater than the series size, the entire series is returned.
   * If {@code to} is {@code 0} (or negative), an empty series is returned.
   *
   * @param to end index of copy (exclusive)
   * @return series copy with elements before index {@code to}.
   */
  public Series sliceTo(int to) {
    // clamp to [0, size] to honor the bounds contract of slice()
    return this.slice(0, Math.max(0, Math.min(to, this.size())));
  }

  /**
   * Returns a copy of the series with elements in reverse order from the original series.
   *
   * @return reversed series
   */
  public Series reverse() {
    int[] fromIndex = new int[this.size()];
    for (int i = 0; i < fromIndex.length; i++) {
      fromIndex[i] = fromIndex.length - i - 1;
    }
    return this.project(fromIndex);
  }

  /**
   * Returns a copy of the series with each distinct value of the
   * source series appearing exactly once. The values are further sorted in ascending order.
   *
   * <br/><b>NOTE:</b> a series with at most one element is returned as-is (same instance).
   *
   * @return sorted series copy with distinct unique values
   */
  public Series unique() {
    if (this.size() <= 1) {
      return this;
    }

    Series sorted = this.sorted();

    // keep the first element of every run of equal values
    List<Integer> indices = new ArrayList<>();
    indices.add(0);
    for (int i = 1; i < this.size(); i++) {
      if (sorted.compare(sorted, i - 1, i) != 0) {
        indices.add(i);
      }
    }

    int[] fromIndex = ArrayUtils.toPrimitive(indices.toArray(new Integer[indices.size()]));
    return sorted.project(fromIndex);
  }

  /**
   * Returns a copy of the series omitting any {@code null} values.
   *
   * @return series copy without {@code nulls}
   */
  public Series dropNull() {
    int[] fromIndex = new int[this.size()];
    int count = 0;
    for (int i = 0; i < fromIndex.length; i++) {
      if (!isNull(i)) {
        fromIndex[count++] = i;
      }
    }
    return this.project(Arrays.copyOf(fromIndex, count));
  }

  /**
   * Returns a BooleanSeries which contains a value indicating the null-equivalence for each
   * value in the original series (this).
   *
   * @return boolean series indicating null-equivalence of each value
   */
  public BooleanSeries isNull() {
    byte[] values = new byte[this.size()];
    for (int i = 0; i < values.length; i++) {
      values[i] = BooleanSeries.valueOf(this.isNull(i));
    }
    return BooleanSeries.buildFrom(values);
  }

  /**
   * Returns a copy of the series with values replaced by {@code null} for every row in
   * the result of applying {@code conditional} to the series that is not {@code true}.
   *
   * @param conditional conditional to apply and filter by
   * @return filtered series copy
   */
  public Series filter(Conditional conditional) {
    return this.filter(this.map(conditional));
  }

  //
  // NOTE: co-variant method messiness
  //

  /**
   * Applies {@code function} to the series row by row and returns the results as a new series.
   * If the series' native types do not match the required input type of {@code function},
   * the series are converted transparently. The native type of the returned series is
   * determined by {@code function}'s output type.
   *
   * @param function function to apply to each row
   * @param series series to apply function to
   * @throws IllegalArgumentException if {@code function} implements none of the known function types
   * @return series with evaluation results
   */
  public static Series map(Function function, Series... series) {
    if (function instanceof DoubleFunction) {
      return DoubleSeries.map((DoubleFunction) function, series);
    } else if (function instanceof LongFunction) {
      return LongSeries.map((LongFunction) function, series);
    } else if (function instanceof StringFunction) {
      return StringSeries.map((StringFunction) function, series);
    } else if (function instanceof BooleanFunction) {
      return BooleanSeries.map((BooleanFunction) function, series);
    } else if (function instanceof BooleanFunctionEx) {
      return BooleanSeries.map((BooleanFunctionEx) function, series);
    } else if (function instanceof DoubleConditional) {
      return DoubleSeries.map((DoubleConditional) function, series);
    } else if (function instanceof LongConditional) {
      return LongSeries.map((LongConditional) function, series);
    } else if (function instanceof StringConditional) {
      return StringSeries.map((StringConditional) function, series);
    } else if (function instanceof BooleanConditional) {
      return BooleanSeries.map((BooleanConditional) function, series);
    }
    throw new IllegalArgumentException(String.format("Unknown function type '%s'", function.getClass()));
  }

  /**
   * Applies {@code function} to the series row by row and returns the results as a new series.
   * If the series' native type does not match the required input type of {@code function},
   * the series is converted transparently. The native type of the returned series is
   * determined by {@code function}'s output type.
   *
   * @param function function to map to each element in the series
   * @return series with evaluation results
   */
  public final Series map(Function function) {
    return map(function, this);
  }

  /**
   * @see Series#map(Function)
   */
  public final DoubleSeries map(DoubleFunction function) {
    return (DoubleSeries) map(function, this);
  }

  /**
   * @see Series#map(Function)
   */
  public final LongSeries map(LongFunction function) {
    return (LongSeries) map(function, this);
  }

  /**
   * @see Series#map(Function)
   */
  public final StringSeries map(StringFunction function) {
    return (StringSeries) map(function, this);
  }

  /**
   * @see Series#map(Function)
   */
  public final BooleanSeries map(BooleanFunction function) {
    return (BooleanSeries) map(function, this);
  }

  /**
   * @see Series#map(Function)
   */
  public final BooleanSeries map(BooleanFunctionEx function) {
    return (BooleanSeries) map(function, this);
  }

  /**
   * @see Series#map(Function)
   */
  public final BooleanSeries map(Conditional conditional) {
    return (BooleanSeries) map(conditional, this);
  }

  //
  // NOTE: co-variant method messiness
  //

  /**
   * Applies {@code function} as aggregation function to all values in the series at once and
   * returns the result as a new series with a single element.
   * If the series' native type does not match the required input type of {@code function},
   * the series is converted transparently. The native type of the returned series is
   * determined by {@code function}'s output type.
   *
   * @param function aggregation function to map to the series
   * @throws IllegalArgumentException if {@code function} implements none of the known function types
   * @return single element series
   */
  public final Series aggregate(Function function) {
    if (function instanceof DoubleFunction) {
      return DoubleSeries.aggregate((DoubleFunction) function, this);
    } else if (function instanceof LongFunction) {
      return LongSeries.aggregate((LongFunction) function, this);
    } else if (function instanceof StringFunction) {
      return StringSeries.aggregate((StringFunction) function, this);
    } else if (function instanceof BooleanFunction) {
      return BooleanSeries.aggregate((BooleanFunction) function, this);
    } else if (function instanceof BooleanFunctionEx) {
      return BooleanSeries.aggregate((BooleanFunctionEx) function, this);
    } else if (function instanceof DoubleConditional) {
      return DoubleSeries.aggregate((DoubleConditional) function, this);
    } else if (function instanceof LongConditional) {
      return LongSeries.aggregate((LongConditional) function, this);
    } else if (function instanceof StringConditional) {
      return StringSeries.aggregate((StringConditional) function, this);
    } else if (function instanceof BooleanConditional) {
      return BooleanSeries.aggregate((BooleanConditional) function, this);
    }
    throw new IllegalArgumentException(String.format("Unknown function type '%s'", function.getClass()));
  }

  /**
   * @see Series#aggregate(Function)
   */
  public final DoubleSeries aggregate(DoubleFunction function) {
    return (DoubleSeries) this.aggregate((Function) function);
  }

  /**
   * @see Series#aggregate(Function)
   */
  public final LongSeries aggregate(LongFunction function) {
    return (LongSeries) this.aggregate((Function) function);
  }

  /**
   * @see Series#aggregate(Function)
   */
  public final StringSeries aggregate(StringFunction function) {
    return (StringSeries) this.aggregate((Function) function);
  }

  /**
   * @see Series#aggregate(Function)
   */
  public final BooleanSeries aggregate(BooleanFunction function) {
    return (BooleanSeries) this.aggregate((Function) function);
  }

  /**
   * @see Series#aggregate(Function)
   */
  public final BooleanSeries aggregate(BooleanFunctionEx function) {
    return (BooleanSeries) this.aggregate((Function) function);
  }

  /**
   * @see Series#aggregate(Function)
   */
  public final BooleanSeries aggregate(Conditional conditional) {
    return (BooleanSeries) this.aggregate((Function) conditional);
  }

  /**
   * Returns a SeriesGrouping based on value. Elements are grouped into separate buckets for each
   * distinct value in the series.
   *
   * <br/><b>NOTE:</b> the resulting keys are equivalent to calling {@code unique()} on the series.
   *
   * @return grouping by value
   */
  public final SeriesGrouping groupByValue() {
    if (this.isEmpty()) {
      return new SeriesGrouping(this);
    }

    List<Bucket> buckets = new ArrayList<>();
    int[] sref = this.sortedIndex();

    // every change of value in sorted order closes the current bucket
    int bucketOffset = 0;
    for (int i = 1; i < sref.length; i++) {
      if (this.compare(this, sref[i - 1], sref[i]) != 0) {
        int[] fromIndex = Arrays.copyOfRange(sref, bucketOffset, i);
        buckets.add(new Bucket(fromIndex));
        bucketOffset = i;
      }
    }

    // remaining elements form the last bucket
    int[] fromIndex = Arrays.copyOfRange(sref, bucketOffset, sref.length);
    buckets.add(new Bucket(fromIndex));

    // keys from buckets
    int[] keyIndex = new int[buckets.size()];
    int i = 0;
    for (Bucket b : buckets) {
      keyIndex[i++] = b.fromIndex[0];
    }

    return new SeriesGrouping(this.project(keyIndex), this, buckets);
  }

  /**
   * Returns a SeriesGrouping based on element count per buckets. Elements are grouped into buckets
   * based on a greedy algorithm with fixed bucket size. The size of all buckets (except for the
   * last) is guaranteed to be equal to {@code bucketSize}.
   *
   * @param bucketSize maximum number of elements per bucket
   * @throws IllegalArgumentException if {@code bucketSize} is not greater than 0
   * @return grouping by element count
   */
  public final SeriesGrouping groupByCount(int bucketSize) {
    if (bucketSize <= 0) {
      throw new IllegalArgumentException("bucketSize must be greater than 0");
    }
    if (this.isEmpty()) {
      return new SeriesGrouping(this);
    }

    int size = this.size();
    bucketSize = Math.min(bucketSize, size);

    int numBuckets = (size - 1) / bucketSize + 1;
    long[] keys = new long[numBuckets];
    List<Bucket> buckets = new ArrayList<>();
    for (int i = 0; i < numBuckets; i++) {
      int from = i * bucketSize;
      // same as min((i+1)*bucketSize, size), but cannot overflow for very large series
      int to = from + Math.min(bucketSize, size - from);
      int[] fromIndex = new int[to - from];
      for (int j = 0; j < fromIndex.length; j++) {
        fromIndex[j] = j + from;
      }
      buckets.add(new Bucket(fromIndex));
      keys[i] = i;
    }

    return new SeriesGrouping(DataFrame.toSeries(keys), this, buckets);
  }

  /**
   * Returns a SeriesGrouping based on a fixed number of buckets. Elements are grouped into buckets
   * based on a greedy algorithm to approximately evenly fill buckets. The number of buckets
   * is guaranteed to be equal to {@code partitionCount} even if some remain empty.
   *
   * <br/><b>NOTE:</b> an empty series yields an empty grouping without any buckets.
   *
   * @param partitionCount number of buckets
   * @throws IllegalArgumentException if {@code partitionCount} is not greater than 0
   * @return grouping by bucket count
   */
  public final SeriesGrouping groupByPartitions(int partitionCount) {
    if (partitionCount <= 0) {
      throw new IllegalArgumentException("partitionCount must be greater than 0");
    }
    if (this.isEmpty()) {
      return new SeriesGrouping(this);
    }

    double perPartition = this.size() / (double) partitionCount;

    long[] keys = new long[partitionCount];
    List<Bucket> buckets = new ArrayList<>();
    for (int i = 0; i < partitionCount; i++) {
      // rounded fractional boundaries distribute the remainder across buckets
      int from = (int) Math.round(i * perPartition);
      int to = (int) Math.round((i + 1) * perPartition);
      int[] fromIndex = new int[to - from];
      for (int j = 0; j < fromIndex.length; j++) {
        fromIndex[j] = j + from;
      }
      buckets.add(new Bucket(fromIndex));
      keys[i] = i;
    }

    return new SeriesGrouping(DataFrame.toSeries(keys), this, buckets);
  }

  /**
   * Returns an (overlapping) SeriesGrouping based on a moving window size. Elements are grouped
   * into overlapping buckets in sequences of {@code windowSize} consecutive items. The number
   * of buckets is guaranteed to be equal to {@code series_size - moving_window_size + 1}, or
   * 0 if the window size is greater than the series size.
   *
   * @param windowSize size of moving window
   * @throws IllegalArgumentException if {@code windowSize} is not greater than 0
   * @return grouping by moving window
   */
  public final SeriesGrouping groupByMovingWindow(int windowSize) {
    if (windowSize <= 0) {
      throw new IllegalArgumentException("windowSize must be greater than 0");
    }
    if (this.size() < windowSize) {
      return new SeriesGrouping(this);
    }

    int windowCount = this.size() - windowSize + 1;
    long[] keys = new long[windowCount];
    List<Bucket> buckets = new ArrayList<>();
    for (int i = 0; i < windowCount; i++) {
      keys[i] = i;
      int[] fromIndex = new int[windowSize];
      for (int j = 0; j < windowSize; j++) {
        fromIndex[j] = i + j;
      }
      buckets.add(new Bucket(fromIndex));
    }

    return new SeriesGrouping(DataFrame.toSeries(keys), this, buckets);
  }

  /**
   * Returns a concatenation of {@code series} as a new series with a native type equal
   * to the first series. If subsequent series have different native types they are
   * converted transparently.
   *
   * @param series series to concatenate
   * @throws IllegalArgumentException if no series is provided
   * @return concatenated series
   */
  public static Series concatenate(Series... series) {
    if (series.length <= 0) {
      throw new IllegalArgumentException("Must concatenate at least one series");
    }

    Series first = series[0];
    Series[] rest = Arrays.copyOfRange(series, 1, series.length);

    return first.append(rest);
  }

  /* *************************************************************************
   * Internal interface
   * *************************************************************************/

  /**
   * Returns index tuples (pairs) for a join performed based on value.
   *
   * <br/><b>NOTE:</b> the implementation uses merge join. Thus, the index pairs reference
   * values in ascending order.
   * <br/><b>NOTE:</b> an index of {@code -1} in a pair marks the side without a match.
   *
   * @see Series#compare(Series, int, int)
   *
   * @param other series to match values against
   * @param type type of join to perform
   * @return list of index pairs for join
   */
  List<JoinPair> join(Series other, JoinType type) {
    // NOTE: merge join
    int[] lref = this.sortedIndex();
    int[] rref = other.sortedIndex();

    List<JoinPair> pairs = new ArrayList<>();
    int i = 0;
    int j = 0;
    while (i < this.size() || j < other.size()) {
      if (j >= other.size() || (i < this.size() && this.compare(other, lref[i], rref[j]) < 0)) {
        // left value without match on the right
        switch (type) {
          case LEFT:
          case OUTER:
            pairs.add(new JoinPair(lref[i], -1));
            break;
          default:
            break;
        }
        i++;

      } else if (i >= this.size() || (j < other.size() && this.compare(other, lref[i], rref[j]) > 0)) {
        // right value without match on the left
        switch (type) {
          case RIGHT:
          case OUTER:
            pairs.add(new JoinPair(-1, rref[j]));
            break;
          default:
            break;
        }
        j++;

      } else if (i < this.size() && j < other.size()) {
        // generate cross product

        // count similar values on the left
        int lcount = 1;
        while (i + lcount < this.size() && this.compare(this, lref[i + lcount], lref[i + lcount - 1]) == 0) {
          lcount++;
        }

        // count similar values on the right
        int rcount = 1;
        while (j + rcount < other.size() && other.compare(other, rref[j + rcount], rref[j + rcount - 1]) == 0) {
          rcount++;
        }

        for (int l = 0; l < lcount; l++) {
          for (int r = 0; r < rcount; r++) {
            pairs.add(new JoinPair(lref[i + l], rref[j + r]));
          }
        }

        i += lcount;
        j += rcount;
      }
    }

    return pairs;
  }

  /* **************************************************************************
   * Code grave
   ***************************************************************************/

  // NOTE: too slow
//  public Series sorted() {
//    return this.project(this.sortedIndex());
//  }

  // NOTE: too slow
//  int[] sortedIndex() {
//    Integer[] fromIndex = new Integer[this.size()];
//    for(int i=0; i<this.size(); i++)
//      fromIndex[i] = i;
//
//    final Series s = this;
//    Arrays.sort(fromIndex, new Comparator<Integer>() {
//      @Override
//      public int compare(Integer o1, Integer o2) {
//        return s.compare(s, o1, o2);
//      }
//    });
//
//    return ArrayUtils.toPrimitive(fromIndex);
//  }
}