package com.linkedin.thirdeye.dataframe;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import org.apache.commons.lang.ArrayUtils;
/**
* Container for a one-dimensional series of elements with a common primitive type.
* Supports transparent conversion between different primitive types and implements
* common logic for element management, transformation and aggregation.
*
* Series are designed to be immutable (albeit with some limitations due to Java's
* primitive array model). Operations return new Series instances without modifying
* the underlying data structures.
*/
public abstract class Series {
// Names of the key and value series in the DataFrame produced by SeriesGrouping.makeAggregate()
public static final String GROUP_KEY = "key";
public static final String GROUP_VALUE = "value";
// NOTE(review): not referenced in this file; presumably the text emitted for null values by
// toString(int) implementations - confirm against the subclasses
public static final String TOSTRING_NULL = "null";
/**
 * Native element types of a series. Reported by {@code type()} and accepted by
 * {@code get(SeriesType)} as the target type of a conversion.
 */
public enum SeriesType {
DOUBLE,
LONG,
STRING,
BOOLEAN
}
/**
 * Join strategies understood by {@code join(Series, JoinType)}. Matching values always
 * produce pairs; {@code LEFT} and {@code OUTER} additionally keep unmatched left values,
 * {@code RIGHT} and {@code OUTER} additionally keep unmatched right values.
 */
enum JoinType {
INNER,
OUTER,
LEFT,
RIGHT
}
/**
 * Top-level marker interface for functions that may be applied to one (or multiple) series.
 * Functions may be applied either row-by-row across multiple series or to all values within
 * a single series.
 * <br/><b>NOTE:</b> Functions MAY NOT receive a {@code null} value as an input. Rather, if
 * any one of the input values is {@code null}, the result is set to {@code null} by the
 * Series framework.
 * <br/><b>NOTE:</b> Functions MAY return {@code null} as a result, however.
 * <br/><b>NOTE:</b> {@code map()} and {@code aggregate()} select the implementation by testing
 * for the typed sub-interfaces declared in this class and throw an
 * {@code IllegalArgumentException} for any other implementation.
 */
public interface Function {
// left blank
}
/**
 * Marker interface for functions that evaluate to a boolean. {@code map(Conditional)} and
 * {@code aggregate(Conditional)} expose their results as BooleanSeries.
 */
public interface Conditional extends Function {
// left blank
}
// @FunctionalInterface
/**
 * Conditional evaluated on double values. Dispatched to DoubleSeries by {@code map()} and
 * {@code aggregate()}.
 */
public interface DoubleConditional extends Conditional {
boolean apply(double... values);
}
// @FunctionalInterface
/**
 * Conditional evaluated on long values. Dispatched to LongSeries by {@code map()} and
 * {@code aggregate()}.
 */
public interface LongConditional extends Conditional {
boolean apply(long... values);
}
// @FunctionalInterface
/**
 * Conditional evaluated on String values. Dispatched to StringSeries by {@code map()} and
 * {@code aggregate()}.
 */
public interface StringConditional extends Conditional {
boolean apply(String... values);
}
// @FunctionalInterface
/**
 * Conditional evaluated on boolean values. Dispatched to BooleanSeries by {@code map()} and
 * {@code aggregate()}.
 */
public interface BooleanConditional extends Conditional {
boolean apply(boolean... values);
}
// @FunctionalInterface
/**
 * Function over double values producing a double. Dispatched to DoubleSeries by
 * {@code map()} and {@code aggregate()}.
 */
public interface DoubleFunction extends Function {
// alias of the DoubleSeries null marker, available to implementations
double NULL = DoubleSeries.NULL;
double apply(double... values);
}
// @FunctionalInterface
/**
 * Function over long values producing a long. Dispatched to LongSeries by
 * {@code map()} and {@code aggregate()}.
 */
public interface LongFunction extends Function {
// alias of the LongSeries null marker, available to implementations
long NULL = LongSeries.NULL;
long apply(long... values);
}
// @FunctionalInterface
/**
 * Function over String values producing a String. Dispatched to StringSeries by
 * {@code map()} and {@code aggregate()}.
 */
public interface StringFunction extends Function {
// alias of the StringSeries null marker, available to implementations
String NULL = StringSeries.NULL;
String apply(String... values);
}
// @FunctionalInterface
/**
 * Function over boolean values producing a boolean. Dispatched to BooleanSeries by
 * {@code map()} and {@code aggregate()}. Use {@code BooleanFunctionEx} to work on the
 * byte-encoded values instead.
 */
public interface BooleanFunction extends Function {
boolean apply(boolean... values);
}
// @FunctionalInterface
/**
 * Function over byte-encoded (tri-state) boolean values producing a byte-encoded boolean.
 * Dispatched to BooleanSeries by {@code map()} and {@code aggregate()}.
 */
public interface BooleanFunctionEx extends Function {
// aliases of the BooleanSeries byte encodings, available to implementations
byte TRUE = BooleanSeries.TRUE;
byte FALSE = BooleanSeries.FALSE;
byte NULL = BooleanSeries.NULL;
byte apply(byte... values);
}
/**
 * Holder for the element indices that make up one group of a grouping. The indices
 * refer to positions in the grouped source series.
 */
public static final class Bucket {
  /** positions of the bucket's elements in the source series */
  final int[] fromIndex;

  /**
   * Wraps the given index array (the array is stored as-is, not copied).
   *
   * @param fromIndex positions of the bucket's elements
   */
  Bucket(int[] fromIndex) {
    this.fromIndex = fromIndex;
  }

  /**
   * Returns the number of elements referenced by the bucket.
   *
   * @return element count
   */
  public int size() {
    final int[] indices = this.fromIndex;
    return indices.length;
  }
}
/**
 * Common parent of the type-specific series builders.
 */
public static abstract class Builder {
  /**
   * Creates the series from everything added so far.
   *
   * @return resulting series
   */
  public abstract Series build();

  /**
   * Adds all given series to the builder.
   *
   * @param series series to add
   * @return builder to continue with
   */
  public abstract Builder addSeries(Collection<Series> series);

  /**
   * Array/varargs convenience variant of {@link #addSeries(Collection)}.
   *
   * @param series series to add
   * @return builder to continue with
   */
  public Builder addSeries(Series... series) {
    final Collection<Series> asCollection = Arrays.asList(series);
    return this.addSeries(asCollection);
  }
}
/**
 * Grouping container referencing a single series. Holds group keys and the indices of group
 * elements in the source series. Enables aggregation with custom user functions.
 */
public static final class SeriesGrouping {
  final Series keys;
  final Series source;
  final List<Bucket> buckets;

  /**
   * Creates a grouping of {@code source} with one bucket per key.
   *
   * @param keys key of each group
   * @param source series the bucket indices refer to
   * @param buckets element indices of each group, in the same order as {@code keys}
   * @throws IllegalArgumentException if the number of keys and buckets differs
   */
  SeriesGrouping(Series keys, Series source, List<Bucket> buckets) {
    if (keys.size() != buckets.size()) {
      throw new IllegalArgumentException("key series and bucket count must be equal");
    }
    this.keys = keys;
    this.source = source;
    this.buckets = buckets;
  }

  /**
   * Creates an empty grouping (no keys, no buckets) of {@code source}.
   *
   * @param source series the grouping refers to
   */
  SeriesGrouping(Series source) {
    this.keys = LongSeries.buildFrom();
    this.source = source;
    this.buckets = Collections.emptyList();
  }

  /**
   * Applies index-based groups to a different series. Used by DataFrame for grouping across
   * multiple series.
   *
   * @param s other series
   * @return SeriesGrouping with the same keys and buckets, but {@code s} as source
   */
  SeriesGrouping applyTo(Series s) {
    return new SeriesGrouping(this.keys, s, this.buckets);
  }

  /**
   * Returns the number of groups
   *
   * @return group count
   */
  public int size() {
    return this.keys.size();
  }

  /**
   * Returns the keys of each group in the container as series.
   *
   * @return key series
   */
  public Series keys() {
    return this.keys;
  }

  /**
   * Returns the source series this grouping applies to.
   *
   * @return source series
   */
  public Series source() {
    return this.source;
  }

  /**
   * Returns {@code true} if the grouping container does not hold any groups.
   *
   * @return {@code true} if empty, {@code false} otherwise.
   */
  public boolean isEmpty() {
    return this.keys.isEmpty();
  }

  /**
   * Applies {@code function} as aggregation function to all values per group and
   * returns the result as a new DataFrame with the number of elements equal to the size
   * of the key series.
   * If the series' native types do not match the required input type of {@code function},
   * the series are converted transparently. The native type of the aggregated series is
   * determined by {@code function}'s output type.
   *
   * <br/><b>NOTE:</b> a grouping without buckets yields an empty value series of the source's
   * native type, as there is no aggregation result to take the type from.
   *
   * @param function aggregation function to map to each grouped series
   * @return DataFrame holding the group keys ({@code GROUP_KEY}) and aggregates ({@code GROUP_VALUE})
   */
  public DataFrame aggregate(Function function) {
    // The builder is taken from the first aggregation result rather than from the source,
    // otherwise results would be coerced back into the source's native type
    // (e.g. a double-valued aggregate of a long series would lose its fraction).
    Builder builder = null;
    for (Bucket b : this.buckets) {
      Series aggregated = this.source.project(b.fromIndex).aggregate(function);
      if (builder == null) {
        builder = aggregated.getBuilder();
      }
      builder.addSeries(aggregated);
    }
    if (builder == null) {
      builder = this.source.getBuilder();
    }
    return makeAggregate(this.keys, builder.build());
  }

  /**
   * Counts the number of elements in each group and returns the result as a new DataFrame
   * with the number of elements equal to the size of the key series.
   *
   * @return DataFrame holding the group keys ({@code GROUP_KEY}) and counts ({@code GROUP_VALUE})
   */
  public DataFrame count() {
    LongSeries.Builder builder = LongSeries.builder();
    for (Bucket b : this.buckets) {
      builder.addValues(b.size());
    }
    return makeAggregate(this.keys, builder.build());
  }

  /**
   * Combines keys and aggregated values into a two-series DataFrame.
   *
   * @param keys series stored as {@code GROUP_KEY}
   * @param values series stored as {@code GROUP_VALUE}
   * @return DataFrame with key and value series
   */
  static DataFrame makeAggregate(Series keys, Series values) {
    DataFrame df = new DataFrame();
    df.addSeries(GROUP_KEY, keys);
    df.addSeries(GROUP_VALUE, values);
    return df;
  }
}
/**
 * Pair of positions, one into the left and one into the right series, as emitted by the
 * join logic. Two pairs are equal if both positions are equal.
 */
static final class JoinPair {
  final int left;
  final int right;

  /**
   * @param left position in the left series
   * @param right position in the right series
   */
  public JoinPair(int left, int right) {
    this.left = left;
    this.right = right;
  }

  @Override
  public boolean equals(Object o) {
    if (o == this) {
      return true;
    }
    // the class is final, so an instanceof test matches exactly the same objects
    // as a getClass() comparison (and rejects null)
    if (!(o instanceof JoinPair)) {
      return false;
    }
    final JoinPair that = (JoinPair) o;
    if (this.left != that.left) {
      return false;
    }
    return this.right == that.right;
  }

  @Override
  public int hashCode() {
    return 31 * this.left + this.right;
  }
}
/* *************************************************************************
* Public abstract interface
* *************************************************************************/
/**
 * Returns the number of elements contained in the series.
 *
 * <br/><b>NOTE:</b> {@code null} values count as elements. Use {@code count()} for the
 * number of non-null values.
 *
 * @return series size
 */
public abstract int size();
/**
 * Returns the series' native type.
 *
 * @return series type, one of the {@code SeriesType} constants
 */
public abstract SeriesType type();
/**
 * Slices the series from index {@code from} (inclusive) to index {@code to}
 * (exclusive) and returns the result as a series of the same native type.
 *
 * <br/><b>NOTE:</b> {@code copy()}, {@code head()}, {@code tail()}, {@code sliceFrom()} and
 * {@code sliceTo()} are implemented on top of this method.
 *
 * @param from start index (inclusive), must be >= 0
 * @param to end index (exclusive), must be <= size
 * @return sliced series copy
 */
public abstract Series slice(int from, int to);
/**
 * Returns the value referenced by {@code index} as double. The value is converted
 * transparently if the native type of the underlying series is different. The
 * {@code index} must lie within the bounds of the series.
 *
 * @param index index of value
 * @throws IndexOutOfBoundsException if index is outside the series bounds
 * @return double value
 */
public abstract double getDouble(int index);
/**
 * Returns the value referenced by {@code index} as long. The value is converted
 * transparently if the native type of the underlying series is different. The
 * {@code index} must lie within the bounds of the series.
 *
 * @param index index of value
 * @throws IndexOutOfBoundsException if index is outside the series bounds
 * @return long value
 */
public abstract long getLong(int index);
/**
 * Returns the value referenced by {@code index} as byte (tri-state boolean).
 * The value is converted transparently if the native type of the underlying
 * series is different. The {@code index} must lie within the bounds of the series.
 *
 * @param index index of value
 * @throws IndexOutOfBoundsException if index is outside the series bounds
 * @return byte-encoded boolean value
 */
public abstract byte getBoolean(int index);
/**
 * Returns the value referenced by {@code index} as String. The value is converted
 * transparently if the native type of the underlying series is different. The
 * {@code index} must lie within the bounds of the series.
 *
 * @param index index of value
 * @throws IndexOutOfBoundsException if index is outside the series bounds
 * @return string value
 */
public abstract String getString(int index);
/**
 * Returns {@code true} if the value referenced by {@code index} is null. Otherwise,
 * returns {@code false}. See {@code isNull()} (no arguments) for the series-wide variant.
 *
 * @param index index of value
 * @throws IndexOutOfBoundsException if index is outside the series bounds
 * @return {@code true} if value is null, otherwise {@code false}
 */
public abstract boolean isNull(int index);
/**
 * Returns a human-readable String representation of the value referenced by {@code index}.
 * In contrast to {@code getString(int)} the result is meant for display, not for conversion.
 *
 * @param index index of value
 * @throws IndexOutOfBoundsException if index is outside the series bounds
 * @return human-readable string representation
 */
public abstract String toString(int index);
/**
 * Returns a copy of the series with values ordered in ascending order.
 *
 * <br/><b>NOTE:</b> BooleanSeries interprets {@code false} as smaller than {@code true}.
 * <br/><b>NOTE:</b> {@code unique()} relies on equal values being adjacent in the result.
 *
 * @return sorted series copy
 */
public abstract Series sorted();
/**
 * Returns a copy of the series with {@code null} values replaced by the series' default
 * value. See {@code fillNullForward()} and {@code fillNullBackward()} for filling with
 * neighboring values instead.
 *
 * @return series copy with filled nulls
 */
public abstract Series fillNull();
/**
 * Returns a new builder instance for the native type encapsulated by this series.
 * Series added to the builder end up in a series of this native type (see {@code append()}).
 *
 * @return series builder
 */
public abstract Builder getBuilder();
/**
 * Returns a copy of the series with values replaced by {@code null} for every row in
 * {@code mask} that is not {@code true}.
 *
 * @param mask series to filter by
 * @return filtered series copy
 */
public abstract Series filter(BooleanSeries mask);
/* *************************************************************************
* Internal abstract interface
* *************************************************************************/
/**
 * Returns projection of the series, i.e. a series holding, for each entry of
 * {@code fromIndex}, the value found at that index in this series.
 *
 * <br/><b>NOTE:</b> fromIndex <= -1 is filled with {@code null}.
 * <br/><b>NOTE:</b> array with length 0 returns empty series.
 * <br/><b>NOTE:</b> could replace {@code slice(int, int)}, but low performance
 *
 * @param fromIndex array with indices to project from (must be <= series size)
 * @return series projection
 */
abstract Series project(int[] fromIndex);
/**
 * Compares values across two series with potentially different native types based on index.
 * If the types are different the values in {@code that} are transparently converted to the
 * native type of this series.
 *
 * <br/><b>Note:</b> the transparent conversion may cause different behavior between
 * {@code this.compare(that)} and {@code that.compare(this)}.
 *
 * @param that other series to compare against (may reference itself)
 * @param indexThis index in this series
 * @param indexThat index in the other series
 * @return 0 if the referenced values are equal, -1 if {@code this} is less than {@code that}, 1 otherwise
 */
abstract int compare(Series that, int indexThis, int indexThat);
/**
 * Returns an array of indices with a size equal to the series size, such that the values
 * referenced by the indices are sorted in ascending order.
 *
 * <br/><b>NOTE:</b> output can be used directly by {@code project()} to create a sorted copy of the series.
 *
 * @return indices of sorted values
 */
abstract int[] sortedIndex();
/* *************************************************************************
* Public interface
* *************************************************************************/
/**
 * Converts this series to the requested native type by delegating to the matching
 * typed accessor ({@code getDoubles()}, {@code getLongs()}, {@code getBooleans()} or
 * {@code getStrings()}).
 *
 * @param type target type
 * @throws IllegalArgumentException if the type is not supported
 * @return series of the requested type
 */
public final Series get(Series.SeriesType type) {
  final Series converted;
  switch (type) {
    case STRING:
      converted = this.getStrings();
      break;
    case BOOLEAN:
      converted = this.getBooleans();
      break;
    case LONG:
      converted = this.getLongs();
      break;
    case DOUBLE:
      converted = this.getDoubles();
      break;
    default:
      throw new IllegalArgumentException(String.format("Unknown series type '%s'", type));
  }
  return converted;
}
/**
 * Returns the series as DoubleSeries, reading every element via {@code getDouble(int)}
 * so that other native types are converted transparently.
 *
 * @return DoubleSeries equivalent
 */
public DoubleSeries getDoubles() {
  final double[] converted = new double[this.size()];
  int pos = 0;
  while (pos < converted.length) {
    converted[pos] = this.getDouble(pos);
    pos++;
  }
  return DoubleSeries.buildFrom(converted);
}
/**
 * Returns the series as LongSeries, reading every element via {@code getLong(int)}
 * so that other native types are converted transparently.
 *
 * @return LongSeries equivalent
 */
public LongSeries getLongs() {
  final long[] converted = new long[this.size()];
  int pos = 0;
  while (pos < converted.length) {
    converted[pos] = this.getLong(pos);
    pos++;
  }
  return LongSeries.buildFrom(converted);
}
/**
 * Returns the series as BooleanSeries, reading every element via {@code getBoolean(int)}
 * so that other native types are converted transparently.
 *
 * @return BooleanSeries equivalent
 */
public BooleanSeries getBooleans() {
  final byte[] converted = new byte[this.size()];
  int pos = 0;
  while (pos < converted.length) {
    converted[pos] = this.getBoolean(pos);
    pos++;
  }
  return BooleanSeries.buildFrom(converted);
}
/**
 * Returns the series as StringSeries, reading every element via {@code getString(int)}
 * so that other native types are converted transparently.
 *
 * @return StringSeries equivalent
 */
public StringSeries getStrings() {
  final String[] converted = new String[this.size()];
  int pos = 0;
  while (pos < converted.length) {
    converted[pos] = this.getString(pos);
    pos++;
  }
  return StringSeries.buildFrom(converted);
}
/**
 * Returns a copy of the series with the same native type, obtained by slicing the
 * full index range.
 *
 * @return series copy
 */
public Series copy() {
  final int end = this.size();
  return this.slice(0, end);
}
/**
 * Returns a new series holding the values of this series followed by the values of each
 * series in {@code other}, in the given order. The result is assembled with this series'
 * builder; values of a different native type are converted transparently.
 *
 * <br/><b>NOTE:</b> newSize = oldSize + otherSize
 *
 * @param other other series to append at the end
 * @return concatenated series
 */
public Series append(Series... other) {
  Builder builder = this.getBuilder();
  builder = builder.addSeries(this);
  builder = builder.addSeries(other);
  return builder.build();
}
/**
 * Replaces each {@code null} with the closest non-null value at a lower index. Leading
 * nulls, which have no such predecessor, stay {@code null}.
 *
 * @return forward filled series
 */
public Series fillNullForward() {
  final int[] sources = new int[this.size()];
  // position of the most recent non-null value, -1 (projected as null) while none was seen
  int carry = -1;
  for (int pos = 0; pos < sources.length; pos++) {
    carry = this.isNull(pos) ? carry : pos;
    sources[pos] = carry;
  }
  return this.project(sources);
}
/**
 * Replaces each {@code null} with the closest non-null value at a higher index. Trailing
 * nulls, which have no such successor, stay {@code null}.
 *
 * @return backward filled series
 */
public Series fillNullBackward() {
  final int[] sources = new int[this.size()];
  // position of the nearest non-null value seen so far, -1 (projected as null) while none was seen
  int carry = -1;
  int pos = sources.length;
  while (pos > 0) {
    pos--;
    carry = this.isNull(pos) ? carry : pos;
    sources[pos] = carry;
  }
  return this.project(sources);
}
/**
 * Returns a copy of the series with all values' indices
 * shifted by {@code offset} positions while
 * leaving the series size unchanged. Values shifted outside to upper (or lower)
 * bounds of the series are dropped. Vacated positions are padded with {@code null}.
 *
 * <br/><b>NOTE:</b> for each value, newIndex = oldIndex + offset
 * <br/><b>NOTE:</b> any offset is accepted; if its magnitude is equal to or greater than
 * the series size, the result consists of {@code null} values only.
 *
 * @param offset offset to shift values by. Can be positive or negative.
 * @return shifted series copy
 */
// NOTE: override for performance
public Series shift(int offset) {
  final int size = this.size();
  final int[] fromIndex = new int[size];
  for (int i = 0; i < size; i++) {
    // computed as long: "i - offset" (like "size + offset") overflows int for extreme offsets
    final long source = (long) i - offset;
    // -1 is projected as null
    fromIndex[i] = (source >= 0 && source < size) ? (int) source : -1;
  }
  return this.project(fromIndex);
}
/**
 * Tells whether the series holds no elements at all.
 *
 * <br/><b>NOTE:</b> {@code null} values count as elements.
 *
 * @return {@code true} if empty, {@code false} otherwise
 */
public final boolean isEmpty() {
  final int elements = this.size();
  return !(elements > 0);
}
/**
 * Returns {@code true} if the series contains at least one {@code null}. Otherwise
 * returns {@code false}.
 *
 * @return {@code true} if any value is null, {@code false} otherwise
 */
public final boolean hasNull() {
  // stop at the first null instead of counting all non-null values
  for (int i = 0; i < this.size(); i++) {
    if (this.isNull(i)) {
      return true;
    }
  }
  return false;
}
/**
 * Counts the elements of the series that are not {@code null}.
 *
 * @return count of non-null values
 */
public final int count() {
  int present = 0;
  int pos = 0;
  while (pos < this.size()) {
    present += this.isNull(pos) ? 0 : 1;
    pos++;
  }
  return present;
}
/**
 * Returns a copy of the leading {@code n} elements of the series. If the series holds
 * fewer than {@code n} elements, all of them are returned; the result is never padded.
 *
 * @param n number of elements
 * @return series copy with at most the first {@code n} elements
 */
public Series head(int n) {
  final int end = Math.min(n, this.size());
  return this.slice(0, end);
}
/**
 * Returns a copy of the trailing {@code n} elements of the series. If the series holds
 * fewer than {@code n} elements, all of them are returned; the result is never padded.
 *
 * @param n number of elements
 * @return series copy with at most the last {@code n} elements
 */
public Series tail(int n) {
  final int end = this.size();
  final int taken = Math.min(n, end);
  final int start = end - taken;
  return this.slice(start, end);
}
/**
 * Returns a copy of the series omitting any elements before index {@code from}.
 * If {@code from} is {@code 0} or negative, the entire series is returned. If {@code from}
 * is equal to or greater than the series size, an empty series is returned.
 *
 * @param from start index of copy (inclusive)
 * @return series copy with elements from index {@code from}.
 */
public Series sliceFrom(int from) {
  final int size = this.size();
  // clamp into [0, size] so that slice() never receives a start beyond its end
  final int start = Math.max(Math.min(from, size), 0);
  return this.slice(start, size);
}
/**
 * Returns a copy of the series omitting any elements equal to or after index {@code to}.
 * If {@code to} is equal or greater than the series size, the entire series is returned.
 * If {@code to} is {@code 0} or negative, an empty series is returned.
 *
 * @param to end index of copy (exclusive)
 * @return series copy with elements before index {@code to}.
 */
public Series sliceTo(int to) {
  // clamp into [0, size] so that slice() never receives an end before its start
  final int end = Math.max(Math.min(to, this.size()), 0);
  return this.slice(0, end);
}
/**
 * Returns a copy of the series whose first element is the original's last element,
 * whose second element is the original's second to last, and so on.
 *
 * @return reversed series
 */
public Series reverse() {
  final int[] order = new int[this.size()];
  int source = order.length - 1;
  for (int target = 0; target < order.length; target++) {
    order[target] = source;
    source--;
  }
  return this.project(order);
}
/**
 * Returns a copy of the series with each distinct value of the
 * source series appearing exactly once. The values are further sorted in ascending order.
 *
 * <br/><b>NOTE:</b> a series with at most one element is returned as-is.
 *
 * @return sorted series copy with distinct unique values
 */
public Series unique() {
  if (this.size() <= 1) {
    return this;
  }

  final Series sorted = this.sorted();

  // primitive buffer (trimmed below) instead of a boxed index list, as in dropNull()
  final int[] fromIndex = new int[this.size()];
  int count = 0;
  fromIndex[count++] = 0;
  for (int i = 1; i < fromIndex.length; i++) {
    // equal values are adjacent after sorting, keep the first of each run
    if (sorted.compare(sorted, i - 1, i) != 0) {
      fromIndex[count++] = i;
    }
  }

  return sorted.project(Arrays.copyOf(fromIndex, count));
}
/**
 * Returns a copy of the series that retains only the non-null values, in their
 * original order.
 *
 * @return series copy without {@code nulls}
 */
public Series dropNull() {
  final int[] kept = new int[this.size()];
  int used = 0;
  for (int pos = 0; pos < this.size(); pos++) {
    if (this.isNull(pos)) {
      continue;
    }
    kept[used] = pos;
    used++;
  }
  // trim the buffer to the number of indices actually collected
  final int[] fromIndex = Arrays.copyOf(kept, used);
  return this.project(fromIndex);
}
/**
 * Returns a BooleanSeries of the same size as this series that holds, for each position,
 * whether the value at that position is {@code null}.
 *
 * @return boolean series indicating null-equivalence of each value
 */
public BooleanSeries isNull() {
  final byte[] flags = new byte[this.size()];
  for (int pos = 0; pos < flags.length; pos++) {
    final boolean missing = this.isNull(pos);
    flags[pos] = BooleanSeries.valueOf(missing);
  }
  return BooleanSeries.buildFrom(flags);
}
/**
 * Evaluates {@code conditional} for each row of the series and uses the outcome as mask
 * for {@code filter(BooleanSeries)}: values whose row did not evaluate to {@code true}
 * are replaced by {@code null}.
 *
 * @param conditional conditional to apply and filter by
 * @return filtered series copy
 */
public Series filter(Conditional conditional) {
  final BooleanSeries mask = this.map(conditional);
  return this.filter(mask);
}
//
// NOTE: co-variant method messiness
//
/**
 * Evaluates {@code function} row by row across {@code series} and returns the results as
 * a new series. The work is handed to the typed series class that matches the function's
 * interface, which converts the inputs transparently where their native type differs and
 * determines the native type of the result.
 *
 * <br/><b>NOTE:</b> the interfaces are tested in a fixed order (typed functions first, then
 * conditionals); the first match wins if a function implements several of them.
 *
 * @param function function to apply to each row
 * @param series series to apply function to
 * @throws IllegalArgumentException if the function implements none of the supported interfaces
 * @return series with evaluation results
 */
public static Series map(Function function, Series... series) {
  if (function instanceof DoubleFunction) {
    return DoubleSeries.map((DoubleFunction) function, series);
  }
  if (function instanceof LongFunction) {
    return LongSeries.map((LongFunction) function, series);
  }
  if (function instanceof StringFunction) {
    return StringSeries.map((StringFunction) function, series);
  }
  if (function instanceof BooleanFunction) {
    return BooleanSeries.map((BooleanFunction) function, series);
  }
  if (function instanceof BooleanFunctionEx) {
    return BooleanSeries.map((BooleanFunctionEx) function, series);
  }
  if (function instanceof DoubleConditional) {
    return DoubleSeries.map((DoubleConditional) function, series);
  }
  if (function instanceof LongConditional) {
    return LongSeries.map((LongConditional) function, series);
  }
  if (function instanceof StringConditional) {
    return StringSeries.map((StringConditional) function, series);
  }
  if (function instanceof BooleanConditional) {
    return BooleanSeries.map((BooleanConditional) function, series);
  }
  throw new IllegalArgumentException(String.format("Unknown function type '%s'", function.getClass()));
}
/**
 * Evaluates {@code function} for each row of this series and returns the results as a
 * new series. Single-series shorthand for {@code Series.map(Function, Series...)}; input
 * conversion and result type follow the rules described there.
 *
 * @param function function to map to each element in the series
 * @return series with evaluation results
 */
public final Series map(Function function) {
  return Series.map(function, this);
}

/**
 * Typed variant returning a DoubleSeries.
 *
 * @see Series#map(Function)
 */
public final DoubleSeries map(DoubleFunction function) {
  return (DoubleSeries) Series.map(function, this);
}

/**
 * Typed variant returning a LongSeries.
 *
 * @see Series#map(Function)
 */
public final LongSeries map(LongFunction function) {
  return (LongSeries) Series.map(function, this);
}

/**
 * Typed variant returning a StringSeries.
 *
 * @see Series#map(Function)
 */
public final StringSeries map(StringFunction function) {
  return (StringSeries) Series.map(function, this);
}

/**
 * Typed variant returning a BooleanSeries.
 *
 * @see Series#map(Function)
 */
public final BooleanSeries map(BooleanFunction function) {
  return (BooleanSeries) Series.map(function, this);
}

/**
 * Typed variant returning a BooleanSeries.
 *
 * @see Series#map(Function)
 */
public final BooleanSeries map(BooleanFunctionEx function) {
  return (BooleanSeries) Series.map(function, this);
}

/**
 * Typed variant returning a BooleanSeries.
 *
 * @see Series#map(Function)
 */
public final BooleanSeries map(Conditional conditional) {
  return (BooleanSeries) Series.map(conditional, this);
}
//
// NOTE: co-variant method messiness
//
/**
 * Evaluates {@code function} once over all values of the series and returns the outcome
 * as a new single-element series. The work is handed to the typed series class that
 * matches the function's interface, which converts the input transparently where its
 * native type differs and determines the native type of the result.
 *
 * <br/><b>NOTE:</b> the interfaces are tested in a fixed order (typed functions first, then
 * conditionals); the first match wins if a function implements several of them.
 *
 * @param function aggregation function to map to the series
 * @throws IllegalArgumentException if the function implements none of the supported interfaces
 * @return single element series
 */
public final Series aggregate(Function function) {
  if (function instanceof DoubleFunction) {
    return DoubleSeries.aggregate((DoubleFunction) function, this);
  }
  if (function instanceof LongFunction) {
    return LongSeries.aggregate((LongFunction) function, this);
  }
  if (function instanceof StringFunction) {
    return StringSeries.aggregate((StringFunction) function, this);
  }
  if (function instanceof BooleanFunction) {
    return BooleanSeries.aggregate((BooleanFunction) function, this);
  }
  if (function instanceof BooleanFunctionEx) {
    return BooleanSeries.aggregate((BooleanFunctionEx) function, this);
  }
  if (function instanceof DoubleConditional) {
    return DoubleSeries.aggregate((DoubleConditional) function, this);
  }
  if (function instanceof LongConditional) {
    return LongSeries.aggregate((LongConditional) function, this);
  }
  if (function instanceof StringConditional) {
    return StringSeries.aggregate((StringConditional) function, this);
  }
  if (function instanceof BooleanConditional) {
    return BooleanSeries.aggregate((BooleanConditional) function, this);
  }
  throw new IllegalArgumentException(String.format("Unknown function type '%s'", function.getClass()));
}
/**
 * Typed variant returning a DoubleSeries.
 *
 * @see Series#aggregate(Function)
 */
public final DoubleSeries aggregate(DoubleFunction function) {
  // widen the static type so that the generic dispatcher is selected
  final Function generic = function;
  return (DoubleSeries) this.aggregate(generic);
}

/**
 * Typed variant returning a LongSeries.
 *
 * @see Series#aggregate(Function)
 */
public final LongSeries aggregate(LongFunction function) {
  final Function generic = function;
  return (LongSeries) this.aggregate(generic);
}

/**
 * Typed variant returning a StringSeries.
 *
 * @see Series#aggregate(Function)
 */
public final StringSeries aggregate(StringFunction function) {
  final Function generic = function;
  return (StringSeries) this.aggregate(generic);
}

/**
 * Typed variant returning a BooleanSeries.
 *
 * @see Series#aggregate(Function)
 */
public final BooleanSeries aggregate(BooleanFunction function) {
  final Function generic = function;
  return (BooleanSeries) this.aggregate(generic);
}

/**
 * Typed variant returning a BooleanSeries.
 *
 * @see Series#aggregate(Function)
 */
public final BooleanSeries aggregate(BooleanFunctionEx function) {
  final Function generic = function;
  return (BooleanSeries) this.aggregate(generic);
}

/**
 * Typed variant returning a BooleanSeries.
 *
 * @see Series#aggregate(Function)
 */
public final BooleanSeries aggregate(Conditional conditional) {
  final Function generic = conditional;
  return (BooleanSeries) this.aggregate(generic);
}
/**
 * Groups the elements of the series by value: all elements that compare as equal end up
 * in the same bucket, and the buckets are ordered by ascending value. An empty series
 * yields an empty grouping.
 *
 * <br/><b>NOTE:</b> the resulting keys are equivalent to calling {@code unique()} on the series.
 *
 * @return grouping by value
 */
public final SeriesGrouping groupByValue() {
  if (this.isEmpty()) {
    return new SeriesGrouping(this);
  }

  final int[] order = this.sortedIndex();
  final List<Bucket> buckets = new ArrayList<>();

  // walk the values in ascending order and cut a bucket wherever the value changes
  int start = 0;
  for (int pos = 1; pos < order.length; pos++) {
    final boolean sameValue = this.compare(this, order[pos - 1], order[pos]) == 0;
    if (sameValue) {
      continue;
    }
    buckets.add(new Bucket(Arrays.copyOfRange(order, start, pos)));
    start = pos;
  }
  // remaining elements form the last bucket
  buckets.add(new Bucket(Arrays.copyOfRange(order, start, order.length)));

  // the first element of each bucket serves as the bucket's key
  final int[] keyIndex = new int[buckets.size()];
  for (int k = 0; k < keyIndex.length; k++) {
    keyIndex[k] = buckets.get(k).fromIndex[0];
  }

  return new SeriesGrouping(this.project(keyIndex), this, buckets);
}
/**
 * Groups consecutive elements of the series into buckets of {@code bucketSize} elements
 * each. Only the last bucket may hold fewer elements. Buckets are keyed by their ordinal
 * number, starting at 0. An empty series yields an empty grouping.
 *
 * @param bucketSize maximum number of elements per bucket
 * @throws IllegalArgumentException if {@code bucketSize} is not positive
 * @return grouping by element count
 */
public final SeriesGrouping groupByCount(int bucketSize) {
  if (bucketSize <= 0) {
    throw new IllegalArgumentException("bucketSize must be greater than 0");
  }
  if (this.isEmpty()) {
    return new SeriesGrouping(this);
  }

  final int total = this.size();
  // a bucket never needs to be larger than the series itself
  final int step = Math.min(bucketSize, total);
  // ceiling of total / step
  final int numBuckets = (total - 1) / step + 1;

  final long[] keys = new long[numBuckets];
  final List<Bucket> buckets = new ArrayList<>();
  for (int b = 0; b < numBuckets; b++) {
    final int from = b * step;
    final int to = Math.min((b + 1) * step, total);
    final int[] fromIndex = new int[to - from];
    for (int k = 0; k < fromIndex.length; k++) {
      fromIndex[k] = from + k;
    }
    buckets.add(new Bucket(fromIndex));
    keys[b] = b;
  }

  return new SeriesGrouping(DataFrame.toSeries(keys), this, buckets);
}
/**
 * Groups consecutive elements of the series into exactly {@code partitionCount} buckets of
 * approximately equal size. Bucket boundaries are placed at the rounded multiples of
 * {@code size / partitionCount}, so buckets may remain empty if there are more partitions
 * than elements. Buckets are keyed by their ordinal number, starting at 0. An empty series
 * yields an empty grouping.
 *
 * @param partitionCount number of buckets
 * @throws IllegalArgumentException if {@code partitionCount} is not positive
 * @return grouping by bucket count
 */
public final SeriesGrouping groupByPartitions(int partitionCount) {
  if (partitionCount <= 0) {
    throw new IllegalArgumentException("partitionCount must be greater than 0");
  }
  if (this.isEmpty()) {
    return new SeriesGrouping(this);
  }

  final double perPartition = this.size() / (double) partitionCount;

  final long[] keys = new long[partitionCount];
  final List<Bucket> buckets = new ArrayList<>();
  // lower boundary of the current partition; equals the upper boundary of the previous one
  int from = (int) Math.round(0 * perPartition);
  for (int p = 0; p < partitionCount; p++) {
    final int to = (int) Math.round((p + 1) * perPartition);
    final int[] fromIndex = new int[to - from];
    for (int k = 0; k < fromIndex.length; k++) {
      fromIndex[k] = from + k;
    }
    buckets.add(new Bucket(fromIndex));
    keys[p] = p;
    from = to;
  }

  return new SeriesGrouping(DataFrame.toSeries(keys), this, buckets);
}
/**
 * Groups the series into overlapping buckets of {@code windowSize} consecutive elements,
 * one bucket per possible window start. This yields {@code size - windowSize + 1} buckets,
 * keyed by the index of the window's first element, or an empty grouping if the window is
 * larger than the series.
 *
 * @param windowSize size of moving window
 * @throws IllegalArgumentException if {@code windowSize} is not positive
 * @return grouping by moving window
 */
public final SeriesGrouping groupByMovingWindow(int windowSize) {
  if (windowSize <= 0) {
    throw new IllegalArgumentException("windowSize must be greater than 0");
  }
  final int total = this.size();
  if (total < windowSize) {
    return new SeriesGrouping(this);
  }

  final int windowCount = total - windowSize + 1;
  final long[] keys = new long[windowCount];
  final List<Bucket> buckets = new ArrayList<>();
  for (int start = 0; start < windowCount; start++) {
    final int[] fromIndex = new int[windowSize];
    int next = start;
    for (int k = 0; k < fromIndex.length; k++) {
      fromIndex[k] = next++;
    }
    buckets.add(new Bucket(fromIndex));
    keys[start] = start;
  }

  return new SeriesGrouping(DataFrame.toSeries(keys), this, buckets);
}
/**
 * Appends all but the first of {@code series} to the first one and returns the result as
 * a new series. The first series therefore determines the native type of the result;
 * values of a different native type are converted transparently.
 *
 * @param series series to concatenate
 * @throws IllegalArgumentException if no series is given
 * @return concatenated series
 */
public static Series concatenate(Series... series) {
  if (series.length == 0) {
    throw new IllegalArgumentException("Must concatenate at least one series");
  }
  final Series[] tail = Arrays.copyOfRange(series, 1, series.length);
  return series[0].append(tail);
}
/* *************************************************************************
* Internal interface
* *************************************************************************/
/**
 * Returns index tuples (pairs) for a join performed based on value. Each pair references
 * the position of a value in this (left) series and the position of the matching value in
 * the {@code other} (right) series. A position of {@code -1} marks a missing counterpart.
 *
 * <br/><b>NOTE:</b> the implementation uses merge join. Thus, the index pairs reference
 * values in ascending order.
 * <br/><b>NOTE:</b> values occurring multiple times on either side produce the cross
 * product of all their positions.
 *
 * @see Series#compare(Series, int, int)
 *
 * @param other series to match values against
 * @param type type of join to perform
 * @return list of index pairs for join
 */
List<JoinPair> join(Series other, JoinType type) {
// NOTE: merge join
// positions of both sides in ascending value order
int[] lref = this.sortedIndex();
int[] rref = other.sortedIndex();
List<JoinPair> pairs = new ArrayList<>();
// cursors into lref (i) and rref (j)
int i = 0;
int j = 0;
while(i < this.size() || j < other.size()) {
// right side exhausted, or left value smaller: the left value has no match
if(j >= other.size() || (i < this.size() && this.compare(other, lref[i], rref[j]) < 0)) {
switch(type) {
case LEFT:
case OUTER:
pairs.add(new JoinPair(lref[i], -1));
// falls through into the empty default; INNER and RIGHT drop the value
default:
}
i++;
// left side exhausted, or left value greater: the right value has no match
} else if(i >= this.size() || (j < other.size() && this.compare(other, lref[i], rref[j]) > 0)) {
switch(type) {
case RIGHT:
case OUTER:
pairs.add(new JoinPair(-1, rref[j]));
// falls through into the empty default; INNER and LEFT drop the value
default:
}
j++;
// values on both sides are equal
} else if(i < this.size() && j < other.size()) {
// generate cross product
// count similar values on the left
int lcount = 1;
while(i + lcount < this.size() && this.compare(this, lref[i + lcount], lref[i + lcount - 1]) == 0) {
lcount++;
}
// count similar values on the right
int rcount = 1;
while(j + rcount < other.size() && other.compare(other, rref[j + rcount], rref[j + rcount - 1]) == 0) {
rcount++;
}
for(int l=0; l<lcount; l++) {
for(int r=0; r<rcount; r++) {
pairs.add(new JoinPair(lref[i + l], rref[j + r]));
}
}
// skip past both runs of equal values
i += lcount;
j += rcount;
}
}
return pairs;
}
/* **************************************************************************
* Code grave
***************************************************************************/
// NOTE: too slow
// public Series sorted() {
// return this.project(this.sortedIndex());
// }
// NOTE: too slow
// int[] sortedIndex() {
// Integer[] fromIndex = new Integer[this.size()];
// for(int i=0; i<this.size(); i++)
// fromIndex[i] = i;
//
// final Series s = this;
// Arrays.sort(fromIndex, new Comparator<Integer>() {
// @Override
// public int compare(Integer o1, Integer o2) {
// return s.compare(s, o1, o2);
// }
// });
//
// return ArrayUtils.toPrimitive(fromIndex);
// }
}