/* ---------------------------------------------------------------------
* Numenta Platform for Intelligent Computing (NuPIC)
* Copyright (C) 2014, Numenta, Inc. Unless you have an agreement
* with Numenta, Inc., for a separate license for this software code, the
* following terms and conditions apply:
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero Public License version 3 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU Affero Public License for more details.
*
* You should have received a copy of the GNU Affero Public License
* along with this program. If not, see http://www.gnu.org/licenses.
*
* http://numenta.org/licenses/
* ---------------------------------------------------------------------
*/
package org.numenta.nupic.network.sensor;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.function.BiConsumer;
import java.util.function.BiFunction;
import java.util.function.BinaryOperator;
import java.util.function.Consumer;
import java.util.function.Function;
import java.util.function.IntFunction;
import java.util.function.Predicate;
import java.util.function.Supplier;
import java.util.function.ToDoubleFunction;
import java.util.function.ToIntFunction;
import java.util.function.ToLongFunction;
import java.util.regex.Pattern;
import java.util.stream.Collector;
import java.util.stream.DoubleStream;
import java.util.stream.IntStream;
import java.util.stream.LongStream;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import org.numenta.nupic.util.Tuple;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* <p>
* Specialized {@link Stream} for CSV (Comma Separated Values)
* stream processing. Configure this Stream with a batch size and
* a header length, and just treat as normal {@link Stream}.
* </p>
* <p>
* To create a {@code BatchedCsvStream}, call {@link BatchedCsvStream#batch(Stream, int, boolean, int)}
* handing it the underlying Stream to handle, the batch size, whether it should be parallelized,
* and the size of the header and it will return a Stream that will handle
* batching when "isParallel" is set to true. When "isParallel" is set to false, no batching
* takes place because there would be no point.
* </p>
* <p>
* A side effect to be aware of when batching is the insertion of a "sequenceNumber" to the first column
* of every line. This sequenceNumber describes the "encounter order" of the line in question
* and can reliably be used to "re-order" the entire stream at a later point.
* </p>
*
* <p>
* To reorder the Stream use code such as:
* </p>
* <pre>
* Stream&lt;String[]&gt; thisStream;
* List&lt;String[]&gt; sortedList = thisStream.sorted(
*     (String[] i, String[] j) -&gt; {
*         return Integer.valueOf(i[0]).compareTo(Integer.valueOf(j[0]));
*     }).collect(Collectors.toList());
* </pre>
*
*
* The batching implemented is pretty straightforward. The underlying iterator is
* advanced by min(batchSize, remainingCount) lines, each of which is stored in
* an array of Objects. For every line, {@link BatchSpliterator#tryAdvance(Consumer)}
* is called with a {@link BatchSpliterator.SequencingConsumer}, which copies the
* line into a new array one element larger (using
* {@link System#arraycopy(Object, int, Object, int, int)}) and inserts the
* sequenceNumber at the head of that array.
*
*
*
* @author David Ray
*
* @param <T> The Type of data on each line of this Stream (String[] for this implementation)
*/
public class BatchedCsvStream<T> implements MetaStream<T>, Serializable {
/**
 * Serialization version identifier of {@code BatchedCsvStream}.
 */
private static final long serialVersionUID = 1L;
// TOP TWO CLASSES ARE THE BatchSpliterator AND THE BatchedCsvHeader //
// See main() at bottom for localized mini-test
//////////////////////////////////////////////////////////////
// Inner Classes //
//////////////////////////////////////////////////////////////
/**
 * The internal batching {@link Spliterator} implementation.
 * <p>
 * Wraps another {@code Spliterator} over parsed csv lines. Each call to
 * {@link #trySplit()} pulls up to {@code batchSize} lines from the wrapped
 * spliterator, prefixes every line with its sequence number (encounter order)
 * and returns the batch as an array-backed {@code Spliterator}, which is what
 * lets the Fork/Join machinery hand out "jobs" of a fixed size.
 * </p>
 *
 * @author David Ray
 * @see BatchedCsvHeader
 * @see BatchedCsvStream
 */
private static class BatchSpliterator implements Spliterator<String[]> {
    /** Maximum number of lines placed in one batch by {@link #trySplit()} */
    private final int batchSize;
    /** Characteristics reported by this spliterator (always includes SUBSIZED) */
    private final int characteristics;
    /** Sequence number assigned to the next line pulled via {@link #tryAdvance(Consumer)} */
    private int sequenceNum;
    /** Size estimate; decremented per batch unless it is Long.MAX_VALUE */
    private long est;
    /** The owning stream; supplies the isArrayType / isBatchOp / isTerminal flags */
    private BatchedCsvStream<String[]> csv;
    /** The wrapped spliterator that actually supplies the lines */
    private Spliterator<String[]> spliterator;

    /**
     * Creates a new BatchSpliterator
     *
     * @param characteristics   the bit flags indicating the different
     *                          {@link Spliterator} configurations;
     *                          {@link Spliterator#SUBSIZED} is always added
     * @param batchSize         the size of each "chunk" to hand off to
     *                          a Thread
     * @param est               estimation-only, of the remaining size
     */
    public BatchSpliterator(int characteristics, int batchSize, long est) {
        this.characteristics = characteristics | Spliterator.SUBSIZED;
        this.batchSize = batchSize;
        this.est = est;
    }

    /**
     * Called internally to store the reference to the parent {@link BatchedCsvStream}.
     *
     * @param csv   the parent {@code BatchedCsvStream}
     * @return  this {@code BatchSpliterator}
     */
    private BatchSpliterator setCSV(BatchedCsvStream<String[]> csv) {
        this.csv = csv;
        return this;
    }

    /**
     * Called internally to store a reference to the functional {@link Spliterator}
     *
     * @param toWrap    the spliterator supplying the lines to batch
     * @return  this {@code BatchSpliterator}
     */
    private BatchSpliterator setToWrap(Spliterator<String[]> toWrap) {
        this.spliterator = toWrap;
        return this;
    }

    /**
     * Overridden to call the delegate {@link Spliterator} and update
     * this Spliterator's sequence number.
     * <p>
     * The sequence number is incremented <em>after</em> the action has run,
     * so a sequencing consumer sees the number of the line it is handed.
     * </p>
     *
     * @param action    the consumer receiving the next line, if any
     * @return a flag indicating whether there was a value available
     */
    @Override
    public boolean tryAdvance(Consumer<? super String[]> action) {
        boolean advanced = spliterator.tryAdvance(action);
        if(advanced) {
            sequenceNum++;
        }
        return advanced;
    }

    /**
     * Hands every remaining element of the wrapped spliterator to the
     * specified action. This is a plain pass-through: the lines are not
     * prefixed with a sequence number and the sequence counter is not
     * updated.
     *
     * @param action    the consumer receiving the remaining lines
     */
    @Override
    public void forEachRemaining(Consumer<? super String[]> action) {
        spliterator.forEachRemaining(action);
    }

    /**
     * Called by the Fork/Join mechanism to divide and conquer by
     * creating {@link Spliterator}s for each thread. This method
     * returns a viable Spliterator over the configured number of
     * lines. see {@link #batchSize}
     *
     * @return  a spliterator over the next batch of sequenced lines, or
     *          null if the wrapped spliterator is exhausted
     */
    @Override
    public Spliterator<String[]> trySplit() {
        final SequencingConsumer holder = csv.isArrayType ? new SequencingArrayConsumer() : new SequencingConsumer();

        //This is the line that makes this implementation tricky due to
        //a side effect in the purpose of this method. The try advance
        //actually advances so when it is called twice, (because it is
        //used to query if there is a "next" also) we need to handle it
        //for the first and last sequence. We also have to make sure our
        //sequence number is being handled so that we can "re-order" the
        //parallel pieces later. (They're inserted at the row-heads of each
        //line).
        if(!tryAdvance(holder)) {
            return null;
        }

        csv.setBatchOp(true);

        final Object[] lines = new Object[batchSize];
        int j = 0;
        do {
            // holder.value is the sequenced line produced by the last tryAdvance()
            lines[j] = holder.value;
        } while(++j < batchSize && tryAdvance(holder));

        if(est != Long.MAX_VALUE) {
            est -= j;
        }
        return Spliterators.spliterator(lines, 0, j, characteristics | Spliterator.SIZED);
    }

    /**
     * Returns a {@link Comparator} ordering lines by the numeric value of
     * their first column (the sequence number) if the characteristics
     * include {@link Spliterator#SORTED} and the parent stream is batching.
     * Returns null if the parent stream is batching but this spliterator is
     * not SORTED.
     *
     * @return  the sequence number comparator, or null (see above)
     * @throws IllegalStateException    if the parent stream is not batching
     */
    @Override
    public Comparator<? super String[]> getComparator() {
        if(!csv.isBatchOp) {
            throw new IllegalStateException();
        }
        if(hasCharacteristics(Spliterator.SORTED)) {
            return (i, j) -> Long.valueOf(i[0]).compareTo(Long.valueOf(j[0]));
        }
        return null;
    }

    /**
     * Returns the current size estimate.
     *
     * @return  the estimated number of remaining lines
     */
    @Override
    public long estimateSize() {
        return est;
    }

    /**
     * Returns the characteristics this spliterator was configured with.
     *
     * @return  the characteristics bit flags
     */
    @Override
    public int characteristics() {
        return characteristics;
    }

    /**
     * Consumer which stores the last accepted line, grown by one element,
     * with the current sequence number inserted at index 0.
     */
    class SequencingConsumer implements Consumer<String[]> {
        /** The last line accepted, prefixed with its sequence number */
        String[] value;

        @Override public void accept(String[] value) {
            csv.isTerminal = true;
            this.value = new String[value.length + 1];
            System.arraycopy(value, 0, this.value, 1, value.length);
            this.value[0] = String.valueOf(sequenceNum);
        }
    }

    /**
     * Variant used for array type input: stores a two element line made of
     * the sequence number and the string form of the whole parsed line.
     * <p>
     * This class deliberately does NOT declare its own {@code value} field.
     * Fields are resolved by static type, so a field declared here would
     * hide {@link SequencingConsumer#value} and {@link #trySplit()}, which
     * reads the value through a {@code SequencingConsumer} reference, would
     * only ever see null.
     * </p>
     */
    final class SequencingArrayConsumer extends SequencingConsumer {
        @Override public void accept(String[] value) {
            csv.isTerminal = true;
            this.value = new String[2];
            this.value[0] = String.valueOf(sequenceNum);
            this.value[1] = Arrays.toString(value).trim();
        }
    }
}
/**
 * {@link ValueList} implementation holding the header rows of a
 * {@link BatchedCsvStream}, one {@link Tuple} per header row.
 *
 * @author David Ray
 * @see BatchedCsvStream#getHeader()
 */
public static class BatchedCsvHeader implements ValueList, Serializable {
    private static final long serialVersionUID = 1L;

    /** Container for the field values */
    private Tuple[] headerValues;

    /**
     * Constructs a new {@code BatchedCsvHeader}
     *
     * @param lines                     List of parsed csv header rows; every
     *                                  element must be castable to {@code Object[]}
     * @param configuredHeaderLength    number of header rows
     * @throws IllegalStateException    if the configured length is less than 1,
     *                                  if lines is null or empty, or if the
     *                                  configured length is greater than 1 and
     *                                  differs from the number of lines
     */
    public <T> BatchedCsvHeader(List<T> lines, int configuredHeaderLength) {
        if((configuredHeaderLength < 1 || lines == null || lines.size() < 1) ||
            (configuredHeaderLength > 1 && lines.size() != configuredHeaderLength)) {

            throw new IllegalStateException("Actual Header was not the expected size: " +
                (configuredHeaderLength < 1 ? "> 1" : configuredHeaderLength) +
                    ", but was: " + (lines == null ? "null" : lines.size()));
        }

        headerValues = new Tuple[configuredHeaderLength];
        for(int i = 0;i < headerValues.length;i++) {
            headerValues[i] = new Tuple((Object[])lines.get(i));
        }
    }

    /**
     * Returns the array of values ({@link Tuple}) at the specified
     * index.
     *
     * @param index     the index of the Tuple to be retrieved.
     * @return  the header row at the specified index, or null if the
     *          index is outside of the header (negative or too large)
     */
    public Tuple getRow(int index) {
        // Out of range on either side yields null, never an exception
        if(headerValues == null || index < 0 || index >= headerValues.length) {
            return null;
        }
        return headerValues[index];
    }

    /**
     * Returns the current number of lines in the header.
     *
     * @return  the number of header rows, 0 if there are none
     */
    public int size() {
        return headerValues == null ? 0 : headerValues.length;
    }

    /**
     * Returns the header rows, each followed by a newline character.
     *
     * @return  the string form of this header, empty if there are no rows
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        if(headerValues != null) {
            for(Tuple row : headerValues) {
                sb.append(row).append("\n");
            }
        }
        return sb.toString();
    }
}
////////////////// End Inner Classes //////////////////////
//////////////////////////////////////////////////////////////
// Main Class //
//////////////////////////////////////////////////////////////
/** Class logger. */
private static final transient Logger LOGGER = LoggerFactory.getLogger(BatchedCsvStream.class);
/** Iterator over the parsed lines (each line split into its fields); created in the constructor. */
private Iterator<String[]> it;
/** Number of header lines; makeHeader() reads this many lines from {@link #it}. */
private int fence;
/** Batching flag; see {@link #isBatchOp()} and {@link #setBatchOp(boolean)}. */
private boolean isBatchOp;
/** Set to true by the sequencing iterators / consumers as soon as they process a data line. */
private boolean isTerminal;
/** True if the header declares an "sarr" or "darr" field type; computed in makeHeader(). */
private boolean isArrayType;
/** The header built from the first {@link #fence} lines. */
private BatchedCsvHeader header;
/** The stream all {@link Stream} methods delegate to; assigned by the batch(...) factory methods. */
private transient Stream<T> delegate;
/** Incremented once for every line parsed by the mapping function set up in the constructor. */
private int headerStateTracker = 0;
/**
* Constructs a new {@code BatchedCsvStream}
*
* @param s the underlying JDK {@link Stream}
* @param headerLength the number of header lines preceding the data.
* @see Header
*/
public BatchedCsvStream(Stream<String> s, int headerLength) {
this.it = s.map(line -> {
++headerStateTracker;
return line.split("[\\s]*,[\\s]*", -1);
}).iterator();
this.fence = headerLength;
makeHeader();
LOGGER.debug("Created BatchedCsvStream");
}
/**
 * Called internally to create this csv stream's header.
 * <p>
 * Reads up to {@code fence} lines from the line iterator. If the input
 * ends before that many lines were read, the lines collected so far are
 * still passed to {@link BatchedCsvHeader}, whose size validation then
 * reports the mismatch with a descriptive {@link IllegalStateException}
 * instead of a bare {@code NoSuchElementException} surfacing from the
 * iterator.
 * </p>
 */
private void makeHeader() {
    List<String[]> contents = new ArrayList<>();

    for(int i = 0; i < fence && it.hasNext(); i++) {
        contents.add(it.next());
    }

    this.header = new BatchedCsvHeader(contents, fence);
    this.isArrayType = isArrayType();

    if(LOGGER.isDebugEnabled()) {
        LOGGER.debug("Created Header:");
        for(String[] h : contents) {
            LOGGER.debug("\t" + Arrays.toString(h));
        }
        LOGGER.debug("Successfully created BatchedCsvHeader.");
    }
}
/**
 * <p>
 * Returns a flag indicating whether the underlying stream has had
 * a terminal operation called on it, indicating that it can no longer
 * have operations built up on it.
 * </p><p>
 * The "terminal" flag if true does not indicate that the stream has reached
 * the end of its data, it just means that a terminating operation has been
 * invoked and that it can no longer support intermediate operation creation.
 * </p><p>
 * Within this class the flag is set by the sequencing iterators and
 * consumers the moment they process a data line.
 * </p>
 *
 * @return true if terminal, false if not.
 */
@Override
public boolean isTerminal() {
return this.isTerminal;
}
/**
 * Returns a flag indicating whether this {@link Stream} is
 * currently batching its operations.
 *
 * @return true if the batching flag is set, false if not
 */
public boolean isBatchOp() {
return isBatchOp;
}
/**
 * Sets the flag indicating whether this {@code BatchedCsvStream} is
 * currently batching its operations.
 *
 * @param b     true if batching, false if not
 */
public void setBatchOp(boolean b) {
this.isBatchOp = b;
}
/**
 * Returns the {@link BatchedCsvHeader}
 *
 * @return the header created from the leading lines of the input
 */
public BatchedCsvHeader getHeader() {
return header;
}
/**
 * Builds the {@link Stream} over the data lines, i.e. the part of the input
 * which follows the header. To obtain the header, refer to: {@link #getHeader()}
 * <p>
 * For a non-parallel stream the lines are run through a sequencing iterator
 * which prefixes each line with its sequence number. For a parallel stream
 * the raw line iterator is used, because the batching Spliterator does the
 * sequencing itself through its sequencing consumer.
 * </p>
 *
 * @param parallel                  flag indicating whether the underlying
 *                                  stream should be parallelized.
 * @return the stream continuation
 * @throws IllegalStateException    if there is no line iterator
 * @see BatchedCsvHeader
 * @see #getHeader()
 */
private Stream<String[]> continuation(boolean parallel) {
    if(it == null) {
        throw new IllegalStateException("You must first create a BatchCsvStream by calling batch(Stream, int, boolean, int)");
    }

    final Iterator<String[]> lineSource;
    if(parallel) {
        lineSource = it;
    }else if(isArrayType) {
        lineSource = getArraySequenceIterator(it);
    }else{
        lineSource = getSequenceIterator(it);
    }

    final int flags = Spliterator.ORDERED | Spliterator.NONNULL | Spliterator.IMMUTABLE;
    return StreamSupport.stream(Spliterators.spliteratorUnknownSize(lineSource, flags), parallel);
}
/**
 * Determines whether the input contains an array field: true if the header
 * has at least three rows and its second row contains the value "sarr" or
 * "darr" (compared in lower case). A positive result is also stored in the
 * {@code isArrayType} field.
 *
 * @return true if an array field type was found, false if not
 */
private boolean isArrayType() {
    if(getHeader().headerValues.length >= 3) {
        for(Object o : getHeader().headerValues[1].all()) {
            String fieldType = o.toString().toLowerCase();
            if(fieldType.equals("sarr") || fieldType.equals("darr")) {
                isArrayType = true;
                return true;
            }
        }
    }
    return false;
}
/**
 * Wraps the specified iterator in one which prefixes every line with a
 * sequence number, starting at 0. Used when this stream is not parallel,
 * because in that case the batching Spliterator (which otherwise does the
 * sequencing) is not involved.
 *
 * @param toWrap    the original iterator to wrap
 * @return  an iterator returning each line grown by one element, with the
 *          sequence number at index 0
 */
private Iterator<String[]> getSequenceIterator(final Iterator<String[]> toWrap) {
    return new Iterator<String[]>() {
        /** Sequence number handed to the next line */
        private int nextSequence = 0;

        @Override
        public boolean hasNext() {
            return toWrap.hasNext();
        }

        @Override
        public String[] next() {
            // Pulling a line means a terminal operation is consuming the stream
            isTerminal = true;

            String[] fields = toWrap.next();
            String[] sequenced = new String[fields.length + 1];
            sequenced[0] = String.valueOf(nextSequence++);
            System.arraycopy(fields, 0, sequenced, 1, fields.length);
            return sequenced;
        }
    };
}
/**
 * Wraps the specified iterator in one which turns every line into a two
 * element array: the sequence number (starting at 0) at index 0 and the
 * trimmed {@link Arrays#toString(Object[])} form of the whole parsed line
 * at index 1. Used when this stream is not parallel and the input is of
 * array type.
 *
 * This method differs from {@link #getSequenceIterator(Iterator)} by
 * collapsing the parsed String[] into a single string instead of keeping
 * the individual fields.
 *
 * @param toWrap    the original iterator to wrap
 * @return  an iterator returning {sequence number, line string} pairs
 */
private Iterator<String[]> getArraySequenceIterator(final Iterator<String[]> toWrap) {
    return new Iterator<String[]>() {
        /** Sequence number handed to the next line */
        private int nextSequence = 0;

        @Override
        public boolean hasNext() {
            return toWrap.hasNext();
        }

        @Override
        public String[] next() {
            // Pulling a line means a terminal operation is consuming the stream
            isTerminal = true;

            String[] fields = toWrap.next();
            return new String[] {
                String.valueOf(nextSequence++),
                Arrays.toString(fields).trim()
            };
        }
    };
}
/**
 * Returns the delegate underlying {@link Stream}, cast to a stream of
 * {@code String[]}. The cast is unchecked.
 *
 * @return the delegate stream (null if none has been assigned)
 */
@SuppressWarnings({ "unchecked" })
public Stream<String[]> stream() {
return (Stream<String[]>)this.delegate;
}
/**
 * Creates the {@link BatchSpliterator} for the specified stream using the
 * specified characteristics.
 *
 * @param csv               the Stream from which to create the spliterator
 * @param batchSize         the "chunk" length to be processed by each Threaded task
 * @param isParallel        if true, batching will take place, otherwise not
 * @param characteristics   overrides the default characteristics of:
 *                          {@link Spliterator#ORDERED},{@link Spliterator#NONNULL},
 *                          {@link Spliterator#IMMUTABLE}, and {@link Spliterator#SUBSIZED}.
 *                          <p><b>WARNING:</b> This last characteristic [<b>SUBSIZED</b>] is
 *                          <b>necessary</b> if batching is desired; the {@code BatchSpliterator}
 *                          constructor always adds it.</p>
 * @return  the configured {@code BatchSpliterator}
 */
private static <T> BatchSpliterator batchedSpliterator(
    BatchedCsvStream<String[]> csv, int batchSize, boolean isParallel, int characteristics) {

    Spliterator<String[]> lineSource = csv.continuation(isParallel).spliterator();
    BatchSpliterator batcher = new BatchSpliterator(characteristics, batchSize, lineSource.estimateSize());
    batcher.setCSV(csv);
    batcher.setToWrap(lineSource);
    return batcher;
}
/**
 * Called internally to create the {@link BatchSpliterator} the heart and soul
 * of this class. The characteristics are taken from the spliterator of the
 * stream continuation which gets wrapped.
 *
 * @param csv           the Stream from which to create the spliterator
 * @param batchSize     the "chunk" length to be processed by each Threaded task
 * @param isParallel    if true, batching will take place, otherwise not
 * @return  the configured {@code BatchSpliterator}
 */
private static <T> BatchSpliterator batchedSpliterator(
    BatchedCsvStream<String[]> csv, int batchSize, boolean isParallel) {

    Spliterator<String[]> lineSource = csv.continuation(isParallel).spliterator();
    BatchSpliterator batcher = new BatchSpliterator(
        lineSource.characteristics(), batchSize, lineSource.estimateSize());
    batcher.setCSV(csv);
    batcher.setToWrap(lineSource);
    return batcher;
}
/**
 * Factory method to create a {@code BatchedCsvStream}. If isParallel is false,
 * this stream will behave like a typical stream. See also {@link BatchedCsvStream#batch(Stream, int, boolean, int, int)}
 * for more fine grained setting of characteristics.
 *
 * @param stream        JDK Stream
 * @param batchSize     the "chunk" length to be processed by each Threaded task
 * @param isParallel    if true, batching will take place, otherwise not
 * @param headerLength  number of header lines
 * @return  the new {@code BatchedCsvStream} with its delegate stream installed
 */
public static BatchedCsvStream<String[]> batch(Stream<String> stream, int batchSize, boolean isParallel, int headerLength) {
    //Notice the Type of the Stream becomes String[] - This is an important optimization for
    //parsing the sequence number later. (to avoid calling String.split() on each entry)
    //The constructor also creates the header.
    BatchedCsvStream<String[]> csv = new BatchedCsvStream<>(stream, headerLength);

    if(isParallel) {
        csv.delegate = StreamSupport.stream(batchedSpliterator(csv, batchSize, isParallel), isParallel);
    }else{
        csv.delegate = csv.continuation(isParallel);
    }
    return csv;
}
/**
 * Factory method to create a {@code BatchedCsvStream} whose batching
 * spliterator uses the specified characteristics. The characteristics are
 * only used if isParallel is true.
 *
 * @param stream            JDK Stream
 * @param batchSize         the "chunk" length to be processed by each Threaded task
 * @param isParallel        if true, batching will take place, otherwise not
 * @param headerLength      number of header lines
 * @param characteristics   stream configuration parameters (see {@link Spliterator#characteristics()})
 * @return  the new {@code BatchedCsvStream} with its delegate stream installed
 */
public static BatchedCsvStream<String[]> batch(Stream<String> stream, int batchSize, boolean isParallel, int headerLength, int characteristics) {
    //Notice the Type of the Stream becomes String[] - This is an important optimization for
    //parsing the sequence number later. (to avoid calling String.split() on each entry MULTIPLE TIMES (for the eventual sort))
    //The constructor also creates the header.
    BatchedCsvStream<String[]> csv = new BatchedCsvStream<>(stream, headerLength);

    if(isParallel) {
        csv.delegate = StreamSupport.stream(
            batchedSpliterator(csv, batchSize, isParallel, characteristics), isParallel);
    }else{
        csv.delegate = csv.continuation(isParallel);
    }
    return csv;
}
/**
 * Implements the {@link MetaStream} {@link FunctionalInterface} enabling
 * retrieval of stream meta information.
 *
 * @return the header of this stream (same object as {@link #getHeader()})
 */
public ValueList getMeta() {
return getHeader();
}
//////////////////////////////////////////////////////////////
// Overridden Methods from Parent Class //
//////////////////////////////////////////////////////////////
// Every method below forwards straight to the delegate stream, which is
// assigned by the batch(...) factory methods. The Stream-returning methods
// return the delegate's result, not this BatchedCsvStream.
/** Delegates to the underlying stream. */
@Override
public Iterator<T> iterator() {
return delegate.iterator();
}
/** Delegates to the underlying stream. */
@Override
public Spliterator<T> spliterator() {
return delegate.spliterator();
}
/** Delegates to the underlying stream. */
@Override
public boolean isParallel() {
return delegate.isParallel();
}
/** Delegates to the underlying stream. */
@Override
public Stream<T> sequential() {
return delegate.sequential();
}
/** Delegates to the underlying stream. */
@Override
public Stream<T> parallel() {
return delegate.parallel();
}
/** Delegates to the underlying stream. */
@Override
public Stream<T> unordered() {
return delegate.unordered();
}
/** Delegates to the underlying stream. */
@Override
public Stream<T> onClose(Runnable closeHandler) {
return delegate.onClose(closeHandler);
}
/** Closes the underlying stream. */
@Override
public void close() {
delegate.close();
}
/** Delegates to the underlying stream. */
@Override
public Stream<T> filter(Predicate<? super T> predicate) {
return delegate.filter(predicate);
}
/** Delegates to the underlying stream. */
@Override
public <R> Stream<R> map(Function<? super T, ? extends R> mapper) {
return delegate.map(mapper);
}
/** Delegates to the underlying stream. */
@Override
public IntStream mapToInt(ToIntFunction<? super T> mapper) {
return delegate.mapToInt(mapper);
}
/** Delegates to the underlying stream. */
@Override
public LongStream mapToLong(ToLongFunction<? super T> mapper) {
return delegate.mapToLong(mapper);
}
/** Delegates to the underlying stream. */
@Override
public DoubleStream mapToDouble(ToDoubleFunction<? super T> mapper) {
return delegate.mapToDouble(mapper);
}
/** Delegates to the underlying stream. */
@Override
public <R> Stream<R> flatMap(Function<? super T, ? extends Stream<? extends R>> mapper) {
return delegate.flatMap(mapper);
}
/** Delegates to the underlying stream. */
@Override
public IntStream flatMapToInt(Function<? super T, ? extends IntStream> mapper) {
return delegate.flatMapToInt(mapper);
}
/** Delegates to the underlying stream. */
@Override
public LongStream flatMapToLong(Function<? super T, ? extends LongStream> mapper) {
return delegate.flatMapToLong(mapper);
}
/** Delegates to the underlying stream. */
@Override
public DoubleStream flatMapToDouble(Function<? super T, ? extends DoubleStream> mapper) {
return delegate.flatMapToDouble(mapper);
}
/** Delegates to the underlying stream. */
@Override
public Stream<T> distinct() {
return delegate.distinct();
}
/** Delegates to the underlying stream. */
@Override
public Stream<T> sorted() {
return delegate.sorted();
}
/** Delegates to the underlying stream. */
@Override
public Stream<T> sorted(Comparator<? super T> comparator) {
return delegate.sorted(comparator);
}
/** Delegates to the underlying stream. */
@Override
public Stream<T> peek(Consumer<? super T> action) {
return delegate.peek(action);
}
/** Delegates to the underlying stream. */
@Override
public Stream<T> limit(long maxSize) {
return delegate.limit(maxSize);
}
/** Delegates to the underlying stream. */
@Override
public Stream<T> skip(long n) {
return delegate.skip(n);
}
/** Delegates to the underlying stream. */
@Override
public void forEach(Consumer<? super T> action) {
delegate.forEach(action);
}
/** Delegates to the underlying stream. */
@Override
public void forEachOrdered(Consumer<? super T> action) {
delegate.forEachOrdered(action);
}
/** Delegates to the underlying stream. */
@Override
public Object[] toArray() {
return delegate.toArray();
}
/** Delegates to the underlying stream. */
@Override
public <A> A[] toArray(IntFunction<A[]> generator) {
return delegate.toArray(generator);
}
/** Delegates to the underlying stream. */
@Override
public T reduce(T identity, BinaryOperator<T> accumulator) {
return delegate.reduce(identity, accumulator);
}
/** Delegates to the underlying stream. */
@Override
public Optional<T> reduce(BinaryOperator<T> accumulator) {
return delegate.reduce(accumulator);
}
/** Delegates to the underlying stream. */
@Override
public <U> U reduce(U identity, BiFunction<U, ? super T, U> accumulator, BinaryOperator<U> combiner) {
return delegate.reduce(identity, accumulator, combiner);
}
/** Delegates to the underlying stream. */
@Override
public <R> R collect(Supplier<R> supplier, BiConsumer<R, ? super T> accumulator, BiConsumer<R, R> combiner) {
return delegate.collect(supplier, accumulator, combiner);
}
/** Delegates to the underlying stream. */
@Override
public <R, A> R collect(Collector<? super T, A, R> collector) {
return delegate.collect(collector);
}
/** Delegates to the underlying stream. */
@Override
public Optional<T> min(Comparator<? super T> comparator) {
return delegate.min(comparator);
}
/** Delegates to the underlying stream. */
@Override
public Optional<T> max(Comparator<? super T> comparator) {
return delegate.max(comparator);
}
/** Delegates to the underlying stream. */
@Override
public long count() {
return delegate.count();
}
/** Delegates to the underlying stream. */
@Override
public boolean anyMatch(Predicate<? super T> predicate) {
return delegate.anyMatch(predicate);
}
/** Delegates to the underlying stream. */
@Override
public boolean allMatch(Predicate<? super T> predicate) {
return delegate.allMatch(predicate);
}
/** Delegates to the underlying stream. */
@Override
public boolean noneMatch(Predicate<? super T> predicate) {
return delegate.noneMatch(predicate);
}
/** Delegates to the underlying stream. */
@Override
public Optional<T> findFirst() {
return delegate.findFirst();
}
/** Delegates to the underlying stream. */
@Override
public Optional<T> findAny() {
return delegate.findAny();
}
/**
 * Localized mini-test: builds a {@code BatchedCsvStream} with a three line
 * header over a small in-memory data set, prints the header and then
 * prints every sequenced data line of the non-parallel continuation.
 *
 * @param args  not used
 */
public static void main(String[] args) {
    String[] csvLines = {
        "timestamp,consumption",
        "datetime,float",
        "T,",
        "7/2/10 0:00,21.2",
        "7/2/10 1:00,16.4",
        "7/2/10 2:00,4.7",
        "7/2/10 3:00,4.7",
        "7/2/10 4:00,4.6",
        "7/2/10 5:00,23.5",
        "7/2/10 6:00,47.5",
        "7/2/10 7:00,45.4",
        "7/2/10 8:00,46.1",
        "7/2/10 9:00,41.5",
        "7/2/10 10:00,43.4",
        "7/2/10 11:00,43.8",
        "7/2/10 12:00,37.8",
        "7/2/10 13:00,36.6",
        "7/2/10 14:00,35.7",
        "7/2/10 15:00,38.9",
        "7/2/10 16:00,36.2",
        "7/2/10 17:00,36.6",
        "7/2/10 18:00,37.2",
        "7/2/10 19:00,38.2",
        "7/2/10 20:00,14.1"
    };

    @SuppressWarnings("resource")
    BatchedCsvStream<String> demo = new BatchedCsvStream<>(Arrays.stream(csvLines), 3);

    System.out.println("Header: " + demo.getHeader());

    demo.continuation(false)
        .map(Arrays::toString)
        .forEach(text -> System.out.println("line: " + text));
}
}