package org.deeplearning4j.spark.datavec;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.input.PortableDataStream;
import org.datavec.api.io.labels.ParentPathLabelGenerator;
import org.datavec.api.records.reader.SequenceRecordReader;
import org.datavec.api.records.reader.impl.csv.CSVRecordReader;
import org.datavec.api.records.reader.impl.csv.CSVSequenceRecordReader;
import org.datavec.api.split.FileSplit;
import org.datavec.api.split.InputSplit;
import org.datavec.api.split.NumberedFileInputSplit;
import org.datavec.api.writable.Writable;
import org.datavec.image.recordreader.ImageRecordReader;
import org.datavec.spark.functions.SequenceRecordReaderFunction;
import org.datavec.spark.functions.pairdata.*;
import org.datavec.spark.transform.misc.StringToWritablesFunction;
import org.datavec.spark.util.DataVecSparkUtil;
import org.deeplearning4j.datasets.datavec.RecordReaderDataSetIterator;
import org.deeplearning4j.datasets.datavec.SequenceRecordReaderDataSetIterator;
import org.deeplearning4j.spark.BaseSparkTest;
import org.junit.Test;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.dataset.DataSet;
import scala.Tuple2;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import static org.junit.Assert.*;
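/**
 * Tests for the DataVec -> DataSet conversion functions in deeplearning4j-spark
 * (DataVecDataSetFunction, DataVecSequenceDataSetFunction, DataVecSequencePairDataSetFunction),
 * comparing the Spark-based conversion paths against their local (non-Spark) equivalents.
 */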
public class TestDataVecDataSetFunctions extends BaseSparkTest {
    @Test
    public void testDataVecDataSetFunction() throws Exception {
        JavaSparkContext sc = getContext();

        //Test Spark record reader functionality vs. local
        File f = new File("src/test/resources/imagetest/0/a.bmp");
        List<String> labelsList = Arrays.asList("0", "1"); //Need this for Spark: can't infer labels without an initialize call

        String path = f.getPath();
        String folder = path.substring(0, path.length() - 7); //Strip "0/a.bmp" to get the parent directory
        path = folder + "*";

        JavaPairRDD<String, PortableDataStream> origData = sc.binaryFiles(path);
        assertEquals(4, origData.count()); //4 images

        ImageRecordReader rr = new ImageRecordReader(28, 28, 1, new ParentPathLabelGenerator()); //28x28, 1 channel, label from parent directory
        rr.setLabels(labelsList);
        org.datavec.spark.functions.RecordReaderFunction rrf = new org.datavec.spark.functions.RecordReaderFunction(rr);
        JavaRDD<List<Writable>> rdd = origData.map(rrf);
        JavaRDD<DataSet> data = rdd.map(new DataVecDataSetFunction(1, 2, false)); //labelIndex=1, numPossibleLabels=2, regression=false
        List<DataSet> collected = data.collect();

        //Load normally (i.e., not via Spark), and check that we get the same results (order notwithstanding)
        InputSplit is = new FileSplit(new File(folder), new String[] {"bmp"}, true);
        ImageRecordReader irr = new ImageRecordReader(28, 28, 1, new ParentPathLabelGenerator());
        irr.initialize(is);

        RecordReaderDataSetIterator iter = new RecordReaderDataSetIterator(irr, 1, 1, 2); //batch size 1, label index 1, 2 classes
        List<DataSet> listLocal = new ArrayList<>(4);
        while (iter.hasNext()) {
            listLocal.add(iter.next());
        }

        //Compare:
        assertEquals(4, collected.size());
        assertEquals(4, listLocal.size());

        //Check that results are the same (order notwithstanding)
        boolean[] found = new boolean[4];
        for (int i = 0; i < 4; i++) {
            int foundIndex = -1;
            DataSet ds = collected.get(i);
            for (int j = 0; j < 4; j++) {
                if (ds.equals(listLocal.get(j))) {
                    if (foundIndex != -1)
                        fail(); //Already matched: this Spark value equals two or more local values (shouldn't happen)
                    foundIndex = j;
                    if (found[foundIndex])
                        fail(); //Another Spark value already matched this local value: suggests duplicates in the Spark list
                    found[foundIndex] = true; //Mark this local example as matched
                }
            }
        }
        int count = 0;
        for (boolean b : found)
            if (b)
                count++;
        assertEquals(4, count); //Expect all 4, and exactly 4, pairwise matches between Spark and local versions
    }
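
    /*
     * The unordered-comparison loops in these tests all follow the same pattern. A sketch of a
     * helper they could be factored into (illustrative only; not used by the tests as written):
     */
    private static void assertSameDataSetsUnordered(List<DataSet> sparkList, List<DataSet> localList) {
        assertEquals(sparkList.size(), localList.size());
        boolean[] matched = new boolean[localList.size()];
        for (DataSet ds : sparkList) {
            int foundIndex = -1;
            for (int j = 0; j < localList.size(); j++) {
                if (ds.equals(localList.get(j))) {
                    assertEquals(-1, foundIndex); //Each Spark DataSet should equal at most one local DataSet
                    assertFalse(matched[j]); //Each local DataSet should be matched at most once
                    foundIndex = j;
                    matched[j] = true;
                }
            }
            assertTrue(foundIndex >= 0); //Every Spark DataSet must have a local match
        }
    }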
    @Test
    public void testDataVecDataSetFunctionMultiLabelRegression() throws Exception {
        JavaSparkContext sc = getContext();

        List<String> stringData = new ArrayList<>();
        int n = 6;
        for (int i = 0; i < 10; i++) {
            StringBuilder sb = new StringBuilder();
            boolean first = true;
            for (int j = 0; j < n; j++) {
                if (!first)
                    sb.append(",");
                sb.append(10 * i + j);
                first = false;
            }
            stringData.add(sb.toString());
        }
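        //Example: i=1 gives the line "10,11,12,13,14,15", which below becomes
        //features [10,11,12] and regression labels [13,14,15]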
        JavaRDD<String> stringList = sc.parallelize(stringData);
        JavaRDD<List<Writable>> writables = stringList.map(new StringToWritablesFunction(new CSVRecordReader()));
        //labelIndexFrom=3, labelIndexTo=5 (inclusive), numPossibleLabels=-1 (not applicable), regression=true
        JavaRDD<DataSet> dataSets = writables.map(new DataVecDataSetFunction(3, 5, -1, true, null, null));

        List<DataSet> ds = dataSets.collect();
        assertEquals(10, ds.size());

        boolean[] seen = new boolean[10];
        for (DataSet d : ds) {
            INDArray f = d.getFeatureMatrix();
            INDArray l = d.getLabels();
            assertEquals(3, f.length());
            assertEquals(3, l.length());

            int exampleIdx = ((int) f.getDouble(0)) / 10;
            seen[exampleIdx] = true;

            for (int j = 0; j < 3; j++) {
                assertEquals(10 * exampleIdx + j, (int) f.getDouble(j));
                assertEquals(10 * exampleIdx + j + 3, (int) l.getDouble(j));
            }
        }

        int seenCount = 0;
        for (boolean b : seen)
            if (b)
                seenCount++;
        assertEquals(10, seenCount);
    }
    @Test
    public void testDataVecSequenceDataSetFunction() throws Exception {
        JavaSparkContext sc = getContext();

        //Test Spark record reader functionality vs. local
        File f = new File("src/test/resources/csvsequence/csvsequence_0.txt");
        String path = f.getPath();
        String folder = path.substring(0, path.length() - 17); //Strip "csvsequence_0.txt" to get the parent directory
        path = folder + "*";

        JavaPairRDD<String, PortableDataStream> origData = sc.binaryFiles(path);
        assertEquals(3, origData.count()); //3 CSV sequences

        SequenceRecordReader seqRR = new CSVSequenceRecordReader(1, ","); //Skip 1 line, comma-delimited
        SequenceRecordReaderFunction rrf = new SequenceRecordReaderFunction(seqRR);
        JavaRDD<List<List<Writable>>> rdd = origData.map(rrf);
        //labelIndex=2, numPossibleLabels=-1 (not applicable), regression=true
        JavaRDD<DataSet> data = rdd.map(new DataVecSequenceDataSetFunction(2, -1, true, null, null));
        List<DataSet> collected = data.collect();

        //Load normally (i.e., not via Spark), and check that we get the same results (order notwithstanding)
        InputSplit is = new FileSplit(new File(folder), new String[] {"txt"}, true);
        SequenceRecordReader seqRR2 = new CSVSequenceRecordReader(1, ",");
        seqRR2.initialize(is);

        SequenceRecordReaderDataSetIterator iter = new SequenceRecordReaderDataSetIterator(seqRR2, 1, -1, 2, true);
        List<DataSet> listLocal = new ArrayList<>(3);
        while (iter.hasNext()) {
            listLocal.add(iter.next());
        }

        //Compare:
        assertEquals(3, collected.size());
        assertEquals(3, listLocal.size());

        //Check that results are the same (order notwithstanding)
        boolean[] found = new boolean[3];
        for (int i = 0; i < 3; i++) {
            int foundIndex = -1;
            DataSet ds = collected.get(i);
            for (int j = 0; j < 3; j++) {
                if (ds.equals(listLocal.get(j))) {
                    if (foundIndex != -1)
                        fail(); //Already matched: this Spark value equals two or more local values (shouldn't happen)
                    foundIndex = j;
                    if (found[foundIndex])
                        fail(); //Another Spark value already matched this local value: suggests duplicates in the Spark list
                    found[foundIndex] = true; //Mark this local example as matched
                }
            }
        }
        int count = 0;
        for (boolean b : found)
            if (b)
                count++;
        assertEquals(3, count); //Expect all 3, and exactly 3, pairwise matches between Spark and local versions
    }
    @Test
    public void testDataVecSequencePairDataSetFunction() throws Exception {
        JavaSparkContext sc = getContext();

        //Convert data to a SequenceFile:
        File f = new File("src/test/resources/csvsequence/csvsequence_0.txt");
        String path = f.getPath();
        String folder = path.substring(0, path.length() - 17); //Strip "csvsequence_0.txt" to get the parent directory
        path = folder + "*";

        PathToKeyConverter pathConverter = new PathToKeyConverterFilename();
        JavaPairRDD<Text, BytesPairWritable> toWrite =
                        DataVecSparkUtil.combineFilesForSequenceFile(sc, path, path, pathConverter);

        Path p = Files.createTempDirectory("dl4j_testSeqPairFn");
        p.toFile().deleteOnExit();
        String outPath = p.toString() + "/out";
        new File(outPath).deleteOnExit();
        toWrite.saveAsNewAPIHadoopFile(outPath, Text.class, BytesPairWritable.class, SequenceFileOutputFormat.class);
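
        //The same files are used for both features and labels here, paired up by file name;
        //the Hadoop SequenceFile written above is read straight back in below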
        //Load from sequence file:
        JavaPairRDD<Text, BytesPairWritable> fromSeq = sc.sequenceFile(outPath, Text.class, BytesPairWritable.class);

        SequenceRecordReader srr1 = new CSVSequenceRecordReader(1, ",");
        SequenceRecordReader srr2 = new CSVSequenceRecordReader(1, ",");
        PairSequenceRecordReaderBytesFunction psrbf = new PairSequenceRecordReaderBytesFunction(srr1, srr2);

        JavaRDD<Tuple2<List<List<Writable>>, List<List<Writable>>>> writables = fromSeq.map(psrbf);

        //Map to DataSet:
        DataVecSequencePairDataSetFunction pairFn = new DataVecSequencePairDataSetFunction();
        JavaRDD<DataSet> data = writables.map(pairFn);
        List<DataSet> sparkData = data.collect();

        //Now: do the same thing locally (SequenceRecordReaderDataSetIterator) and compare
        //Note: replaceAll assumes '0' occurs only in the file name, not elsewhere in the absolute path
        String featuresPath = f.getAbsolutePath().replaceAll("0", "%d");

        SequenceRecordReader featureReader = new CSVSequenceRecordReader(1, ",");
        SequenceRecordReader labelReader = new CSVSequenceRecordReader(1, ",");
        featureReader.initialize(new NumberedFileInputSplit(featuresPath, 0, 2)); //csvsequence_0.txt to csvsequence_2.txt
        labelReader.initialize(new NumberedFileInputSplit(featuresPath, 0, 2));

        SequenceRecordReaderDataSetIterator iter =
                        new SequenceRecordReaderDataSetIterator(featureReader, labelReader, 1, -1, true);

        List<DataSet> localData = new ArrayList<>(3);
        while (iter.hasNext())
            localData.add(iter.next());

        assertEquals(3, sparkData.size());
        assertEquals(3, localData.size());

        for (int i = 0; i < 3; i++) {
            //Check shapes etc.; DataSet order may differ between Spark and local versions
            DataSet dsSpark = sparkData.get(i);
            DataSet dsLocal = localData.get(i);
            assertNull(dsSpark.getFeaturesMaskArray());
            assertNull(dsSpark.getLabelsMaskArray());

            INDArray fSpark = dsSpark.getFeatureMatrix();
            INDArray fLocal = dsLocal.getFeatureMatrix();
            INDArray lSpark = dsSpark.getLabels();
            INDArray lLocal = dsLocal.getLabels();

            int[] s = new int[] {1, 3, 4}; //1 example, 3 values, 4 time steps
            assertArrayEquals(s, fSpark.shape());
            assertArrayEquals(s, fLocal.shape());
            assertArrayEquals(s, lSpark.shape());
            assertArrayEquals(s, lLocal.shape());
        }

        //Check that results are the same (order notwithstanding)
        boolean[] found = new boolean[3];
        for (int i = 0; i < 3; i++) {
            int foundIndex = -1;
            DataSet ds = sparkData.get(i);
            for (int j = 0; j < 3; j++) {
                if (ds.equals(localData.get(j))) {
                    if (foundIndex != -1)
                        fail(); //Already matched: this Spark value equals two or more local values (shouldn't happen)
                    foundIndex = j;
                    if (found[foundIndex])
                        fail(); //Another Spark value already matched this local value: suggests duplicates in the Spark list
                    found[foundIndex] = true; //Mark this local example as matched
                }
            }
        }
        int count = 0;
        for (boolean b : found)
            if (b)
                count++;
        assertEquals(3, count); //Expect all 3, and exactly 3, pairwise matches between Spark and local versions
    }
    @Test
    public void testDataVecSequencePairDataSetFunctionVariableLength() throws Exception {
        //Same sort of test as testDataVecSequencePairDataSetFunction(), but with variable-length time series
        // (label sequences are shorter than the feature sequences; align to the end)

        //Convert data to a SequenceFile:
        File f = new File("src/test/resources/csvsequence/csvsequence_0.txt");
        String pathFeatures = f.getAbsolutePath();
        String folderFeatures = pathFeatures.substring(0, pathFeatures.length() - 17); //Strip "csvsequence_0.txt"
        pathFeatures = folderFeatures + "*";

        File f2 = new File("src/test/resources/csvsequencelabels/csvsequencelabelsShort_0.txt");
        String pathLabels = f2.getPath();
        String folderLabels = pathLabels.substring(0, pathLabels.length() - 28); //Strip "csvsequencelabelsShort_0.txt"
        pathLabels = folderLabels + "*";

        PathToKeyConverter pathConverter = new PathToKeyConverterNumber(); //Extract a number from the file name
        JavaPairRDD<Text, BytesPairWritable> toWrite =
                        DataVecSparkUtil.combineFilesForSequenceFile(sc, pathFeatures, pathLabels, pathConverter);

        Path p = Files.createTempDirectory("dl4j_testSeqPairFnVarLength");
        p.toFile().deleteOnExit();
        String outPath = p.toString() + "/out";
        new File(outPath).deleteOnExit();
        toWrite.saveAsNewAPIHadoopFile(outPath, Text.class, BytesPairWritable.class, SequenceFileOutputFormat.class);

        //Load from sequence file:
        JavaPairRDD<Text, BytesPairWritable> fromSeq = sc.sequenceFile(outPath, Text.class, BytesPairWritable.class);

        SequenceRecordReader srr1 = new CSVSequenceRecordReader(1, ",");
        SequenceRecordReader srr2 = new CSVSequenceRecordReader(1, ",");
        PairSequenceRecordReaderBytesFunction psrbf = new PairSequenceRecordReaderBytesFunction(srr1, srr2);

        JavaRDD<Tuple2<List<List<Writable>>, List<List<Writable>>>> writables = fromSeq.map(psrbf);

        //Map to DataSet:
        //numPossibleLabels=4, regression=false, align the shorter label sequence with the end of the feature sequence
        DataVecSequencePairDataSetFunction pairFn = new DataVecSequencePairDataSetFunction(4, false,
                        DataVecSequencePairDataSetFunction.AlignmentMode.ALIGN_END);
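        //With ALIGN_END, a label sequence of length m < n is placed at the last m of the n feature
        //time steps, and the labels mask is 0 for the leading padded steps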
        JavaRDD<DataSet> data = writables.map(pairFn);
        List<DataSet> sparkData = data.collect();

        //Now: do the same thing locally (SequenceRecordReaderDataSetIterator) and compare
        String featuresPath = f.getPath().replaceAll("0", "%d");
        String labelsPath = f2.getPath().replaceAll("0", "%d");

        SequenceRecordReader featureReader = new CSVSequenceRecordReader(1, ",");
        SequenceRecordReader labelReader = new CSVSequenceRecordReader(1, ",");
        featureReader.initialize(new NumberedFileInputSplit(featuresPath, 0, 2));
        labelReader.initialize(new NumberedFileInputSplit(labelsPath, 0, 2));

        SequenceRecordReaderDataSetIterator iter = new SequenceRecordReaderDataSetIterator(featureReader, labelReader,
                        1, 4, false, SequenceRecordReaderDataSetIterator.AlignmentMode.ALIGN_END);

        List<DataSet> localData = new ArrayList<>(3);
        while (iter.hasNext())
            localData.add(iter.next());

        assertEquals(3, sparkData.size());
        assertEquals(3, localData.size());

        int[] fShapeExp = new int[] {1, 3, 4}; //1 example, 3 values, 4 time steps
        int[] lShapeExp = new int[] {1, 4, 4}; //1 example, 4 values/classes, 4 time steps (after padding)
        for (int i = 0; i < 3; i++) {
            //Check shapes etc.; DataSet order may differ between Spark and local versions
            DataSet dsSpark = sparkData.get(i);
            DataSet dsLocal = localData.get(i);
            assertNotNull(dsSpark.getLabelsMaskArray()); //Expect a mask array for the labels

            INDArray fSpark = dsSpark.getFeatureMatrix();
            INDArray fLocal = dsLocal.getFeatureMatrix();
            INDArray lSpark = dsSpark.getLabels();
            INDArray lLocal = dsLocal.getLabels();

            assertArrayEquals(fShapeExp, fSpark.shape());
            assertArrayEquals(fShapeExp, fLocal.shape());
            assertArrayEquals(lShapeExp, lSpark.shape());
            assertArrayEquals(lShapeExp, lLocal.shape());
        }

        //Check that results are the same (order notwithstanding)
        boolean[] found = new boolean[3];
        for (int i = 0; i < 3; i++) {
            int foundIndex = -1;
            DataSet ds = sparkData.get(i);
            for (int j = 0; j < 3; j++) {
                if (ds.equals(localData.get(j))) {
                    if (foundIndex != -1)
                        fail(); //Already matched: this Spark value equals two or more local values (shouldn't happen)
                    foundIndex = j;
                    if (found[foundIndex])
                        fail(); //Another Spark value already matched this local value: suggests duplicates in the Spark list
                    found[foundIndex] = true; //Mark this local example as matched
                }
            }
        }
        int count = 0;
        for (boolean b : found)
            if (b)
                count++;
        assertEquals(3, count); //Expect all 3, and exactly 3, pairwise matches between Spark and local versions
        //-------------------------------------------------
        //NOW: test the same thing, but with ALIGN_START...
        DataVecSequencePairDataSetFunction pairFnAlignStart = new DataVecSequencePairDataSetFunction(4, false,
                        DataVecSequencePairDataSetFunction.AlignmentMode.ALIGN_START);

        JavaRDD<DataSet> rddDataAlignStart = writables.map(pairFnAlignStart);
        List<DataSet> sparkDataAlignStart = rddDataAlignStart.collect();

        featureReader.initialize(new NumberedFileInputSplit(featuresPath, 0, 2)); //Re-initialize to reset the readers
        labelReader.initialize(new NumberedFileInputSplit(labelsPath, 0, 2));
        SequenceRecordReaderDataSetIterator iterAlignStart = new SequenceRecordReaderDataSetIterator(featureReader,
                        labelReader, 1, 4, false, SequenceRecordReaderDataSetIterator.AlignmentMode.ALIGN_START);

        List<DataSet> localDataAlignStart = new ArrayList<>(3);
        while (iterAlignStart.hasNext())
            localDataAlignStart.add(iterAlignStart.next());

        assertEquals(3, sparkDataAlignStart.size());
        assertEquals(3, localDataAlignStart.size());

        for (int i = 0; i < 3; i++) {
            //Check shapes etc.; DataSet order may differ between Spark and local versions
            DataSet dsSpark = sparkDataAlignStart.get(i);
            DataSet dsLocal = localDataAlignStart.get(i);
            assertNotNull(dsSpark.getLabelsMaskArray()); //Expect a mask array for the labels

            INDArray fSpark = dsSpark.getFeatureMatrix();
            INDArray fLocal = dsLocal.getFeatureMatrix();
            INDArray lSpark = dsSpark.getLabels();
            INDArray lLocal = dsLocal.getLabels();

            assertArrayEquals(fShapeExp, fSpark.shape());
            assertArrayEquals(fShapeExp, fLocal.shape());
            assertArrayEquals(lShapeExp, lSpark.shape());
            assertArrayEquals(lShapeExp, lLocal.shape());
        }

        //Check that results are the same (order notwithstanding)
        found = new boolean[3];
        for (int i = 0; i < 3; i++) {
            int foundIndex = -1;
            DataSet ds = sparkDataAlignStart.get(i);
            for (int j = 0; j < 3; j++) {
                if (ds.equals(localDataAlignStart.get(j))) {
                    if (foundIndex != -1)
                        fail(); //Already matched: this Spark value equals two or more local values (shouldn't happen)
                    foundIndex = j;
                    if (found[foundIndex])
                        fail(); //Another Spark value already matched this local value: suggests duplicates in the Spark list
                    found[foundIndex] = true; //Mark this local example as matched
                }
            }
        }
        count = 0;
        for (boolean b : found)
            if (b)
                count++;
        assertEquals(3, count); //Expect all 3, and exactly 3, pairwise matches between Spark and local versions
    }
}