/**
* diqube: Distributed Query Base.
*
* Copyright (C) 2015 Bastian Gloeckle
*
* This file is part of diqube.
*
* diqube is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.diqube.loader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.diqube.data.column.ColumnPage;
import org.diqube.data.column.ColumnType;
import org.diqube.data.column.StandardColumnShard;
import org.diqube.data.table.TableShard;
import org.diqube.data.types.dbl.dict.DoubleDictionary;
import org.diqube.data.types.lng.dict.LongDictionary;
import org.diqube.data.types.str.dict.StringDictionary;
import org.diqube.loader.columnshard.ColumnShardBuilder;
import org.diqube.util.BigByteBuffer;
import org.diqube.util.IoUtils;
import org.diqube.util.PrimitiveUtils;
import org.springframework.context.annotation.AnnotationConfigApplicationContext;
import org.testng.Assert;
import org.testng.annotations.AfterMethod;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.Test;
import com.google.common.collect.Iterables;
/**
 * Tests the {@link CsvLoader}.
 *
 * <p>
 * Each test method gets a fresh Spring context, a fresh {@link CsvLoader} bean and a fresh {@link LoaderColumnInfo}
 * (see {@link #setUp()}), so column type registrations done in one test do not leak into another.
 *
 * @author Bastian Gloeckle
 */
public class CsvLoaderTest {
  /** Classpath location of the small CSV fixture used by {@link #smallSimpleCsv()}. */
  private static final String CSV_SIMPLE_CLASSPATH = "/CsvLoaderTestSimple.csv";

  private static final String CSV_COL_A = "colA";
  private static final String CSV_COL_B = "colB";
  private static final String CSV_COL_C = "colC";

  /** Loader under test, fetched from {@link #dataContext}. */
  private CsvLoader csvLoader;

  /** Column type information handed to the loader; tests register per-column types on it. */
  private LoaderColumnInfo colInfo;

  /** Spring context the loader bean is taken from; created and closed around every test method. */
  private AnnotationConfigApplicationContext dataContext;

  /**
   * Boots a Spring context scanning {@code org.diqube} and prepares the loader and a new column info object.
   */
  @BeforeMethod
  public void setUp() {
    dataContext = new AnnotationConfigApplicationContext();
    dataContext.scan("org.diqube");
    dataContext.refresh();

    csvLoader = dataContext.getBean(CsvLoader.class);
    // Presumably STRING is the type used for columns without an explicit registration: smallSimpleCsv() never
    // registers colA and still expects a String column for it.
    colInfo = new LoaderColumnInfo(ColumnType.STRING);
  }

  /**
   * Closes the Spring context created in {@link #setUp()}.
   */
  @AfterMethod
  public void shutDown() {
    dataContext.close();
  }

  /**
   * Loads the small CSV fixture and verifies row count, column types and the values of all three columns.
   */
  @Test
  public void smallSimpleCsv() throws LoadException {
    // GIVEN
    // simple CSV with 3 columns, one mapped to String, one Long and one Double.
    BigByteBuffer buf = IoUtils.inputStreamToBigByteBuffer(getClass().getResourceAsStream(CSV_SIMPLE_CLASSPATH));
    colInfo.registerColumnType(CSV_COL_B, ColumnType.LONG);
    colInfo.registerColumnType(CSV_COL_C, ColumnType.DOUBLE);

    // WHEN
    // loading table
    TableShard table = Iterables.getOnlyElement(csvLoader.load(0L, buf, "Test", colInfo));

    // THEN
    Assert.assertEquals(table.getNumberOfRowsInShard(), 4, "Expected 4 rows");
    Assert.assertNotNull(table.getStringColumns().get(CSV_COL_A), "Expected String column");
    Assert.assertNotNull(table.getLongColumns().get(CSV_COL_B), "Expected Long column");
    Assert.assertNotNull(table.getDoubleColumns().get(CSV_COL_C), "Expected Double column");

    @SuppressWarnings("unchecked")
    List<String> colAValues = resolveAllValues(table.getStringColumns().get(CSV_COL_A));
    @SuppressWarnings("unchecked")
    List<Long> colBValues = resolveAllValues(table.getLongColumns().get(CSV_COL_B));
    @SuppressWarnings("unchecked")
    List<Double> colCValues = resolveAllValues(table.getDoubleColumns().get(CSV_COL_C));

    assertNoDuplicates(colAValues);
    assertNoDuplicates(colBValues);
    assertNoDuplicates(colCValues);

    // Compare as sets: only the contained values are checked here, not their row order.
    Assert.assertEquals(new HashSet<String>(colAValues), new HashSet<String>(Arrays.asList("1", "2", "3", "4")),
        "Different values expected (inspect data type!)");
    Assert.assertEquals(new HashSet<Long>(colBValues), new HashSet<Long>(Arrays.asList(1L, 2L, 3L, 4L)),
        "Different values expected (inspect data type!)");
    Assert.assertEquals(new HashSet<Double>(colCValues), new HashSet<Double>(Arrays.asList(1., 2., 3., 4.)),
        "Different values expected (inspect data type!)");
  }

  /**
   * A CSV with exactly {@link ColumnShardBuilder#PROPOSAL_ROWS} rows is expected to end up in a single
   * {@link ColumnPage}.
   */
  @Test
  public void longCsvTestProposalRows() throws LoadException {
    // GIVEN a CSV with PROPOSAL_ROWS rows with numbers 0..PROPOSAL_ROWS-1
    // WHEN parsing this
    // THEN all values are available in exactly one page
    assertSingleLongColumnLoad(ColumnShardBuilder.PROPOSAL_ROWS, 1, "Only one ColumnPage expected");
  }

  /**
   * A CSV with one row more than {@link ColumnShardBuilder#PROPOSAL_ROWS} is expected to be split into two
   * {@link ColumnPage}s.
   */
  @Test
  public void longCsvTestProposalRowsPlusOne() throws LoadException {
    // GIVEN a CSV with PROPOSAL_ROWS+1 rows with numbers 0..PROPOSAL_ROWS
    // WHEN parsing this
    // THEN all values are available, spread over two pages
    assertSingleLongColumnLoad(ColumnShardBuilder.PROPOSAL_ROWS + 1, 2, "Two ColumnPages expected");
  }

  /**
   * A value that cannot be parsed into the registered column type has to make the loader throw a
   * {@link LoadException}.
   */
  @Test(expectedExceptions = LoadException.class)
  public void unparsableCsv() throws LoadException {
    // GIVEN
    // colB holds Long.MAX_VALUE with an additional digit appended, which does not fit into a long anymore.
    String csv = //
        CSV_COL_A + "," + CSV_COL_B + "\n" + //
            "1," + Long.MAX_VALUE + "9\n";
    colInfo.registerColumnType(CSV_COL_A, ColumnType.LONG);
    colInfo.registerColumnType(CSV_COL_B, ColumnType.LONG);

    // WHEN
    // Explicit charset: the produced bytes must not depend on the JVM's default charset.
    csvLoader.load(0L, new BigByteBuffer(csv.getBytes(StandardCharsets.UTF_8)), "Test", colInfo);

    // THEN: exception
  }

  /**
   * Generates a one-column CSV with the given number of rows, loads it with the column registered as
   * {@link ColumnType#LONG} and asserts row count, loaded values and number of {@link ColumnPage}s.
   *
   * @param rows
   *          Number of data rows to generate; the rows contain the values 0..rows-1.
   * @param expectedPages
   *          Number of pages the loaded column is expected to consist of.
   * @param pagesMessage
   *          Assertion message to use when the number of pages does not match.
   * @throws LoadException
   *           If the loader fails to load the generated CSV.
   */
  private void assertSingleLongColumnLoad(int rows, int expectedPages, String pagesMessage) throws LoadException {
    BigByteBuffer buf = generateCsvOneColumn(CSV_COL_A, rows);
    colInfo.registerColumnType(CSV_COL_A, ColumnType.LONG);

    TableShard shard = Iterables.getOnlyElement(csvLoader.load(0L, buf, "Test", colInfo));

    Assert.assertEquals(shard.getNumberOfRowsInShard(), rows, "Expected " + rows + " rows");
    @SuppressWarnings("unchecked")
    List<Long> colAValues = resolveAllValues(shard.getLongColumns().get(CSV_COL_A));
    Assert.assertEquals(new HashSet<>(colAValues), generateLongSetRange(0, rows), "Expected correct values");
    Assert.assertEquals(shard.getLongColumns().get(CSV_COL_A).getPages().size(), expectedPages, pagesMessage);
  }

  /**
   * Asserts that the given list does not contain any value more than once.
   */
  private static void assertNoDuplicates(List<?> values) {
    // A set drops duplicates, so equal sizes mean every value is unique.
    Assert.assertEquals(new HashSet<Object>(values).size(), values.size(), "Duplicate values in column not expected");
  }

  /**
   * Generates a {@link BigByteBuffer} containing a header line with the column name followed by a specific amount of
   * CSV rows, each row containing one of the numbers 0..rows-1, each number is exactly once in the CSV.
   *
   * @param colName
   *          Name of the single column, written as header line.
   * @param rows
   *          Number of data rows to generate.
   */
  private static BigByteBuffer generateCsvOneColumn(String colName, int rows) {
    // Local, single-threaded buffer: no need for the synchronized StringBuffer.
    StringBuilder sb = new StringBuilder(colName);
    sb.append('\n');
    for (int i = 0; i < rows; i++) {
      sb.append(i).append('\n');
    }
    // Explicit charset: the produced bytes must not depend on the JVM's default charset.
    return new BigByteBuffer(sb.toString().getBytes(StandardCharsets.UTF_8));
  }

  /**
   * Creates a set containing all longs of the given range.
   *
   * @param fromIncluded
   *          First value of the range (inclusive).
   * @param toExcluded
   *          End of the range (exclusive).
   */
  private static Set<Long> generateLongSetRange(long fromIncluded, long toExcluded) {
    Set<Long> res = new HashSet<>();
    for (long l = fromIncluded; l < toExcluded; l++) {
      res.add(l);
    }
    return res;
  }

  /**
   * Return decompressed values of all rows in given column.
   *
   * <p>
   * Resolution is done in two steps for each page: the values stored in the page are mapped through the page
   * dictionary, and the resulting IDs are then mapped through the dictionary of the column shard.
   *
   * @return List of either String, Long or Double, according to col.getColumnType().
   * @throws IllegalStateException
   *           If the column has a type other than STRING, LONG or DOUBLE.
   */
  @SuppressWarnings({ "rawtypes", "unchecked" })
  private static List resolveAllValues(StandardColumnShard col) {
    List res = new ArrayList();
    for (ColumnPage page : ((Map<Long, ColumnPage>) col.getPages()).values()) {
      // find column value IDs from column page value Ids
      List<Long> cvIds = Arrays.stream(PrimitiveUtils.toBoxedArray(page.getValues().decompressedArray()))
          .map(cpvId -> page.getColumnPageDict().decompressValue(cpvId)).collect(Collectors.toList());

      // resolve the actual values through the type-specific dictionary of the column shard; the dictionary is
      // looked up and cast once per page instead of once per value.
      switch (col.getColumnType()) {
      case STRING: {
        StringDictionary stringDict = (StringDictionary) col.getColumnShardDictionary();
        res.addAll(cvIds.stream().map(cvId -> stringDict.decompressValue(cvId)).collect(Collectors.toList()));
        break;
      }
      case LONG: {
        LongDictionary longDict = (LongDictionary) col.getColumnShardDictionary();
        res.addAll(cvIds.stream().map(cvId -> longDict.decompressValue(cvId)).collect(Collectors.toList()));
        break;
      }
      case DOUBLE: {
        DoubleDictionary doubleDict = (DoubleDictionary) col.getColumnShardDictionary();
        res.addAll(cvIds.stream().map(cvId -> doubleDict.decompressValue(cvId)).collect(Collectors.toList()));
        break;
      }
      default:
        // Fail loudly instead of silently returning no values for a type this helper does not know.
        throw new IllegalStateException("Unsupported column type: " + col.getColumnType());
      }
    }
    return res;
  }
}