package edu.washington.escience.myria.operator; import static org.junit.Assert.assertEquals; import java.io.ByteArrayOutputStream; import java.io.FileNotFoundException; import java.io.PrintStream; import java.nio.file.Paths; import org.junit.Test; import com.google.common.collect.ImmutableList; import edu.washington.escience.myria.CsvTupleReader; import edu.washington.escience.myria.DbException; import edu.washington.escience.myria.Schema; import edu.washington.escience.myria.Type; import edu.washington.escience.myria.io.ByteArraySource; import edu.washington.escience.myria.io.FileSource; import edu.washington.escience.myria.storage.TupleBatch; import edu.washington.escience.myria.storage.TupleUtils; import edu.washington.escience.myria.util.TestEnvVars; public class CsvTupleReaderTest { /** * Helper function used to run tests. * * @param filename the file in which the relation is stored. * @param schema the schema of the relation in the file. * @return the number of rows in the file. * @throws DbException if the file does not match the given Schema. * @throws InterruptedException */ private static int getRowCount(final String filename, final Schema schema) throws DbException, InterruptedException { return getRowCount(filename, schema, null); } /** * Helper function used to run tests. * * @param filename the file in which the relation is stored. * @param schema the schema of the relation in the file. * @param delimiter if non-null, an override file delimiter * @return the number of rows in the file. * @throws DbException if the file does not match the given Schema. * @throws FileNotFoundException if the specified file does not exist. * @throws InterruptedException */ private static int getRowCount( final String filename, final Schema schema, final Character delimiter) throws DbException, InterruptedException { return getRowCount(filename, schema, delimiter, null, null); } /** * @param filename the file in which the relation is stored. * @param schema the schema of the relation in the file. * @param delimiter if non-null, an override file delimiter * @param quote the user specified quotation mark * @return the number of rows in the file. * @throws DbException if the file does not match the given Schema. * @throws FileNotFoundException if the specified file does not exist. * @throws InterruptedException */ private static int getRowCount( final String filename, final Schema schema, final Character delimiter, final Character quote) throws DbException, InterruptedException { return getRowCount(filename, schema, delimiter, quote, null); } /** * @param filename the file in which the relation is stored. * @param schema the schema of the relation in the file. * @param delimiter if non-null, an override file delimiter * @param quote the user specified quotation mark * @return the number of rows in the file. * @throws DbException if the file does not match the given Schema. * @throws FileNotFoundException if the specified file does not exist. * @throws InterruptedException */ private static int getRowCount( final String filename, final Schema schema, final Character delimiter, final Character quote, final Character escape) throws DbException, InterruptedException { final String realFilename = Paths.get("testdata", "filescan", filename).toString(); TupleSource dataInput = new TupleSource( new CsvTupleReader(schema, delimiter, quote, escape, null), new FileSource(realFilename)); return getRowCount(dataInput); } /** * Helper function used to run tests. * * @param fileScan the FileScan object to be tested. * @return the number of rows in the file. * @throws DbException if the file does not match the given Schema. * @throws InterruptedException */ private static int getRowCount(final TupleSource dataInput) throws DbException, InterruptedException { dataInput.open(TestEnvVars.get()); int count = 0; TupleBatch tb = null; while (!dataInput.eos()) { tb = dataInput.nextReady(); if (tb != null) { count += tb.numTuples(); } } return count; } @Test public void testSimpleCsvEscape() throws DbException, InterruptedException { final String filename = "two_col_string.txt"; final Schema schema = new Schema(ImmutableList.of(Type.STRING_TYPE, Type.STRING_TYPE)); assertEquals(7, getRowCount(filename, schema)); } @Test public void testSimpleCsvQuoted() throws DbException, InterruptedException { final String filename = "two_col_string_quoted.txt"; final Schema schema = new Schema(ImmutableList.of(Type.STRING_TYPE, Type.STRING_TYPE)); assertEquals(7, getRowCount(filename, schema)); } @Test public void testSimpleCsvSingleQuoted() throws DbException, InterruptedException { final String filename = "two_col_string_single_quoted.txt"; final Schema schema = new Schema(ImmutableList.of(Type.STRING_TYPE, Type.STRING_TYPE)); assertEquals(7, getRowCount(filename, schema, null, '\'')); } @Test public void testSimpleCsvEscaped() throws DbException, InterruptedException { final String filename = "two_col_string_escaped.txt"; final Schema schema = new Schema(ImmutableList.of(Type.STRING_TYPE, Type.STRING_TYPE)); assertEquals(7, getRowCount(filename, schema, null, null, '\\')); } @Test(expected = DbException.class) public void testBadCommaTwoColumnInt() throws DbException, InterruptedException { final String filename = "bad_comma_two_col_int_unix.txt"; final Schema schema = new Schema(ImmutableList.of(Type.INT_TYPE, Type.INT_TYPE)); assertEquals(7, getRowCount(filename, schema)); } @Test(expected = DbException.class) public void testBadTwoColumnInt() throws DbException, InterruptedException { final String filename = "bad_two_col_int.txt"; final Schema schema = new Schema(ImmutableList.of(Type.INT_TYPE, Type.INT_TYPE)); assertEquals(7, getRowCount(filename, schema)); } @Test(expected = DbException.class) public void testBadTwoColumnInt2() throws DbException, InterruptedException { final String filename = "bad_two_col_int_2.txt"; final Schema schema = new Schema(ImmutableList.of(Type.INT_TYPE, Type.INT_TYPE)); assertEquals(7, getRowCount(filename, schema)); } @Test(expected = DbException.class) public void testBadTwoColumnInt3() throws DbException, InterruptedException { final String filename = "bad_two_col_int_3.txt"; final Schema schema = new Schema(ImmutableList.of(Type.INT_TYPE, Type.INT_TYPE)); assertEquals(7, getRowCount(filename, schema)); } @Test public void testCommaTwoColumnIntUnix() throws DbException, InterruptedException { final String filename = "comma_two_col_int_unix.txt"; final Schema schema = new Schema(ImmutableList.of(Type.INT_TYPE, Type.INT_TYPE)); assertEquals(7, getRowCount(filename, schema, ',')); } @Test public void testCommaTwoColumnIntUnixNoTrailingNewline() throws DbException, InterruptedException { final String filename = "comma_two_col_int_unix_no_trailing_newline.txt"; final Schema schema = new Schema(ImmutableList.of(Type.INT_TYPE, Type.INT_TYPE)); assertEquals(7, getRowCount(filename, schema, ',')); } @Test public void testCommaTwoColumnIntDos() throws DbException, InterruptedException { final String filename = "comma_two_col_int_dos.txt"; final Schema schema = new Schema(ImmutableList.of(Type.INT_TYPE, Type.INT_TYPE)); int countRow = getRowCount(filename, schema, ','); assertEquals(7, countRow); } @Test public void testSimpleTwoColumnInt() throws DbException, InterruptedException { final String filename = "simple_two_col_int.txt"; final Schema schema = new Schema(ImmutableList.of(Type.INT_TYPE, Type.INT_TYPE)); assertEquals(7, getRowCount(filename, schema, ' ')); } @Test public void testSimpleTwoColumnFloat() throws Exception { final String filename = "simple_two_col_float.txt"; final Schema schema = new Schema(ImmutableList.of(Type.FLOAT_TYPE, Type.FLOAT_TYPE)); assertEquals(7, getRowCount(filename, schema, ' ')); } @Test public void testRandomCSV() throws Exception { // file generated using: // python testdata/generated/generate_csv.py 10000 --delimiter ' ' int int float str > testdata/filescan/random.csv final String filename = "random.csv"; final Schema schema = new Schema( ImmutableList.of(Type.INT_TYPE, Type.INT_TYPE, Type.FLOAT_TYPE, Type.STRING_TYPE)); assertEquals(10000, getRowCount(filename, schema, ' ')); } @Test public void testBigFile() throws Exception { ByteArrayOutputStream bytes = new ByteArrayOutputStream(); PrintStream printedBytes = new PrintStream(bytes); /* Print 2*TupleBatch.BATCH_SIZE lines */ for (int i = 0; i < TupleUtils.getBatchSize(Type.INT_TYPE) * 2; ++i) { printedBytes.print(i); printedBytes.print('\n'); } printedBytes.flush(); TupleSource scanBytes = new TupleSource( new CsvTupleReader( Schema.of(ImmutableList.of(Type.INT_TYPE), ImmutableList.of("col1"))), new ByteArraySource(bytes.toByteArray())); assertEquals(2 * TupleUtils.getBatchSize(Type.INT_TYPE), getRowCount(scanBytes)); } @Test public void testPipeDelimiter() throws Exception { final String filename = "nccdc_100.txt"; final Schema schema = new Schema( ImmutableList.of( Type.STRING_TYPE, Type.STRING_TYPE, Type.INT_TYPE, Type.INT_TYPE, Type.INT_TYPE, Type.INT_TYPE, Type.INT_TYPE)); assertEquals(100, getRowCount(filename, schema, '|')); } }