TestCsvTokenizer.java example

Explorer
EMB-master
- embulk-master
package org.embulk.standards;

import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;

import com.fasterxml.jackson.databind.node.JsonNodeFactory;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import org.embulk.EmbulkTestRuntime;
import org.embulk.config.ConfigSource;
import org.embulk.spi.Buffer;
import org.embulk.spi.FileInput;
import org.embulk.spi.Column;
import org.embulk.spi.Schema;
import org.embulk.spi.Exec;
import org.embulk.spi.util.LineDecoder;
import org.embulk.spi.util.ListFileInput;

public class TestCsvTokenizer
{
    @Rule
    public EmbulkTestRuntime runtime = new EmbulkTestRuntime();

    protected ConfigSource config;
    protected CsvParserPlugin.PluginTask task;

    @Before
    public void setup() {
        config = Exec.newConfigSource()
            .set("newline", "LF")
            .set("columns", ImmutableList.of(
                        ImmutableMap.<String,Object>of(
                            "name", "date_code", "type", "string", "option", ImmutableMap.of()),
                        ImmutableMap.<String,Object>of(
                            "name", "foo", "type", "string", "option", ImmutableMap.of()))
                );
        reloadPluginTask();
    }

    private void reloadPluginTask()
    {
        task = config.loadConfig(CsvParserPlugin.PluginTask.class);
    }

    private static FileInput newFileInputFromLines(CsvParserPlugin.PluginTask task, String... lines)
    {
        List<Buffer> buffers = new ArrayList<>();
        for (String line : lines) {
            byte[] buffer = (line + task.getNewline().getString()).getBytes(task.getCharset());
            buffers.add(Buffer.wrap(buffer));
        }
        return new ListFileInput(ImmutableList.of(buffers));
    }

    private static FileInput newFileInputFromText(CsvParserPlugin.PluginTask task, String text)
    {
        return new ListFileInput(
                ImmutableList.of(ImmutableList.of(
                        Buffer.wrap(text.getBytes(task.getCharset())))));
    }

    private static List<List<String>> parse(CsvParserPlugin.PluginTask task, String... lines)
    {
        return parse(task, newFileInputFromLines(task, lines));
    }

    private static List<List<String>> parse(CsvParserPlugin.PluginTask task, FileInput input)
    {
        LineDecoder decoder = new LineDecoder(input, task);
        CsvTokenizer tokenizer = new CsvTokenizer(decoder, task);
        Schema schema = task.getSchemaConfig().toSchema();

        tokenizer.nextFile();

        List<List<String>> records = new ArrayList<>();
        while (tokenizer.nextRecord()) {
            List<String> record = new ArrayList<>();
            for (Column c : schema.getColumns()) {
                String v = tokenizer.nextColumnOrNull();
                record.add(v);
            }
            records.add(record);
        }
        return records;
    }

    private List<List<String>> expectedRecords(int columnCount, String... values)
    {
        List<List<String>> records = new ArrayList<>();
        List<String> columns = null;
        for (int i=0; i < values.length; i++) {
            if (i % columnCount == 0) {
                columns = new ArrayList<String>();
                records.add(columns);
            }
            columns.add(values[i]);
        }
        return records;
    }

    @Test
    public void testSimple() throws Exception
    {
        assertEquals(expectedRecords(2,
                    "aaa", "bbb",
                    "ccc", "ddd"),
                parse(task,
                    "aaa,bbb",
                    "ccc,ddd"));
    }

    @Test
    public void testSkipEmptyLine() throws Exception
    {
        assertEquals(expectedRecords(2,
                    "aaa", "bbb",
                    "ccc", "ddd"),
                parse(task,
                    "", "aaa,bbb", "", "",
                    "ccc,ddd", "", ""));
    }

    @Test
    public void parseEmptyColumnsToNull() throws Exception
    {
        assertEquals(expectedRecords(2,
                    null, null,
                    "", "",
                    "  ", "  "), // not trimmed
                parse(task,
                    ",",
                    "\"\",\"\"",
                    "  ,  "));
    }

    @Test
    public void parseEmptyColumnsToNullTrimmed() throws Exception
    {
        config.set("trim_if_not_quoted", true);
        reloadPluginTask();
        assertEquals(
                expectedRecords(2,
                    null, null,
                    "", "",
                    null, null),  // trimmed
                parse(task,
                    ",",
                    "\"\",\"\"",
                    "  ,  "));
    }

    @Test
    public void testMultilineQuotedValueWithEmptyLine() throws Exception
    {
        assertEquals(expectedRecords(2,
                    "a", "\nb\n\n",
                    "c", "d"),
                parse(task,
                    "",
                    "a,\"", "b", "", "\"",
                    "c,d"));
    }

    @Test
    public void testEndOfFileWithoutNewline() throws Exception
    {
        // In RFC 4180, the last record in the file may or may not have
        // an ending line break.
        assertEquals(expectedRecords(2,
                        "aaa", "bbb",
                        "ccc", "ddd"),
            parse(task, newFileInputFromText(task,
                    "aaa,bbb\nccc,ddd")));
    }

    @Test
    public void testChangeDelimiter() throws Exception
    {
        config.set("delimiter", JsonNodeFactory.instance.textNode("\t")); // TSV format
        reloadPluginTask();
        assertEquals(expectedRecords(2,
                        "aaa", "bbb",
                        "ccc", "ddd"),
            parse(task,
                    "aaa\tbbb",
                    "ccc\tddd"));
    }

    @Test
    public void testDefaultNullString() throws Exception
    {
        reloadPluginTask();
        assertEquals(expectedRecords(2,
                        null, "",
                        "NULL", "NULL"),
                parse(task,
                        ",\"\"",
                        "NULL,\"NULL\""));
    }

    @Test
    public void testChangeNullString() throws Exception
    {
        config.set("null_string", "NULL");
        reloadPluginTask();
        assertEquals(expectedRecords(2,
                        "", "",
                        null, null),
                parse(task,
                        ",\"\"",
                        "NULL,\"NULL\""));
    }

    @Test
    public void testQuotedValues() throws Exception
    {
        assertEquals(expectedRecords(2,
                        "a\na\na", "b,bb",
                        "cc\"c", "\"ddd",
                        null, ""),
            parse(task, newFileInputFromText(task,
                "\n\"a\na\na\",\"b,bb\"\n\n\"cc\"\"c\",\"\"\"ddd\"\n,\"\"\n")));
    }

    @Test
    public void parseEscapedValues() throws Exception
    {
        assertEquals(expectedRecords(2,
                        "a\"aa", "b,bb\"",
                        "cc\"c", "\"ddd",
                        null, ""),
                parse(task, newFileInputFromText(task,
                    "\n\"a\\\"aa\",\"b,bb\\\"\"\n\n\"cc\"\"c\",\"\"\"ddd\"\n,\"\"\n")));
    }

    @Test
    public void testCommentLineMarker() throws Exception
    {
        config.set("comment_line_marker", JsonNodeFactory.instance.textNode("#"));
        reloadPluginTask();
        assertEquals(expectedRecords(2,
                        "aaa", "bbb",
                        "eee", "fff"),
            parse(task,
                    "aaa,bbb",
                    "#ccc,ddd",
                    "eee,fff"));
    }

    @Test
    public void trimNonQuotedValues() throws Exception
    {
        assertEquals(expectedRecords(2,
                    "  aaa  ", "  b cd ",
                    "  ccc","dd d \n "), // quoted values are not changed
                parse(task, newFileInputFromText(task,
                        "  aaa  ,  b cd \n\"  ccc\",\"dd d \n \"")));

        // trim_if_not_quoted is true
        config.set("trim_if_not_quoted", true);
        reloadPluginTask();
        assertEquals(expectedRecords(2,
                    "aaa", "b cd",
                    "  ccc","dd d \n "), // quoted values are not changed
                parse(task, newFileInputFromText(task,
                        "  aaa  ,  b cd \n\"  ccc\",\"dd d \n \"")));
    }

    @Test
    public void parseQuotedValueWithSpacesAndTrimmingOption() throws Exception
    {
        config.set("trim_if_not_quoted", true);
        reloadPluginTask();
        assertEquals(expectedRecords(2,
                        "heading1", "heading2",
                        "trailing1","trailing2",
                        "trailing\n3","trailing\n4"),
                parse(task,
                    "  \"heading1\",  \"heading2\"",
                    "\"trailing1\"  ,\"trailing2\"  ",
                    "\"trailing\n3\"  ,\"trailing\n4\"  "));
    }

    @Test
    public void throwQuotedSizeLimitExceededException() throws Exception
    {
        config.set("max_quoted_size_limit", 8);
        reloadPluginTask();

        try {
            parse(task,
                    "v1,v2",
                    "v3,\"0123456789\"");
            fail();
        } catch (Exception e) {
            assertTrue(e instanceof CsvTokenizer.QuotedSizeLimitExceededException);
        }

        // multi-line
        try {
            parse(task,
                    "v1,v2",
                    "\"012345\n6789\",v3");
            fail();
        } catch (Exception e) {
            assertTrue(e instanceof CsvTokenizer.QuotedSizeLimitExceededException);
        }
    }

    @Test
    public void recoverFromQuotedSizeLimitExceededException() throws Exception
    {
        config.set("max_quoted_size_limit", 12);
        reloadPluginTask();

        String[] lines = new String[] {
            "v1,v2",
            "v3,\"0123",  // this is a broken line and should be skipped
            "v4,v5",      // this line should be not be skiped
            "v6,v7",      // this line should be not be skiped
        };
        FileInput input = newFileInputFromLines(task, lines);
        LineDecoder decoder = new LineDecoder(input, task);
        CsvTokenizer tokenizer = new CsvTokenizer(decoder, task);
        Schema schema = task.getSchemaConfig().toSchema();

        tokenizer.nextFile();

        assertTrue(tokenizer.nextRecord());
        assertEquals("v1", tokenizer.nextColumn());
        assertEquals("v2", tokenizer.nextColumn());

        assertTrue(tokenizer.nextRecord());
        assertEquals("v3", tokenizer.nextColumn());
        try {
            tokenizer.nextColumn();
            fail();
        } catch (Exception e) {
            assertTrue(e instanceof CsvTokenizer.QuotedSizeLimitExceededException);
        }
        assertEquals("v3,\"0123", tokenizer.skipCurrentLine());

        assertTrue(tokenizer.nextRecord());
        assertEquals("v4", tokenizer.nextColumn());
        assertEquals("v5", tokenizer.nextColumn());

        assertTrue(tokenizer.nextRecord());
        assertEquals("v6", tokenizer.nextColumn());
        assertEquals("v7", tokenizer.nextColumn());
    }

    /*
    @Test
    public void parseEscapedQuotedValues() throws Exception
    {
        List<List<String>> parsed = doParse(task, bufferList("utf-8",
                "\"aa,a\",\",aaa\",\"aaa,\"", "\n",
                "\"bb\"\"b\",\"\"\"bbb\",\"bbb\"\"\"", "\n",
                "\"cc\\\"c\",\"\\\"ccc\",\"ccc\\\"\"", "\n",
                "\"dd\nd\",\"\nddd\",\"ddd\n\"", "\n"));
        assertEquals(Arrays.asList(
                        Arrays.asList("aa,a", ",aaa", "aaa,"),
                        Arrays.asList("bb\"b", "\"bbb", "bbb\""),
                        Arrays.asList("cc\"c", "\"ccc", "ccc\""),
                        Arrays.asList("dd\nd", "\nddd", "ddd\n")),
                parsed);
    }
    */
}