package org.embulk.spi.util;
import java.util.List;
import java.util.ArrayList;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import com.google.common.collect.ImmutableList;
import org.junit.Rule;
import org.junit.Before;
import org.junit.Test;
import static org.junit.Assert.assertEquals;
import org.embulk.config.ConfigSource;
import org.embulk.spi.Exec;
import org.embulk.spi.Buffer;
import org.embulk.spi.util.ListFileInput;
import org.embulk.EmbulkTestRuntime;
public class TestLineDecoder
{
@Rule
public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
@Test
public void testDefaultValues()
{
ConfigSource config = Exec.newConfigSource();
LineDecoder.DecoderTask task = config.loadConfig(LineDecoder.DecoderTask.class);
assertEquals(StandardCharsets.UTF_8, task.getCharset());
assertEquals(Newline.CRLF, task.getNewline());
}
@Test
public void testLoadConfig()
{
ConfigSource config = Exec.newConfigSource()
.set("charset", "utf-16")
.set("newline", "CRLF");
LineDecoder.DecoderTask task = config.loadConfig(LineDecoder.DecoderTask.class);
assertEquals(StandardCharsets.UTF_16, task.getCharset());
assertEquals(Newline.CRLF, task.getNewline());
}
private static LineDecoder.DecoderTask getExampleConfig(Charset charset, Newline newline)
{
ConfigSource config = Exec.newConfigSource()
.set("charset", charset)
.set("newline", newline);
return config.loadConfig(LineDecoder.DecoderTask.class);
}
private static LineDecoder newDecoder(Charset charset, Newline newline, List<Buffer> buffers)
{
ListFileInput input = new ListFileInput(ImmutableList.of(buffers));
return new LineDecoder(input, getExampleConfig(charset, newline));
}
private static List<String> doDecode(Charset charset, Newline newline, List<Buffer> buffers)
{
ImmutableList.Builder<String> builder = ImmutableList.builder();
LineDecoder decoder = newDecoder(charset, newline, buffers);
decoder.nextFile();
while (true) {
String line = decoder.poll();
if (line == null) {
break;
}
builder.add(line);
}
return builder.build();
}
private static List<Buffer> bufferList(Charset charset, String... sources) throws UnsupportedCharsetException
{
List<Buffer> buffers = new ArrayList<Buffer>();
for (String source : sources) {
ByteBuffer buffer = charset.encode(source);
buffers.add(Buffer.wrap(buffer.array(), 0, buffer.limit()));
}
return buffers;
}
@Test
public void testDecodeBasicAscii() throws Exception
{
List<String> decoded = doDecode(
StandardCharsets.UTF_8, Newline.LF,
bufferList(StandardCharsets.UTF_8, "test1\ntest2\ntest3\n"));
assertEquals(ImmutableList.of("test1", "test2", "test3"), decoded);
}
@Test
public void testDecodeBasicAsciiCRLF() throws Exception
{
List<String> decoded = doDecode(
StandardCharsets.UTF_8, Newline.CRLF,
bufferList(StandardCharsets.UTF_8, "test1\r\ntest2\r\ntest3\r\n"));
assertEquals(ImmutableList.of("test1", "test2", "test3"), decoded);
}
@Test
public void testDecodeBasicAsciiTail() throws Exception
{
List<String> decoded = doDecode(
StandardCharsets.UTF_8, Newline.LF,
bufferList(StandardCharsets.UTF_8, "test1"));
assertEquals(ImmutableList.of("test1"), decoded);
}
@Test
public void testDecodeChunksLF() throws Exception
{
List<String> decoded = doDecode(
StandardCharsets.UTF_8, Newline.LF,
bufferList(StandardCharsets.UTF_8, "t", "1", "\n", "t", "2"));
assertEquals(ImmutableList.of("t1", "t2"), decoded);
}
@Test
public void testDecodeChunksCRLF() throws Exception
{
List<String> decoded = doDecode(
StandardCharsets.UTF_8, Newline.CRLF,
bufferList(StandardCharsets.UTF_8, "t", "1", "\r\n", "t", "2", "\r", "\n", "t3"));
assertEquals(ImmutableList.of("t1", "t2", "t3"), decoded);
}
@Test
public void testDecodeBasicUTF8() throws Exception
{
List<String> decoded = doDecode(
StandardCharsets.UTF_8, Newline.LF,
bufferList(StandardCharsets.UTF_8, "てすと1\nテスト2\nてすと3\n"));
assertEquals(ImmutableList.of("てすと1", "テスト2", "てすと3"), decoded);
}
@Test
public void testDecodeBasicUTF8Tail() throws Exception
{
List<String> decoded = doDecode(
StandardCharsets.UTF_8, Newline.LF,
bufferList(StandardCharsets.UTF_8, "てすと1"));
assertEquals(ImmutableList.of("てすと1"), decoded);
}
@Test
public void testDecodeChunksUTF8LF() throws Exception
{
List<String> decoded = doDecode(
StandardCharsets.UTF_8, Newline.LF,
bufferList(StandardCharsets.UTF_8, "て", "1", "\n", "す", "2"));
assertEquals(ImmutableList.of("て1", "す2"), decoded);
}
@Test
public void testDecodeChunksUTF8CRLF() throws Exception
{
List<String> decoded = doDecode(
StandardCharsets.UTF_8, Newline.CRLF,
bufferList(StandardCharsets.UTF_8, "て", "1", "\r\n", "す", "2", "\r", "\n", "と3"));
assertEquals(ImmutableList.of("て1", "す2", "と3"), decoded);
}
@Test
public void testDecodeBasicUTF16LE() throws Exception
{
List<String> decoded = doDecode(
StandardCharsets.UTF_16LE, Newline.LF,
bufferList(StandardCharsets.UTF_16LE, "てすと1\nテスト2\nてすと3\n"));
assertEquals(ImmutableList.of("てすと1", "テスト2", "てすと3"), decoded);
}
@Test
public void testDecodeBasicUTF16LETail() throws Exception
{
List<String> decoded = doDecode(
StandardCharsets.UTF_16LE, Newline.LF,
bufferList(StandardCharsets.UTF_16LE, "てすと1"));
assertEquals(ImmutableList.of("てすと1"), decoded);
}
@Test
public void testDecodeChunksUTF16LELF() throws Exception
{
List<String> decoded = doDecode(
StandardCharsets.UTF_16LE, Newline.LF,
bufferList(StandardCharsets.UTF_16LE, "て", "1", "\n", "す", "2"));
assertEquals(ImmutableList.of("て1", "す2"), decoded);
}
@Test
public void testDecodeChunksUTF16LECRLF() throws Exception
{
List<String> decoded = doDecode(
StandardCharsets.UTF_16LE, Newline.CRLF,
bufferList(StandardCharsets.UTF_16LE, "て", "1", "\r\n", "す", "2", "\r", "\n", "と3"));
assertEquals(ImmutableList.of("て1", "す2", "と3"), decoded);
}
@Test
public void testDecodeBasicMS932() throws Exception
{
List<String> decoded = doDecode(
Charset.forName("ms932"), Newline.LF,
bufferList(Charset.forName("ms932"), "てすと1\nテスト2\nてすと3\n"));
assertEquals(ImmutableList.of("てすと1", "テスト2", "てすと3"), decoded);
}
@Test
public void testDecodeBasicMS932Tail() throws Exception
{
List<String> decoded = doDecode(
Charset.forName("ms932"), Newline.LF,
bufferList(Charset.forName("ms932"), "てすと1"));
assertEquals(ImmutableList.of("てすと1"), decoded);
}
@Test
public void testDecodeChunksMS932LF() throws Exception
{
List<String> decoded = doDecode(
Charset.forName("ms932"), Newline.LF,
bufferList(Charset.forName("ms932"), "て", "1", "\n", "す", "2"));
assertEquals(ImmutableList.of("て1", "す2"), decoded);
}
@Test
public void testDecodeChunksMS932CRLF() throws Exception
{
List<String> decoded = doDecode(
Charset.forName("ms932"), Newline.CRLF,
bufferList(Charset.forName("ms932"), "て", "1", "\r\n", "す", "2", "\r", "\n", "と3"));
assertEquals(ImmutableList.of("て1", "す2", "と3"), decoded);
}
}