package org.icij.extract.spewer;

import java.io.IOException;
import java.io.InputStream;
import java.io.ByteArrayOutputStream;
import java.io.ByteArrayInputStream;
import java.io.PrintStream;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import org.apache.tika.exception.TikaException;

import org.icij.extract.document.DocumentFactory;
import org.icij.extract.document.PathIdentifier;
import org.icij.extract.parser.ParsingReader;

import org.junit.Test;
import org.junit.Assert;

/**
 * Tests for {@link PrintStreamSpewer}.
 *
 * <p>Every test feeds a single dollar sign through a {@link ParsingReader}, has the spewer write
 * it to an in-memory {@link PrintStream}, and then inspects the bytes that were captured. The
 * tests differ only in the encoding of the input bytes or in the output encoding requested from
 * the spewer.
 */
public class PrintStreamSpewerTest {

    /** Name handed to the {@link ParsingReader} in every test. */
    private static final String SOURCE_NAME = "imaginary-file.txt";

    /** Text expected in the sink when the captured bytes are decoded as UTF-8. */
    private static final String EXPECTED_TEXT = "$\n\n";

    private final DocumentFactory factory = new DocumentFactory().withIdentifier(new PathIdentifier());

    /** A UTF-8 encoded dollar sign is written out as a dollar sign followed by a line feed. */
    @Test
    public void testWrite() throws IOException, TikaException {
        final ByteArrayOutputStream sink = new ByteArrayOutputStream();
        final Spewer spewer = newSpewer(sink);
        final ParsingReader reader = openReader("$".getBytes(StandardCharsets.UTF_8));

        spewer.write(factory.create(SOURCE_NAME), reader);

        assertPlainDollarOutput(sink);
    }

    /** Input carrying a UTF-16LE byte order mark yields the same output as the UTF-8 case. */
    @Test
    public void testWriteFromUTF16LE() throws IOException, TikaException {
        final ByteArrayOutputStream sink = new ByteArrayOutputStream();
        final Spewer spewer = newSpewer(sink);

        // Byte order mark FF FE, then the dollar sign in little-endian order.
        final byte[] contents = new byte[] {(byte) 0xFF, (byte) 0xFE, 0x24, 0x00};
        final ParsingReader reader = openReader(contents);

        spewer.write(factory.create(SOURCE_NAME), reader);

        assertPlainDollarOutput(sink);
    }

    /** Input carrying a UTF-16BE byte order mark yields the same output as the UTF-8 case. */
    @Test
    public void testWriteFromUTF16BE() throws IOException, TikaException {
        final ByteArrayOutputStream sink = new ByteArrayOutputStream();
        final Spewer spewer = newSpewer(sink);

        // Byte order mark FE FF, then the dollar sign in big-endian order.
        final byte[] contents = new byte[] {(byte) 0xFE, (byte) 0xFF, 0x00, 0x24};
        final ParsingReader reader = openReader(contents);

        spewer.write(factory.create(SOURCE_NAME), reader);

        assertPlainDollarOutput(sink);
    }

    /** With UTF-16LE requested as output encoding, the sink starts with {@code 24 00 0A 00}. */
    @Test
    public void testWriteToUTF16LE() throws IOException, TikaException {
        final ByteArrayOutputStream sink = new ByteArrayOutputStream();
        final Spewer spewer = newSpewer(sink);

        // The file contents are a single dollar sign ($).
        final String contents = "\u0024";

        // Tika parsers always output UTF-8.
        final ParsingReader reader = openReader(contents.getBytes(StandardCharsets.UTF_8));

        spewer.setOutputEncoding(StandardCharsets.UTF_16LE);
        spewer.write(factory.create("test-file"), reader);

        assertLeadingBytes(new byte[] {0x24, 0x00, 0x0A, 0x00}, sink);
    }

    /** With UTF-16BE requested as output encoding, the sink starts with {@code 00 24 00 0A}. */
    @Test
    public void testWriteToUTF16BE() throws IOException, TikaException {
        final ByteArrayOutputStream sink = new ByteArrayOutputStream();
        final Spewer spewer = newSpewer(sink);

        // The file contents are a single dollar sign ($).
        final String contents = "\u0024";

        // Tika parsers always output UTF-8.
        final ParsingReader reader = openReader(contents.getBytes(StandardCharsets.UTF_8));

        spewer.setOutputEncoding(StandardCharsets.UTF_16BE);
        spewer.write(factory.create("test-file"), reader);

        assertLeadingBytes(new byte[] {0x00, 0x24, 0x00, 0x0A}, sink);
    }

    /**
     * Builds a spewer that prints into {@code sink} and has metadata output switched off.
     *
     * @param sink the in-memory stream that captures everything the spewer prints
     * @return the configured spewer
     */
    private static Spewer newSpewer(final ByteArrayOutputStream sink) {
        final PrintStream printStream = new PrintStream(sink);
        final Spewer spewer = new PrintStreamSpewer(printStream, new FieldNames());

        spewer.outputMetadata(false);
        return spewer;
    }

    /**
     * Wraps raw file contents in a {@link ParsingReader} named {@link #SOURCE_NAME}.
     *
     * @param contents the bytes of the imaginary file
     * @return a reader over the given bytes
     */
    private static ParsingReader openReader(final byte[] contents) throws IOException, TikaException {
        final InputStream inputStream = new ByteArrayInputStream(contents);

        return new ParsingReader(inputStream, SOURCE_NAME);
    }

    /**
     * Asserts that the sink decodes, as UTF-8, to {@link #EXPECTED_TEXT} and that its first two
     * bytes are a dollar sign and a line feed.
     *
     * @param sink the stream the spewer wrote to
     */
    private static void assertPlainDollarOutput(final ByteArrayOutputStream sink) throws IOException {
        Assert.assertEquals(EXPECTED_TEXT, sink.toString(StandardCharsets.UTF_8.name()));
        assertLeadingBytes(new byte[] {0x24, 0x0A}, sink);
    }

    /**
     * Asserts that the sink begins with exactly the given bytes.
     *
     * @param expected the bytes the output must start with
     * @param sink the stream the spewer wrote to
     */
    private static void assertLeadingBytes(final byte[] expected, final ByteArrayOutputStream sink) {
        final byte[] leading = Arrays.copyOfRange(sink.toByteArray(), 0, expected.length);

        Assert.assertArrayEquals(expected, leading);
    }
}