/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flume.serialization;
import com.google.common.base.Charsets;
import com.google.common.collect.Lists;
import com.google.common.io.Files;
import junit.framework.Assert;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedOutputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.nio.charset.MalformedInputException;
import java.util.List;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
/**
 * Unit tests for {@link ResettableFileInputStream}: plain byte/char reads,
 * multi-byte and surrogate-pair decoding across buffer boundaries, decode
 * error policies, mark/reset, resuming from a persisted position and seeking.
 *
 * <p>Every test works on a fresh temporary data file and a fresh position
 * tracker meta file, both created under {@code target/test/work}.
 */
public class TestResettableFileInputStream {

  /** When {@code true}, the per-test data and meta files are deleted in {@link #tearDown()}. */
  private static final boolean CLEANUP = true;

  /** Directory in which the per-test temporary files are created. */
  private static final File WORK_DIR =
      new File("target/test/work").getAbsoluteFile();

  private static final Logger logger =
      LoggerFactory.getLogger(TestResettableFileInputStream.class);

  /** Data file read by the stream under test. */
  private File file;

  /** Meta file handed to the {@link DurablePositionTracker}; must not exist when a test starts. */
  private File meta;

  /**
   * Creates the work directory, an empty data file and a unique (but
   * non-existent) meta file name.
   *
   * @throws Exception if the files cannot be created
   */
  @Before
  public void setup() throws Exception {
    // createParentDirs() creates the parents of its argument, i.e. WORK_DIR itself
    Files.createParentDirs(new File(WORK_DIR, "dummy"));
    file = File.createTempFile(getClass().getSimpleName(), ".txt", WORK_DIR);
    logger.info("Data file: {}", file);
    meta = File.createTempFile(getClass().getSimpleName(), ".avro", WORK_DIR);
    logger.info("PositionTracker meta file: {}", meta);
    // We want the filename but not the empty file
    assertTrue("Could not delete placeholder meta file " + meta, meta.delete());
  }

  /**
   * Removes the files created by {@link #setup()} (best effort) unless
   * {@link #CLEANUP} is disabled.
   *
   * @throws Exception never thrown by the current implementation
   */
  @After
  public void tearDown() throws Exception {
    if (CLEANUP) {
      // Fields may still be null if setup() failed half-way through.
      if (meta != null) {
        meta.delete();
      }
      if (file != null) {
        file.delete();
      }
    }
  }

  /**
   * Ensure that we can simply read bytes from a file.
   *
   * @throws IOException if the file cannot be written or read
   */
  @Test
  public void testBasicRead() throws IOException {
    String output = singleLineFileInit(file, Charsets.UTF_8);

    PositionTracker tracker = new DurablePositionTracker(meta, file.getPath());
    ResettableInputStream in = new ResettableFileInputStream(file, tracker);
    try {
      String result = readLine(in, output.length());
      assertEquals(output, result);

      String afterEOF = readLine(in, output.length());
      assertNull(afterEOF);
    } finally {
      in.close();
    }
  }

  /**
   * Ensure that we can simply read bytes from a file using the
   * {@code read()} method.
   *
   * @throws IOException if the file cannot be written or read
   */
  @Test
  public void testReadByte() throws IOException {
    byte[] bytes = new byte[255];
    for (int i = 0; i < 255; i++) {
      bytes[i] = (byte) i;
    }
    Files.write(bytes, file);

    PositionTracker tracker = new DurablePositionTracker(meta, file.getPath());
    ResettableInputStream in = new ResettableFileInputStream(file, tracker);
    try {
      for (int i = 0; i < 255; i++) {
        assertEquals(i, in.read());
      }
      assertEquals(-1, in.read());
    } finally {
      in.close();
    }
  }

  /**
   * Ensure that we can process lines that contain multi byte characters in
   * weird places such as at the end of a buffer.
   *
   * @throws IOException if the file cannot be written or read
   */
  @Test
  public void testMultiByteCharRead() throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    out.write("1234567".getBytes(Charsets.UTF_8));
    // write a multi byte char encompassing buffer boundaries
    generateUtf83ByteSequence(out);
    // buffer now contains 8 chars and 10 bytes total
    Files.write(out.toByteArray(), file);

    ResettableInputStream in = initInputStream(8, Charsets.UTF_8, DecodeErrorPolicy.FAIL);
    try {
      String result = readLine(in, 8);
      assertEquals("1234567\u0A93\n", result);
    } finally {
      in.close();
    }
  }

  /**
   * Ensure that we can process UTF-8 lines that contain surrogate pairs
   * even if they appear astride buffer boundaries.
   *
   * @throws IOException if the file cannot be written or read
   */
  @Test
  public void testUtf8SurrogatePairRead() throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    out.write("1234567".getBytes(Charsets.UTF_8));
    generateUtf8SurrogatePairSequence(out);
    // buffer now contains 9 chars (7 "normal" and 2 surrogates) and 11 bytes total
    // surrogate pair will encompass buffer boundaries
    Files.write(out.toByteArray(), file);

    ResettableInputStream in = initInputStream(8, Charsets.UTF_8, DecodeErrorPolicy.FAIL);
    try {
      String result = readLine(in, 9);
      assertEquals("1234567\uD83D\uDE18\n", result);
    } finally {
      in.close();
    }
  }

  /**
   * Ensure that we can process UTF-16 lines that contain surrogate pairs, even
   * preceded by a Byte Order Mark (BOM).
   *
   * @throws IOException if the file cannot be written or read
   */
  @Test
  public void testUtf16BOMAndSurrogatePairRead() throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    generateUtf16SurrogatePairSequence(out);
    // buffer now contains 1 BOM and 2 chars (1 surrogate pair) and 6 bytes total
    // (including 2-byte BOM)
    Files.write(out.toByteArray(), file);

    ResettableInputStream in = initInputStream(8, Charsets.UTF_16, DecodeErrorPolicy.FAIL);
    try {
      String result = readLine(in, 2);
      assertEquals("\uD83D\uDE18\n", result);
    } finally {
      in.close();
    }
  }

  /**
   * Ensure that we can process Shift_JIS lines that contain multi byte
   * Japanese chars even if they appear astride buffer boundaries.
   *
   * @throws IOException if the file cannot be written or read
   */
  @Test
  public void testShiftJisSurrogateCharRead() throws IOException {
    Charset shiftJis = Charset.forName("Shift_JIS");
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    out.write("1234567".getBytes(shiftJis));
    // write a multi byte char encompassing buffer boundaries
    generateShiftJis2ByteSequence(out);
    // buffer now contains 8 chars and 9 bytes total
    Files.write(out.toByteArray(), file);

    ResettableInputStream in = initInputStream(8, shiftJis, DecodeErrorPolicy.FAIL);
    try {
      String result = readLine(in, 8);
      assertEquals("1234567\u4E9C\n", result);
    } finally {
      in.close();
    }
  }

  /**
   * With {@link DecodeErrorPolicy#FAIL}, reading the malformed UTF-8 fixture
   * must raise a {@link MalformedInputException}.
   *
   * @throws IOException the expected {@link MalformedInputException}
   */
  @Test(expected = MalformedInputException.class)
  public void testUtf8DecodeErrorHandlingFailMalformed() throws IOException {
    ResettableInputStream in = initUtf8DecodeTest(DecodeErrorPolicy.FAIL);
    try {
      // Read the whole file and throw away the chars.
      readAllChars(in);
      fail("Expected MalformedInputException!");
    } finally {
      in.close();
    }
  }

  /**
   * With {@link DecodeErrorPolicy#IGNORE}, the malformed sequences of the
   * UTF-8 fixture are dropped from the decoded output.
   *
   * @throws IOException if the file cannot be written or read
   */
  @Test
  public void testUtf8DecodeErrorHandlingIgnore() throws IOException {
    ResettableInputStream in = initUtf8DecodeTest(DecodeErrorPolicy.IGNORE);
    try {
      assertEquals("Latin1: ()\nLong: ()\nNonUnicode: ()\n", readAllChars(in));
    } finally {
      in.close();
    }
  }

  /**
   * With {@link DecodeErrorPolicy#REPLACE}, the malformed sequences of the
   * UTF-8 fixture are replaced by U+FFFD. The number of replacement chars
   * emitted for the 5-octet sequence differs between JDKs older than 8 and
   * JDK 8 or newer, hence the two expected strings.
   *
   * @throws IOException if the file cannot be written or read
   */
  @Test
  public void testUtf8DecodeErrorHandlingReplace() throws IOException {
    ResettableInputStream in = initUtf8DecodeTest(DecodeErrorPolicy.REPLACE);
    try {
      String actual = readAllChars(in);

      String preJdk8ExpectedStr = "Latin1: (X)\nLong: (XXX)\nNonUnicode: (X)\n";
      String expectedStr = "Latin1: (X)\nLong: (XXX)\nNonUnicode: (XXXXX)\n";
      String template = isPreJdk8() ? preJdk8ExpectedStr : expectedStr;

      assertEquals(template.replaceAll("X", "\ufffd"), actual);
    } finally {
      in.close();
    }
  }

  /**
   * With {@link DecodeErrorPolicy#FAIL}, a lone 0x81 byte must raise a
   * {@link MalformedInputException}.
   *
   * @throws IOException the expected {@link MalformedInputException}
   */
  @Test(expected = MalformedInputException.class)
  public void testLatin1DecodeErrorHandlingFailMalformed() throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    generateLatin1InvalidSequence(out);
    Files.write(out.toByteArray(), file);

    ResettableInputStream in = initInputStream(DecodeErrorPolicy.FAIL);
    try {
      // Read the whole file and throw away the chars.
      readAllChars(in);
      fail("Expected MalformedInputException!");
    } finally {
      in.close();
    }
  }

  /**
   * With {@link DecodeErrorPolicy#REPLACE}, a lone 0x81 byte is replaced by
   * a single U+FFFD.
   *
   * @throws IOException if the file cannot be written or read
   */
  @Test
  public void testLatin1DecodeErrorHandlingReplace() throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    generateLatin1InvalidSequence(out);
    Files.write(out.toByteArray(), file);

    ResettableInputStream in = initInputStream(DecodeErrorPolicy.REPLACE);
    try {
      assertEquals("Invalid: (X)\n".replaceAll("X", "\ufffd"), readAllChars(in));
    } finally {
      in.close();
    }
  }

  /**
   * Ensure a reset() brings us back to the default mark (beginning of file).
   *
   * @throws IOException if the file cannot be written or read
   */
  @Test
  public void testReset() throws IOException {
    String output = singleLineFileInit(file, Charsets.UTF_8);

    PositionTracker tracker = new DurablePositionTracker(meta, file.getPath());
    ResettableInputStream in = new ResettableFileInputStream(file, tracker);
    try {
      String result1 = readLine(in, output.length());
      assertEquals(output, result1);

      in.reset();
      String result2 = readLine(in, output.length());
      assertEquals(output, result2);

      String result3 = readLine(in, output.length());
      assertNull("Should be null: " + result3, result3);
    } finally {
      in.close();
    }
  }

  /**
   * Ensure that marking and resetting works.
   *
   * @throws IOException if the file cannot be written or read
   */
  @Test
  public void testMarkReset() throws IOException {
    List<String> expected = multiLineFileInit(file, Charsets.UTF_8);

    final int maxLen = 100;

    PositionTracker tracker = new DurablePositionTracker(meta, file.getPath());
    ResettableInputStream in = new ResettableFileInputStream(file, tracker);
    try {
      String result0 = readLine(in, maxLen);
      assertEquals(expected.get(0), result0);

      // no mark set yet: reset goes back to the beginning of the file
      in.reset();

      String result0a = readLine(in, maxLen);
      assertEquals(expected.get(0), result0a);

      in.mark();

      String result1 = readLine(in, maxLen);
      assertEquals(expected.get(1), result1);

      // back to the mark taken after the first line
      in.reset();

      String result1a = readLine(in, maxLen);
      assertEquals(expected.get(1), result1a);

      in.mark();
    } finally {
      in.close();
    }
  }

  /**
   * Ensure that surrogate pairs work well with mark/reset.
   *
   * @throws IOException if the file cannot be written or read
   */
  @Test
  public void testMarkResetWithSurrogatePairs() throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    out.write("foo".getBytes(Charsets.UTF_8));
    generateUtf8SurrogatePairSequence(out);
    out.write("bar".getBytes(Charsets.UTF_8));
    Files.write(out.toByteArray(), file);

    PositionTracker tracker = new DurablePositionTracker(meta, file.getPath());
    ResettableInputStream in = new ResettableFileInputStream(file, tracker);
    try {
      Assert.assertEquals('f', in.readChar());
      Assert.assertEquals('o', in.readChar());
      in.mark();
      Assert.assertEquals('o', in.readChar());
      // read high surrogate
      Assert.assertEquals('\ud83d', in.readChar());
      // call reset in the middle of a surrogate pair
      in.reset();
      // will read low surrogate *before* reverting back to mark, to ensure
      // surrogate pair is properly read
      Assert.assertEquals('\ude18', in.readChar());
      // now back to marked position
      Assert.assertEquals('o', in.readChar());
      // read high surrogate again
      Assert.assertEquals('\ud83d', in.readChar());
      // call mark in the middle of a surrogate pair:
      // will mark the position *after* the pair, *not* low surrogate's position
      in.mark();
      // will reset to the position *after* the pair
      in.reset();
      // read low surrogate normally despite of reset being called
      // so that the pair is entirely read
      Assert.assertEquals('\ude18', in.readChar());
      Assert.assertEquals('b', in.readChar());
      Assert.assertEquals('a', in.readChar());
      // will reset to the position *after* the pair
      in.reset();
      Assert.assertEquals('b', in.readChar());
      Assert.assertEquals('a', in.readChar());
      Assert.assertEquals('r', in.readChar());
      Assert.assertEquals(-1, in.readChar());
    } finally {
      in.close();
      tracker.close(); // redundant
    }
  }

  /**
   * Ensure that a new stream backed by the same meta file resumes reading
   * from the position that was last marked by a previous stream.
   *
   * @throws IOException if the file cannot be written or read
   */
  @Test
  public void testResume() throws IOException {
    List<String> expected = multiLineFileInit(file, Charsets.UTF_8);

    final int maxLen = 100;

    PositionTracker tracker = new DurablePositionTracker(meta, file.getPath());
    ResettableInputStream in = new ResettableFileInputStream(file, tracker);

    String result2;
    String result3;
    try {
      // skip the first two lines, then persist the position
      readLine(in, maxLen);
      readLine(in, maxLen);
      in.mark();

      result2 = readLine(in, maxLen);
      Assert.assertEquals(expected.get(2), result2);
      result3 = readLine(in, maxLen);
      Assert.assertEquals(expected.get(3), result3);
    } finally {
      in.close();
      tracker.close(); // redundant
    }

    // create new Tracker & RIS
    tracker = new DurablePositionTracker(meta, file.getPath());
    in = new ResettableFileInputStream(file, tracker);
    try {
      String result2a = readLine(in, maxLen);
      String result3a = readLine(in, maxLen);

      Assert.assertEquals(result2, result2a);
      Assert.assertEquals(result3, result3a);
    } finally {
      in.close();
    }
  }

  /**
   * Ensure that surrogate pairs work well when resuming
   * reading. Specifically, this test brings up special situations
   * where a surrogate pair cannot be correctly decoded because
   * the second character is lost.
   *
   * @throws IOException if the file cannot be written or read
   */
  @Test
  public void testResumeWithSurrogatePairs() throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    out.write("foo".getBytes(Charsets.UTF_8));
    generateUtf8SurrogatePairSequence(out);
    out.write("bar".getBytes(Charsets.UTF_8));
    Files.write(out.toByteArray(), file);

    PositionTracker tracker = new DurablePositionTracker(meta, file.getPath());
    ResettableInputStream in = new ResettableFileInputStream(file, tracker);
    try {
      Assert.assertEquals('f', in.readChar());
      Assert.assertEquals('o', in.readChar());
      in.mark();
      Assert.assertEquals('o', in.readChar());
      // read high surrogate
      Assert.assertEquals('\ud83d', in.readChar());
      // call reset in the middle of a surrogate pair
      in.reset();
    } finally {
      // close RIS - this will cause the low surrogate char
      // stored in-memory to be lost
      in.close();
      tracker.close(); // redundant
    }

    // create new Tracker & RIS
    tracker = new DurablePositionTracker(meta, file.getPath());
    in = new ResettableFileInputStream(file, tracker);
    try {
      // low surrogate char is now lost - resume from marked position
      Assert.assertEquals('o', in.readChar());
      // read high surrogate again
      Assert.assertEquals('\ud83d', in.readChar());
      // call mark in the middle of a surrogate pair:
      // will mark the position *after* the pair, *not* low surrogate's position
      in.mark();
    } finally {
      // close RIS - this will cause the low surrogate char
      // stored in-memory to be lost
      in.close();
      tracker.close(); // redundant
    }

    // create new Tracker & RIS
    tracker = new DurablePositionTracker(meta, file.getPath());
    in = new ResettableFileInputStream(file, tracker);
    try {
      // low surrogate char is now lost - resume from marked position
      Assert.assertEquals('b', in.readChar());
      Assert.assertEquals('a', in.readChar());
      Assert.assertEquals('r', in.readChar());
      Assert.assertEquals(-1, in.readChar());
    } finally {
      in.close();
      tracker.close(); // redundant
    }
  }

  /**
   * Ensure that seeking works both inside and outside of the currently
   * buffered region, forwards and backwards. The data file holds fixed-width
   * lines whose first ten characters are the zero-based line number.
   *
   * @throws IOException if the file cannot be written or read
   */
  @Test
  public void testSeek() throws IOException {
    final int numLines = 1000;
    final int lineLen = 1000;
    generateData(file, Charsets.UTF_8, numLines, lineLen);

    PositionTracker tracker = new DurablePositionTracker(meta, file.getPath());
    // the buffer holds exactly ten lines
    ResettableInputStream in = new ResettableFileInputStream(file, tracker,
        10 * lineLen, Charsets.UTF_8, DecodeErrorPolicy.FAIL);
    try {
      String line = "";
      for (int i = 0; i < 9; i++) {
        line = readLine(in, lineLen);
      }
      int lineNum = Integer.parseInt(line.substring(0, 10));
      assertEquals(8, lineNum);

      // seek back within our buffer
      long pos = in.tell();
      in.seek(pos - 2 * lineLen); // jump back 2 lines

      line = readLine(in, lineLen);
      lineNum = Integer.parseInt(line.substring(0, 10));
      assertEquals(7, lineNum);

      // seek forward within our buffer
      in.seek(in.tell() + lineLen);
      line = readLine(in, lineLen);
      lineNum = Integer.parseInt(line.substring(0, 10));
      assertEquals(9, lineNum);

      // seek forward outside our buffer
      in.seek(in.tell() + 20 * lineLen);
      line = readLine(in, lineLen);
      lineNum = Integer.parseInt(line.substring(0, 10));
      assertEquals(30, lineNum);

      // seek backward outside our buffer
      in.seek(in.tell() - 25 * lineLen);
      line = readLine(in, lineLen);
      lineNum = Integer.parseInt(line.substring(0, 10));
      assertEquals(6, lineNum);

      // test a corner-case seek which requires a buffer refill
      in.seek(100 * lineLen);
      in.seek(0); // reset buffer

      in.seek(9 * lineLen);
      assertEquals(9, Integer.parseInt(readLine(in, lineLen).substring(0, 10)));
      assertEquals(10, Integer.parseInt(readLine(in, lineLen).substring(0, 10)));
      assertEquals(11, Integer.parseInt(readLine(in, lineLen).substring(0, 10)));
    } finally {
      in.close();
    }
  }

  /**
   * Tells whether the running JVM implements a Java specification older
   * than 1.8.
   *
   * <p>Uses {@code java.specification.version} ("1.7", "1.8", "9", "17", ...)
   * rather than slicing {@code java.version}, whose value may be shorter than
   * three characters (for instance "9" or "17").
   *
   * @return {@code true} on Java 7 and older
   */
  private static boolean isPreJdk8() {
    String specVersion = System.getProperty("java.specification.version");
    return Double.parseDouble(specVersion) < 1.8;
  }

  /**
   * Reads chars from the stream until {@code readChar()} returns -1.
   *
   * @param in stream to drain
   * @return every char read, in order
   * @throws IOException if reading or decoding fails
   */
  private static String readAllChars(ResettableInputStream in)
      throws IOException {
    StringBuilder sb = new StringBuilder();
    int c;
    while ((c = in.readChar()) != -1) {
      sb.append((char) c);
    }
    return sb.toString();
  }

  /**
   * Writes the malformed UTF-8 fixture to the data file and opens it.
   *
   * @param policy decode error policy of the returned stream
   * @return a stream over the fixture; the caller must close it
   * @throws IOException if the file cannot be written or opened
   */
  private ResettableInputStream initUtf8DecodeTest(DecodeErrorPolicy policy)
      throws IOException {
    writeBigBadUtf8Sequence(file);
    return initInputStream(policy);
  }

  /**
   * Opens the data file as UTF-8 with a 2048 byte buffer.
   *
   * @param policy decode error policy of the returned stream
   * @return a stream over the data file; the caller must close it
   * @throws IOException if the file cannot be opened
   */
  private ResettableInputStream initInputStream(DecodeErrorPolicy policy)
      throws IOException {
    return initInputStream(2048, Charsets.UTF_8, policy);
  }

  /**
   * Opens the data file with a new {@link DurablePositionTracker} backed by
   * the meta file.
   *
   * @param bufferSize buffer size passed to the stream
   * @param charset charset used to decode the file
   * @param policy decode error policy of the returned stream
   * @return a stream over the data file; the caller must close it
   * @throws IOException if the file cannot be opened
   */
  private ResettableInputStream initInputStream(int bufferSize, Charset charset,
      DecodeErrorPolicy policy) throws IOException {
    PositionTracker tracker = new DurablePositionTracker(meta, file.getPath());
    return new ResettableFileInputStream(file, tracker, bufferSize, charset, policy);
  }

  /**
   * Writes three labelled lines to the given file, each of them wrapping a
   * byte sequence that is not well-formed UTF-8.
   *
   * @param file file to overwrite
   * @throws IOException if the file cannot be written
   */
  private void writeBigBadUtf8Sequence(File file) throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    generateUtf8Latin1Sequence(out);
    generateUtf8OverlyLongSequence(out);
    generateUtf8NonUnicodeSequence(out);
    Files.write(out.toByteArray(), file);
  }

  /**
   * Writes the line {@code "Long: (...)"} wrapping an overlong encoding.
   *
   * @param out stream to append to
   * @throws IOException if writing fails
   */
  private void generateUtf8OverlyLongSequence(OutputStream out)
      throws IOException {
    out.write("Long: (".getBytes(Charsets.UTF_8));
    // Overly-long slash character should not be accepted.
    out.write(new byte[] { (byte)0xe0, (byte)0x80, (byte)0xaf });
    out.write(")\n".getBytes(Charsets.UTF_8));
  }

  /**
   * Writes the line {@code "NonUnicode: (...)"} wrapping a 5-octet sequence.
   *
   * @param out stream to append to
   * @throws IOException if writing fails
   */
  private void generateUtf8NonUnicodeSequence(OutputStream out)
      throws IOException {
    out.write("NonUnicode: (".getBytes(Charsets.UTF_8));
    // This is a valid 5-octet sequence but is not Unicode
    out.write(new byte[]{(byte) 0xf8, (byte) 0xa1, (byte) 0xa1, (byte) 0xa1,
        (byte) 0xa1});
    out.write(")\n".getBytes(Charsets.UTF_8));
  }

  /**
   * Writes the line {@code "Latin1: (...)"} wrapping a single Latin-1 byte.
   *
   * @param out stream to append to
   * @throws IOException if writing fails
   */
  private void generateUtf8Latin1Sequence(OutputStream out) throws IOException {
    out.write("Latin1: (".getBytes(Charsets.UTF_8));
    // This is "e" with an accent in Latin-1
    out.write(new byte[] { (byte)0xe9 } );
    out.write(")\n".getBytes(Charsets.UTF_8));
  }

  /**
   * Writes the line {@code "Invalid: (...)"} wrapping a lone 0x81 byte.
   *
   * @param out stream to append to
   * @throws IOException if writing fails
   */
  private void generateLatin1InvalidSequence(OutputStream out)
      throws IOException {
    out.write("Invalid: (".getBytes(Charsets.UTF_8));
    // A lone 0x81 byte. The tests using this fixture decode the file as
    // UTF-8 (see initInputStream(DecodeErrorPolicy)), where a continuation
    // byte without a lead byte is malformed.
    out.write(new byte[] { (byte)0x81 } );
    out.write(")\n".getBytes(Charsets.UTF_8));
  }

  /**
   * Writes a supplementary character (a surrogate pair once decoded to
   * Java chars) encoded as UTF-8.
   *
   * @param out stream to append to
   * @throws IOException if writing fails
   */
  private void generateUtf8SurrogatePairSequence(OutputStream out) throws IOException {
    // U+1F618 (UTF-8: f0 9f 98 98) FACE THROWING A KISS
    out.write(new byte[]{(byte) 0xF0, (byte) 0x9F, (byte) 0x98, (byte) 0x98});
  }

  /**
   * Writes a big-endian BOM followed by a supplementary character encoded
   * as UTF-16.
   *
   * @param out stream to append to
   * @throws IOException if writing fails
   */
  private void generateUtf16SurrogatePairSequence(OutputStream out) throws IOException {
    // BOM
    out.write(new byte[]{(byte) 0xFE, (byte) 0xFF});
    // U+1F618 (UTF-16: d83d de18) FACE THROWING A KISS
    out.write(new byte[]{(byte) 0xD8, (byte) 0x3D, (byte) 0xDE, (byte) 0x18});
  }

  /**
   * Writes a character whose UTF-8 encoding is three bytes long.
   *
   * @param out stream to append to
   * @throws IOException if writing fails
   */
  private void generateUtf83ByteSequence(OutputStream out) throws IOException {
    // U+0A93 (UTF-8: e0 aa 93) GUJARATI LETTER O
    out.write(new byte[]{(byte) 0xe0, (byte) 0xaa, (byte) 0x93});
  }

  /**
   * Writes a character whose Shift_JIS encoding is two bytes long.
   *
   * @param out stream to append to
   * @throws IOException if writing fails
   */
  private void generateShiftJis2ByteSequence(OutputStream out) throws IOException {
    // U+4E9C (Shift JIS: 88 9f) CJK UNIFIED IDEOGRAPH
    out.write(new byte[]{(byte) 0x88, (byte) 0x9f});
  }

  /**
   * Helper function to read a line from a character stream.
   *
   * <p>Chars are consumed up to and including the next {@code '\n'} or until
   * end of stream. Note that an empty line and end of stream both yield
   * {@code null}, and that a trailing {@code '\n'} is appended to the result
   * even when the line was terminated by end of stream.
   *
   * @param in stream to read from
   * @param maxLength maximum number of chars allowed before the line terminator
   * @return the line including a trailing {@code '\n'}, or {@code null} if no
   *         char was read before the terminator or end of stream
   * @throws IOException if reading or decoding fails
   * @throws RuntimeException if more than {@code maxLength} chars are read
   *         without meeting a line terminator
   */
  private static String readLine(ResettableInputStream in, int maxLength)
      throws IOException {
    StringBuilder s = new StringBuilder();
    int c;
    while ((c = in.readChar()) != -1) {
      // FIXME: support \r\n
      if (c == '\n') {
        break;
      }
      s.append((char) c);
      if (s.length() > maxLength) {
        logger.error("Output: >{}<", s);
        throw new RuntimeException("Too far!");
      }
    }
    if (s.length() == 0) {
      return null;
    }
    return s.append('\n').toString();
  }

  /**
   * Overwrites the file with a single line of text.
   *
   * @param file file to overwrite
   * @param charset charset used to encode the text
   * @return the text written, including its trailing newline
   * @throws IOException if the file cannot be written
   */
  private static String singleLineFileInit(File file, Charset charset)
      throws IOException {
    String output = "This is gonna be great!\n";
    Files.write(output.getBytes(charset), file);
    return output;
  }

  /**
   * Overwrites the file with four lines of text.
   *
   * @param file file to overwrite
   * @param charset charset used to encode the text
   * @return the lines written, each including its trailing newline
   * @throws IOException if the file cannot be written
   */
  private static List<String> multiLineFileInit(File file, Charset charset)
      throws IOException {
    List<String> lines = Lists.newArrayList();
    lines.add("1. On the planet of Mars\n");
    lines.add("2. They have clothes just like ours,\n");
    lines.add("3. And they have the same shoes and same laces,\n");
    lines.add("4. And they have the same charms and same graces...\n");
    StringBuilder sb = new StringBuilder();
    for (String line : lines) {
      sb.append(line);
    }
    Files.write(sb.toString().getBytes(charset), file);
    return lines;
  }

  /**
   * Overwrites the file with {@code numLines} lines of exactly
   * {@code lineLen} chars each (newline included). Every line consists of a
   * ten digit, zero padded, zero-based line number, the separator
   * {@code ": "}, a filler of {@code 'x'} chars and a newline.
   *
   * <p>The final size check assumes that the charset encodes each of these
   * chars as a single byte.
   *
   * @param file file to overwrite
   * @param charset charset used to encode the text
   * @param numLines number of lines to write
   * @param lineLen length of every line, newline included
   * @throws IOException if the file cannot be written
   */
  private static void generateData(File file, Charset charset,
      int numLines, int lineLen) throws IOException {
    // 13 = 10 digits + ": " + newline
    StringBuilder junk = new StringBuilder();
    for (int x = 0; x < lineLen - 13; x++) {
      junk.append('x');
    }
    String payload = junk.toString();

    OutputStream out = new BufferedOutputStream(new FileOutputStream(file));
    try {
      StringBuilder builder = new StringBuilder();
      for (int i = 0; i < numLines; i++) {
        builder.append(String.format("%010d: %s\n", i, payload));
        // flush the accumulated text periodically to bound memory usage
        if (i % 1000 == 0 && i != 0) {
          out.write(builder.toString().getBytes(charset));
          builder.setLength(0);
        }
      }
      out.write(builder.toString().getBytes(charset));
    } finally {
      out.close();
    }

    // widen before multiplying so that large fixtures cannot overflow an int
    Assert.assertEquals((long) lineLen * numLines, file.length());
  }
}