package uk.ac.ox.zoo.seeg.abraid.mp.common.util;
import org.apache.commons.io.FileUtils;
import org.junit.Test;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import static org.assertj.core.api.Assertions.assertThat;
/**
* Tests the CharacterSetUtils class.
* Copyright (c) 2014 University of Oxford
*/
public class CharacterSetUtilsTest {
private static final String TEST_FOLDER = "Common/test/uk/ac/ox/zoo/seeg/abraid/mp/common/util";
private static final String TEST_FILE1_UTF_8 = "file1_utf-8.txt";
private static final String TEST_FILE1_ISO_8859_1 = "file1_iso-8859-1.txt";
private static final String TEST_FILE2_UTF_8 = "file2_utf-8.txt";
private static final String TEST_FILE2_WINDOWS_1252 = "file2_windows-1252.txt";
private static final Charset WINDOWS_1252_CHARSET = Charset.forName("windows-1252");
@Test
public void detectCharacterSetReturnsNullForNullInput() {
byte[] input = null;
testDetectCharacterSet(input, null);
}
@Test
public void detectCharacterSetReturnsNullForEmptyInput() {
byte[] input = new byte[] {};
testDetectCharacterSet(input, null);
}
@Test
public void detectCharacterSetReturnsNullForASCIIInput() {
// If there are no special characters then the character set is ambiguous, so null is a reasonable response
String text = "The quick brown fox jumps over the lazy dog.";
testDetectCharacterSet(text.getBytes(), null);
}
@Test
public void detectCharacterSetReturnsUTF8ForTestUTF8File1() throws IOException {
testDetectCharacterSet(TEST_FILE1_UTF_8, StandardCharsets.UTF_8);
}
@Test
public void detectCharacterSetReturnsUTF8ForTestUTF8File2() throws IOException {
testDetectCharacterSet(TEST_FILE2_UTF_8, StandardCharsets.UTF_8);
}
@Test
public void detectCharacterSetReturnsWindows1252ForTestISO88591File() throws IOException {
// Windows-1252 is a superset of ISO-8859-1
testDetectCharacterSet(TEST_FILE1_ISO_8859_1, WINDOWS_1252_CHARSET);
}
@Test
public void detectCharacterSetReturnsWindows1252ForTestWindows1252File() throws IOException {
testDetectCharacterSet(TEST_FILE2_WINDOWS_1252, WINDOWS_1252_CHARSET);
}
@Test
public void convertToCharacterSetReturnsNullForNullInput() {
testConvertToCharacterSet((byte[]) null, null, StandardCharsets.ISO_8859_1, StandardCharsets.UTF_8);
}
@Test
public void convertToCharacterSetReturnsEmptyOutputForEmptyInput() {
byte[] empty = new byte[] {};
testConvertToCharacterSet(empty, empty, StandardCharsets.ISO_8859_1, StandardCharsets.UTF_8);
}
@Test
public void convertToCharacterSetMakesNoChangeIfSourceAndDestinationCharacterSetsAreTheSame() throws IOException {
testConvertToCharacterSet(TEST_FILE1_UTF_8, TEST_FILE1_UTF_8, StandardCharsets.UTF_8, StandardCharsets.UTF_8);
}
@Test
public void convertToCharacterSetConvertsISO88591ToUTF8Correctly() throws IOException {
testConvertToCharacterSet(TEST_FILE1_ISO_8859_1, TEST_FILE1_UTF_8, StandardCharsets.ISO_8859_1, StandardCharsets.UTF_8);
}
@Test
public void convertToCharacterSetConvertsUTF8ToISO88591Correctly() throws IOException {
testConvertToCharacterSet(TEST_FILE1_UTF_8, TEST_FILE1_ISO_8859_1, StandardCharsets.UTF_8, StandardCharsets.ISO_8859_1);
}
@Test
public void convertToCharacterSetConvertsWindows1252ToUTF8Correctly() throws IOException {
testConvertToCharacterSet(TEST_FILE2_WINDOWS_1252, TEST_FILE2_UTF_8, WINDOWS_1252_CHARSET, StandardCharsets.UTF_8);
}
@Test
public void convertToCharacterSetConvertsUTF8ToWindows1252Correctly() throws IOException {
testConvertToCharacterSet(TEST_FILE2_UTF_8, TEST_FILE2_WINDOWS_1252, StandardCharsets.UTF_8, WINDOWS_1252_CHARSET);
}
private void testDetectCharacterSet(String inputFilename, Charset expectedCharset) throws IOException {
byte[] input = FileUtils.readFileToByteArray(new File(TEST_FOLDER, inputFilename));
testDetectCharacterSet(input, expectedCharset);
}
private void testDetectCharacterSet(byte[] input, Charset expectedCharset) {
Charset charset = CharacterSetUtils.detectCharacterSet(input);
assertThat(charset).isEqualTo(expectedCharset);
}
private void testConvertToCharacterSet(String inputFilename, String expectedOutputFilename, Charset fromCharset,
Charset toCharset) throws IOException {
byte[] input = FileUtils.readFileToByteArray(new File(TEST_FOLDER, inputFilename));
byte[] expectedOutput = FileUtils.readFileToByteArray(new File(TEST_FOLDER, expectedOutputFilename));
testConvertToCharacterSet(input, expectedOutput, fromCharset, toCharset);
}
private void testConvertToCharacterSet(byte[] input, byte[] expectedOutput, Charset fromCharset, Charset toCharset) {
byte[] output = CharacterSetUtils.convertToCharacterSet(input, fromCharset, toCharset);
assertThat(output).isEqualTo(expectedOutput);
}
}