/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.jena.atlas.io;
import java.io.ByteArrayOutputStream ;
import java.io.IOException ;
import java.io.OutputStreamWriter ;
import java.io.Writer ;
import java.nio.Buffer ;
import java.nio.ByteBuffer ;
import java.nio.CharBuffer ;
import java.nio.charset.Charset ;
import java.nio.charset.CharsetDecoder ;
import java.nio.charset.CharsetEncoder ;
import org.apache.jena.atlas.io.BlockUTF8 ;
import org.apache.jena.atlas.junit.BaseTest ;
import org.apache.jena.atlas.lib.Chars ;
import org.junit.Test ;
public class TestBlockUTF8 extends BaseTest
{
// Need array and non-array versions.
static Charset utf8 = Chars.charsetUTF8 ;
static CharsetDecoder dec = utf8.newDecoder() ;
static CharsetEncoder enc = utf8.newEncoder() ;
// UTF-8 encoding.
// character '¢' = code point U+00A2 -> C2 A2
// character '€' = code point U+20AC -> E2 82 AC
static private final String asciiBase = "abc" ;
static private final String latinBase = "Àéíÿ" ;
static private final String latinExtraBase = "ỹfifl" ; // fi-ligature, fl-ligature
static private final String greekBase = "αβγ" ;
static private final String hewbrewBase = "אבג" ;
static private final String arabicBase = "ءآأ";
static private final String symbolsBase = "☺☻♪♫" ;
static private final String chineseBase = "孫子兵法" ; // The Art of War
static private final String japaneseBase = "日本" ; // Japanese
static private final String binaryStr1 = "abc\uD800xyz" ; // A single surrogate, without it's pair.
static private final String binaryStr2 = "\uD800" ; // A single surrogate, without it's pair.
static private final String binaryStr3 = "\u0000" ; // A zero character
static private final byte[] binaryBytes1 = {} ;
static private final byte[] binaryBytes2 = { (byte)0x00 } ; // Java encoding of 0 codepoint is 0
static private final byte[] binaryBytes3 = { (byte)0xC0, (byte)0x80 } ; // Modifed unicode zero codepoint.
@Test public void convert_in_00() { testIn("") ; }
@Test public void convert_in_01() { testIn(asciiBase) ; }
@Test public void convert_in_02() { testIn(latinBase) ; }
@Test public void convert_in_03() { testIn(latinExtraBase) ; }
@Test public void convert_in_04() { testIn(greekBase) ; }
@Test public void convert_in_05() { testIn(hewbrewBase) ; }
@Test public void convert_in_06() { testIn(arabicBase) ; }
@Test public void convert_in_07() { testIn(symbolsBase) ; }
@Test public void convert_in_08() { testIn(chineseBase) ; }
@Test public void convert_in_09() { testIn(japaneseBase) ; }
@Test public void convert_in_10() { testInOutBinary(binaryStr1) ; }
@Test public void convert_in_11() { testInOutBinary(binaryStr2) ; }
@Test public void convert_in_12() { testInOutBinary(binaryStr3) ; }
@Test public void convert_out_00() { testOut("") ; }
@Test public void convert_out_01() { testOut(asciiBase) ; }
@Test public void convert_out_02() { testOut(latinBase) ; }
@Test public void convert_out_03() { testOut(latinExtraBase) ; }
@Test public void convert_out_04() { testOut(greekBase) ; }
@Test public void convert_out_05() { testOut(hewbrewBase) ; }
@Test public void convert_out_06() { testOut(arabicBase) ; }
@Test public void convert_out_07() { testOut(symbolsBase) ; }
@Test public void convert_out_08() { testOut(chineseBase) ; }
@Test public void convert_out_09() { testOut(japaneseBase) ; }
@Test public void convert_out_10() { testOut(binaryStr1) ; }
@Test public void convert_out_11() { testOut(binaryStr2) ; }
@Test public void convert_out_12() { testOut(binaryStr3) ; }
// While it is key is chars->bytes-chars, we also test bytes->bytes
@Test public void binary_01() { testBinary(binaryBytes1) ; }
@Test public void binary_02() { testBinary(binaryBytes2) ; }
@Test public void binary_03() { testBinary(binaryBytes3, binaryBytes2) ; }
@Test public void binary_10() { testBinary(binaryBytes2, CharBuffer.wrap(binaryStr3)) ; }
@Test public void binary_11() { testBinary(binaryBytes3, CharBuffer.wrap(binaryStr3)) ; }
static void testIn(String x)
{
testIn(x, allocByteBufferArray, allocCharBufferArray) ;
testIn(x, allocByteBufferDirect, allocCharBufferDirect) ;
}
static void testIn(String x, Alloc<ByteBuffer> allocBB, Alloc<CharBuffer> allocCB)
{
// Test as binary.
testInOutBinary(x) ;
// Now test, comparing to std Java.
// Correct answer, in bytes
ByteBuffer bytes = ByteBuffer.wrap(stringAsBytes(x)) ;
// To bytes.stringAsBytes
int N = x.length() ;
CharBuffer cb = CharBuffer.wrap(x.toCharArray()) ;
ByteBuffer bb = allocBB.allocate(4*N) ;
BlockUTF8.fromChars(cb, bb) ;
bb.flip() ;
assertTrue("Bytes", sameBytes(bytes, bb)) ;
// From bytes.
CharBuffer cb2 = allocCB.allocate(N) ;
BlockUTF8.toChars(bb, cb2) ;
cb2.flip() ;
String str = cb2.toString() ;
assertEquals(x, str) ;
}
// Tesing, but not against what Java would do (it replaces bad chars, we want binary).
static void testInOutBinary(String x)
{
int N = x.length() ;
CharBuffer cb = CharBuffer.wrap(x.toCharArray()) ;
ByteBuffer bb = ByteBuffer.allocate(4*N) ;
BlockUTF8.fromChars(cb, bb) ;
bb.flip() ;
CharBuffer cb2 = CharBuffer.allocate(N) ;
BlockUTF8.toChars(bb, cb2) ;
// compare cb and cb2.
String str = new String(cb2.array(), 0, cb2.position()) ;
assertEquals(x, str) ;
// And re-code as bytes.
CharBuffer cb3 = CharBuffer.wrap(x.toCharArray()) ;
ByteBuffer bb3 = ByteBuffer.allocate(4*N) ;
BlockUTF8.fromChars(cb3, bb3) ;
bb3.flip() ;
assertArrayEquals(bb.array(), bb3.array()) ;
}
static void testOut(String x)
{
testOut(x, allocByteBufferArray, allocCharBufferArray) ;
testOut(x, allocByteBufferDirect, allocCharBufferDirect) ;
}
static interface Alloc<T extends Buffer> { T allocate(int len) ; }
static Alloc<ByteBuffer> allocByteBufferArray = new Alloc<ByteBuffer>() {
@Override public ByteBuffer allocate(int len) { return ByteBuffer.allocate(len) ; }
} ;
static Alloc<ByteBuffer> allocByteBufferDirect = new Alloc<ByteBuffer>() {
@Override public ByteBuffer allocate(int len) { return ByteBuffer.allocateDirect(len) ; }
} ;
static Alloc<CharBuffer> allocCharBufferArray = new Alloc<CharBuffer>() {
@Override public CharBuffer allocate(int len) { return CharBuffer.allocate(len) ; }
} ;
static Alloc<CharBuffer> allocCharBufferDirect = new Alloc<CharBuffer>() {
@Override public CharBuffer allocate(int len) { return ByteBuffer.allocateDirect(2*len).asCharBuffer() ; }
} ;
static void testOut(String x, Alloc<ByteBuffer> allocBB, Alloc<CharBuffer> allocCB)
{
testBinary(stringAsBytes(x)) ;
int N = x.length() ;
// First - get bytes the Java way.
ByteBuffer bytes = ByteBuffer.wrap(stringAsBytes(x)) ;
CharBuffer cb = allocCB.allocate(N) ;
BlockUTF8.toChars(bytes, cb) ;
cb.flip() ;
bytes.flip() ;
String str = cb.toString() ;
ByteBuffer bytes2 = allocBB.allocate(bytes.capacity()) ;
BlockUTF8.fromChars(cb, bytes2) ;
bytes2.flip() ;
assertTrue("Chars", sameBytes(bytes, bytes2)) ;
}
static void testBinary(byte[] binary, CharBuffer chars)
{
int N = binary.length ;
ByteBuffer bytes = ByteBuffer.wrap(binary) ;
CharBuffer cb = CharBuffer.allocate(N) ;
BlockUTF8.toChars(bytes, cb) ;
cb.flip() ;
assertTrue("Binary", sameChars(chars, cb));
}
static void testBinary(byte[] binary)
{
testBinary(binary, binary) ;
}
static void testBinary(byte[] binary, byte[] expected)
{
int N = binary.length ;
ByteBuffer bytes = ByteBuffer.wrap(binary) ;
CharBuffer cb = CharBuffer.allocate(N) ;
BlockUTF8.toChars(bytes, cb) ;
cb.flip() ;
bytes.position(0) ;
ByteBuffer bytes2 = ByteBuffer.allocate(2*N) ; // Null bytes get expanded.
BlockUTF8.fromChars(cb, bytes2) ;
bytes2.flip() ;
sameBytes(bytes, bytes2) ;
assertTrue("Binary", sameBytes(ByteBuffer.wrap(expected), bytes2)) ;
}
// Does not move position.
static boolean sameBytes(ByteBuffer bb1, ByteBuffer bb2)
{
if ( bb1.remaining() != bb2.remaining() ) return false ;
for ( int i = 0 ; i < bb1.remaining() ; i++ )
if ( bb1.get(i+bb1.position()) != bb2.get(i+bb2.position()) ) return false ;
return true ;
}
// Does not move position.
static boolean sameChars(CharBuffer cb1, CharBuffer cb2)
{
if ( cb1.remaining() != cb2.remaining() ) return false ;
for ( int i = 0 ; i < cb1.remaining() ; i++ )
if ( cb1.get(i+cb1.position()) != cb2.get(i+cb2.position()) ) return false ;
return true ;
}
static byte[] stringAsBytes(String x)
{
try {
ByteArrayOutputStream bout = new ByteArrayOutputStream() ;
try(Writer out = new OutputStreamWriter(bout, utf8)) {
out.write(x) ;
}
byte[] bytes = bout.toByteArray() ;
return bytes ;
} catch (IOException ex) { throw new RuntimeException(ex) ; }
}
}