/*
 * Copyright (C) 2009 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package libcore.java.nio.charset;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.util.Arrays;

/**
 * Tests for {@link CharsetEncoder}: custom and default replacement bytes, handling of
 * surrogate pairs by the UTF-32BE encoder, and the encode/flush call sequence.
 */
public class CharsetEncoderTest extends junit.framework.TestCase {
    /**
     * Checks that a replacement installed with {@code replaceWith} is actually emitted
     * for an unmappable character.
     *
     * None of the harmony or jtreg tests actually check that replaceWith does the right thing!
     */
    public void test_replaceWith() throws Exception {
        Charset ascii = Charset.forName("US-ASCII");
        CharsetEncoder e = ascii.newEncoder();
        e.onMalformedInput(CodingErrorAction.REPLACE);
        e.onUnmappableCharacter(CodingErrorAction.REPLACE);
        e.replaceWith("=".getBytes("US-ASCII"));
        String input = "hello\u0666world";
        String output = ascii.decode(e.encode(CharBuffer.wrap(input))).toString();
        assertEquals("hello=world", output);
    }

    /**
     * Asserts that a fresh encoder for {@code charset} reports {@code bytes} as its
     * replacement. The arrays are compared via their string forms so that a failure
     * message shows both contents.
     */
    private void assertReplacementBytesForEncoder(String charset, byte[] bytes) {
        byte[] result = Charset.forName(charset).newEncoder().replacement();
        assertEquals(Arrays.toString(bytes), Arrays.toString(result));
    }

    /**
     * Asserts that the bytes written to {@code bb} so far (indices 0 up to its current
     * position) are exactly {@code expected}. Uses absolute gets, so the buffer's
     * position is left untouched. Comparing string forms means a failure message shows
     * the complete written contents.
     */
    private static void assertWrittenBytes(ByteBuffer bb, byte[] expected) {
        byte[] actual = new byte[bb.position()];
        for (int i = 0; i < actual.length; ++i) {
            actual[i] = bb.get(i);
        }
        assertEquals(Arrays.toString(expected), Arrays.toString(actual));
    }

    // For all the guaranteed built-in charsets, check that we have the right default replacements.

    public void test_defaultReplacementBytesIso_8859_1() throws Exception {
        assertReplacementBytesForEncoder("ISO-8859-1", new byte[] { (byte) '?' });
    }

    public void test_defaultReplacementBytesUs_Ascii() throws Exception {
        assertReplacementBytesForEncoder("US-ASCII", new byte[] { (byte) '?' });
    }

    public void test_defaultReplacementBytesUtf_16() throws Exception {
        assertReplacementBytesForEncoder("UTF-16", new byte[] { (byte) 0xff, (byte) 0xfd });
    }

    public void test_defaultReplacementBytesUtf_16be() throws Exception {
        assertReplacementBytesForEncoder("UTF-16BE", new byte[] { (byte) 0xff, (byte) 0xfd });
    }

    public void test_defaultReplacementBytesUtf_16le() throws Exception {
        assertReplacementBytesForEncoder("UTF-16LE", new byte[] { (byte) 0xfd, (byte) 0xff });
    }

    public void test_defaultReplacementBytesUtf_8() throws Exception {
        assertReplacementBytesForEncoder("UTF-8", new byte[] { (byte) '?' });
    }

    /**
     * A surrogate pair supplied in a single encode call is encoded as U+20b9f.
     */
    public void testSurrogatePairAllAtOnce() throws Exception {
        Charset cs = Charset.forName("UTF-32BE");
        CharsetEncoder e = cs.newEncoder();
        ByteBuffer bb = ByteBuffer.allocate(128);
        CoderResult cr = e.encode(CharBuffer.wrap(new char[] { '\ud842', '\udf9f' }), bb, false);
        assertEquals(CoderResult.UNDERFLOW, cr);
        assertWrittenBytes(bb, new byte[] { (byte) 0x00, (byte) 0x02, (byte) 0x0b, (byte) 0x9f });
    }

    /**
     * A low surrogate with no preceding high surrogate is reported as malformed input
     * of length 1.
     */
    public void testMalformedSurrogatePair() throws Exception {
        Charset cs = Charset.forName("UTF-32BE");
        CharsetEncoder e = cs.newEncoder();
        ByteBuffer bb = ByteBuffer.allocate(128);
        CoderResult cr = e.encode(CharBuffer.wrap(new char[] { '\udf9f' }), bb, false);
        assertTrue(cr.toString(), cr.isMalformed());
        assertEquals(1, cr.length());
    }

    public void testCharsetEncoderSurrogatesBrokenByDesign_IGNORE_RI() throws Exception {
        testCharsetEncoderSurrogatesBrokenByDesign_RI(CodingErrorAction.IGNORE);
    }

    public void testCharsetEncoderSurrogatesBrokenByDesign_REPORT_RI() throws Exception {
        testCharsetEncoderSurrogatesBrokenByDesign_RI(CodingErrorAction.REPORT);
    }

    public void testCharsetEncoderSurrogatesBrokenByDesign_REPLACE_RI() throws Exception {
        testCharsetEncoderSurrogatesBrokenByDesign_RI(CodingErrorAction.REPLACE);
    }

    /**
     * Pins down the RI's behavior when the two halves of a surrogate pair arrive in
     * separate encode calls, for the given error action.
     *
     * On the RI this is an error because the CharsetEncoder doesn't remember it's
     * half-way through a surrogate pair across the two calls. IGNORE just ignores both
     * characters, REPORT complains that the second is invalid (because it doesn't
     * remember seeing the first), and REPLACE inserts a replacement character U+fffd
     * when it sees the second character (because it too doesn't remember seeing the
     * first).
     *
     * @param cea the action installed for both malformed input and unmappable characters
     */
    private void testCharsetEncoderSurrogatesBrokenByDesign_RI(CodingErrorAction cea)
            throws Exception {
        Charset cs = Charset.forName("UTF-32BE");
        CharsetEncoder e = cs.newEncoder();
        e.onMalformedInput(cea);
        e.onUnmappableCharacter(cea);
        ByteBuffer bb = ByteBuffer.allocate(128);

        // The lone high surrogate produces no output and no error.
        CoderResult cr = e.encode(CharBuffer.wrap(new char[] { '\ud842' }), bb, false);
        assertEquals(CoderResult.UNDERFLOW, cr);
        assertEquals(0, bb.position());

        cr = e.encode(CharBuffer.wrap(new char[] { '\udf9f' }), bb, false);
        if (cea == CodingErrorAction.REPORT) {
            assertTrue(cr.toString(), cr.isMalformed());
            assertEquals(1, cr.length());
            return;
        }
        assertEquals(CoderResult.UNDERFLOW, cr);

        // REPLACE writes U+fffd as UTF-32BE; IGNORE writes nothing at all.
        byte[] expectedBytes = (cea == CodingErrorAction.REPLACE)
                ? new byte[] { (byte) 0x00, (byte) 0x00, (byte) 0xff, (byte) 0xfd }
                : new byte[0];
        int expectedPosition = expectedBytes.length;
        assertEquals(expectedPosition, bb.position());
        assertWrittenBytes(bb, expectedBytes);

        // Ending the input and flushing must not write anything further.
        cr = e.encode(CharBuffer.wrap(new char[] { }), bb, true);
        assertEquals(CoderResult.UNDERFLOW, cr);
        assertEquals(expectedPosition, bb.position());
        cr = e.flush(bb);
        assertEquals(CoderResult.UNDERFLOW, cr);
        assertEquals(expectedPosition, bb.position());
    }

    public void testCharsetEncoderSurrogatesBrokenByDesign_IGNORE() throws Exception {
        testCharsetEncoderSurrogatesBrokenByDesign(CodingErrorAction.IGNORE);
    }

    public void testCharsetEncoderSurrogatesBrokenByDesign_REPORT() throws Exception {
        testCharsetEncoderSurrogatesBrokenByDesign(CodingErrorAction.REPORT);
    }

    public void testCharsetEncoderSurrogatesBrokenByDesign_REPLACE() throws Exception {
        testCharsetEncoderSurrogatesBrokenByDesign(CodingErrorAction.REPLACE);
    }

    /**
     * Writing the two halves of the surrogate pair in separate writes works just fine,
     * whatever the error action. This is true of Android and ICU, but not of the RI.
     *
     * @param cea the action installed for both malformed input and unmappable characters
     */
    private void testCharsetEncoderSurrogatesBrokenByDesign(CodingErrorAction cea)
            throws Exception {
        Charset cs = Charset.forName("UTF-32BE");
        CharsetEncoder e = cs.newEncoder();
        e.onMalformedInput(cea);
        e.onUnmappableCharacter(cea);
        ByteBuffer bb = ByteBuffer.allocate(128);

        // Nothing can be written until the low surrogate has been seen.
        CoderResult cr = e.encode(CharBuffer.wrap(new char[] { '\ud842' }), bb, false);
        assertEquals(CoderResult.UNDERFLOW, cr);
        assertEquals(0, bb.position());

        // The second half completes the pair, giving U+20b9f as UTF-32BE.
        cr = e.encode(CharBuffer.wrap(new char[] { '\udf9f' }), bb, false);
        assertEquals(CoderResult.UNDERFLOW, cr);
        int expectedPosition = 4;
        assertEquals(expectedPosition, bb.position());
        assertWrittenBytes(bb, new byte[] { (byte) 0x00, (byte) 0x02, (byte) 0x0b, (byte) 0x9f });

        // Ending the input and flushing must not write anything further.
        cr = e.encode(CharBuffer.wrap(new char[] { }), bb, true);
        assertEquals(CoderResult.UNDERFLOW, cr);
        assertEquals(expectedPosition, bb.position());
        cr = e.flush(bb);
        assertEquals(CoderResult.UNDERFLOW, cr);
        assertEquals(expectedPosition, bb.position());
    }

    /**
     * Flushing before encode has been called with endOfInput true must be rejected,
     * and the rejected flush must not disturb the encoding that follows.
     */
    public void testFlushWithoutEndOfInput() throws Exception {
        Charset cs = Charset.forName("UTF-32BE");
        CharsetEncoder e = cs.newEncoder();
        ByteBuffer bb = ByteBuffer.allocate(128);
        CoderResult cr = e.encode(CharBuffer.wrap(new char[] { 'x' }), bb, false);
        assertEquals(CoderResult.UNDERFLOW, cr);
        assertEquals(4, bb.position());

        try {
            cr = e.flush(bb);
            // Without this the test would pass even if flush never threw.
            fail("flush should throw before encode has been called with endOfInput true");
        } catch (IllegalStateException expected) {
            // you must call encode with endOfInput true before you can flush.
        }

        // We had a bug where we wouldn't reset inEnd before calling encode in implFlush.
        // That would result in flush outputting garbage.
        cr = e.encode(CharBuffer.wrap(new char[] { 'x' }), bb, true);
        assertEquals(CoderResult.UNDERFLOW, cr);
        assertEquals(8, bb.position());
        cr = e.flush(bb);
        assertEquals(CoderResult.UNDERFLOW, cr);
        assertEquals(8, bb.position());
    }
}