// BlogBridge -- RSS feed reader, manager, and web based service // Copyright (C) 2002-2006 by R. Pito Salas // // This program is free software; you can redistribute it and/or modify it under // the terms of the GNU General Public License as published by the Free Software Foundation; // either version 2 of the License, or (at your option) any later version. // // This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; // without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. // See the GNU General Public License for more details. // // You should have received a copy of the GNU General Public License along with this program; // if not, write to the Free Software Foundation, Inc., 59 Temple Place, // Suite 330, Boston, MA 02111-1307 USA // // Contact: R. Pito Salas // mailto:pitosalas@users.sourceforge.net // More information: about BlogBridge // http://www.blogbridge.com // http://sourceforge.net/projects/blogbridge // // $Id: TestUTF8Reader.java,v 1.6 2006/01/08 05:28:19 kyank Exp $ // package com.salas.bb.utils.xml; import junit.framework.TestCase; import org.jdom.Document; import org.jdom.input.SAXBuilder; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.io.Reader; import java.util.logging.ConsoleHandler; import java.util.logging.Level; import java.util.logging.Logger; /** * @see UTF8Reader */ public class TestUTF8Reader extends TestCase { private static final Logger LOG = Logger.getLogger(TestUTF8Reader.class.getName()); /** * Tests reading plain ASCII characters. */ public void testPlainASCII() { char[] src = new char[128]; int[] dst = new int[128]; for (int i = 0; i < src.length; i++) { src[i] = (char)i; dst[i] = i; } check(src, dst); } /** * Tests 2-byte multi-byte sequences. */ public void test2Byte() { // First possible 2-byte sequence 0x80 check(new char[] { 't', (char)0xc2, (char)0x80, 'e' }, new int[] { 't', 0x80, 'e' }); check(new char[] { (char)0xc2, (char)0x80 }, new int[] { 0x80 }); // Something in the middle 0xa9 check(new char[] { 't', (char)0xc2, (char)0xa9, 'e' }, new int[] { 't', 0xa9, 'e' }); check(new char[] { (char)0xc2, (char)0xa9 }, new int[] { 0xa9 }); // Last possible 2-byte sequence 0x7ff check(new char[] { 't', (char)0xdf, (char)0xbf, 'e' }, new int[] { 't', 0x7ff, 'e' }); check(new char[] { (char)0xdf, (char)0xbf }, new int[] { 0x7ff }); } /** * Tests 3-byte multi-byte sequences. */ public void test3Byte() { // First possible 3-byte sequence 0x800 check(new char[] { 't', (char)0xe0, (char)0xa0, (char)0x80, 'e' }, new int[] { 't', 0x800, 'e' }); check(new char[] { (char)0xe0, (char)0xa0, (char)0x80 }, new int[] { 0x800 }); // Something in the middle 0xd7ff check(new char[] { 't', (char)0xed, (char)0x9f, (char)0xbf, 'e' }, new int[] { 't', 0xd7ff, 'e' }); check(new char[] { (char)0xed, (char)0x9f, (char)0xbf }, new int[] { 0xd7ff }); // Last possible 3-byte sequence 0xffff check(new char[] { 't', (char)0xef, (char)0xbf, (char)0xbf, 'e' }, new int[] { 't', 0xffff, 'e' }); check(new char[] { (char)0xef, (char)0xbf, (char)0xbf }, new int[] { 0xffff }); } /** * Tests 4-byte multi-byte sequences. */ public void test4Byte() { // First possible 4-byte sequence 0x100000 check(new char[] { 't', (char)0xf0, (char)0x90, (char)0x80, (char)0x80, 'e' }, new int[] { 't', 0xD800, 0xDC00, 'e' }); check(new char[] { (char)0xf0, (char)0x90, (char)0x80, (char)0x80 }, new int[] { 0xD800, 0xDC00 }); // Something in the middle 0x10ffff check(new char[] { 't', (char)0xf4, (char)0x8f, (char)0xbf, (char)0xbf, 'e' }, new int[] { 't', 0xDBFF, 0xDFFF, 'e' }); check(new char[] { (char)0xf4, (char)0x8f, (char)0xbf, (char)0xbf }, new int[] { 0xDBFF, 0xDFFF }); // Last possible 4-byte sequence 0x1fffff // check(new char[] { 't', (char)0xf7, (char)0xbf, (char)0xbf, (char)0xbf, 'e' }, // new int[] { 't', 0x1fffff, 'e' }); // check(new char[] { (char)0xf7, (char)0xbf, (char)0xbf, (char)0xbf }, // new int[] { 0x1fffff }); } // --------------------------------------------------------------------------------------------- // Malformed sequences // --------------------------------------------------------------------------------------------- /** * Tests parsing of unexpected continuation byte. */ public void testFirstContinuationByte() { check(new char[] { 't', (char)0x80, 'e' }, new int[] { 't', 0x80, 'e' }); check(new char[] { (char)0x80 }, new int[] { 0x80 }); } /** * Tests parsing of unexpected continuation byte. */ public void testLastContinuationByte() { check(new char[] { 't', (char)0xbf, 'e' }, new int[] { 't', 0xbf, 'e' }); check(new char[] { (char)0xbf }, new int[] { 0xbf }); } /** * Tests parsing of unexpected continuation bytes in group. */ public void testContinuationBytes() { // 2 continuation bytes check(new char[] { 't', (char)0x80, (char)0xbf, 'e' }, new int[] { 't', 0x80, 0xbf, 'e' }); check(new char[] { (char)0x80, (char)0xbf }, new int[] { 0x80, 0xbf }); // 3 continuation bytes check(new char[] { 't', (char)0x80, (char)0xbf, (char)0x80, 'e' }, new int[] { 't', 0x80, 0xbf, 0x80, 'e' }); check(new char[] { (char)0x80, (char)0xbf, (char)0x80 }, new int[] { 0x80, 0xbf, 0x80 }); // 4 continuation bytes check(new char[] { 't', (char)0x80, (char)0xbf, (char)0x80, (char)0xbf, 'e' }, new int[] { 't', 0x80, 0xbf, 0x80, 0xbf, 'e' }); check(new char[] { (char)0x80, (char)0xbf, (char)0x80, (char)0xbf }, new int[] { 0x80, 0xbf, 0x80, 0xbf }); // 5 continuation bytes check(new char[] { 't', (char)0x80, (char)0xbf, (char)0x80, (char)0xbf, (char)0x80, 'e' }, new int[] { 't', 0x80, 0xbf, 0x80, 0xbf, 0x80, 'e' }); check(new char[] { (char)0x80, (char)0xbf, (char)0x80, (char)0xbf, (char)0x80 }, new int[] { 0x80, 0xbf, 0x80, 0xbf, 0x80 }); } /** * Tests parsing of unexpected continuation bytes. */ public void testSequenceOfAll64PossibleContinuationBytes() { char[] src = new char[64]; int[] dst = new int[64]; for (int i = 0; i < src.length; i++) { int ch = 0x80 + i; src[i] = (char)ch; dst[i] = ch; } check(src, dst); } /** * Tests parsing of unexpected first bytes of 2-byte multi-byte sequence. */ public void testFirstBytesOf2ByteSequnces() { char[] src = new char[64]; int[] dst = new int[64]; for (int i = 0; i < 32; i++) { int ch = 0xc0 + i; src[i*2] = (char)ch; src[i*2 + 1] = ' '; dst[i*2] = ch; dst[i*2 + 1] = ' '; } check(src, dst); } /** * Tests parsing of unexpected first bytes of 3-byte multi-byte sequence. */ public void testFirstBytesOf3ByteSequnces() { char[] src = new char[32]; int[] dst = new int[32]; for (int i = 0; i < 16; i++) { int ch = 0xe0 + i; src[i*2] = (char)ch; src[i*2 + 1] = ' '; dst[i*2] = ch; dst[i*2 + 1] = ' '; } check(src, dst); } /** * Tests parsing of unexpected first bytes of 4-byte multi-byte sequence. */ public void testFirstBytesOf4ByteSequnces() { char[] src = new char[16]; int[] dst = new int[16]; for (int i = 0; i < 8; i++) { int ch = 0xf0 + i; src[i*2] = (char)ch; src[i*2 + 1] = ' '; dst[i*2] = ch; dst[i*2 + 1] = ' '; } check(src, dst); } /** * Tests sequences without last continuation byte. */ public void testSeqencesWithLastContinuationByteMissing() { check(new char[] { (char)0xc0 }, new int[] { 0xc0 }); check(new char[] { (char)0xe0, (char)0x80 }, new int[] { 0xe0, 0x80 }); check(new char[] { (char)0xf0, (char)0x80, (char)0x80 }, new int[] { 0xf0, 0x80, 0x80 }); // All sequences joined check(new char[] { (char)0xc0, (char)0xe0, (char)0x80, (char)0xf0, (char)0x80, (char)0x80 }, new int[] { 0xc0, 0xe0, 0x80, 0xf0, 0x80, 0x80 }); } /** * Tests different unexpected combinations. */ public void testGoodBadCombinations() { check(new char[] { (char)0x80, (char)0xc2, (char)0xa9, (char)0x80 }, new int[] { 0x80, 0xa9, 0x80 }); check(new char[] { 'a', (char)0xc2, (char)0xa9, (char)0xbb, 'b'}, new int[] { 'a', 0xa9, 0xbb, 'b' }); check(new char[] { 'a', (char)0xc2, (char)0xa9, (char)0xe1, 'b'}, new int[] { 'a', 0xa9, 0xe1, 'b' }); } /** * Tests sequences broken into several packet blocks. */ public void testBrockenIntoBlocks() { check(new char[][] { new char[] { (char)0xc2 }, new char[] { (char)0xa9 } }, new int[] { 0xa9 }); check(new char[][] { new char[] { (char)0xc2 }, new char[] { (char)0x09 } }, new int[] { 0xc2, 0x09 }); check(new char[][] { new char[] { (char)0xe2, 0x80 }, new char[] { (char)0x09 } }, new int[] { 0xe2, 0x80, 0x09 }); } /** * Small real-life practice test. * * @throws Exception in case of error. */ public void testPracticeTest() throws Exception { String s = "<?xml version=\"1.0\" encoding=\"utf-8\"?><root>a" + (char)0xc2 + (char)0xa9 + (char)0xbb + "b</root>"; SAXBuilder builder = new SAXBuilder(); Reader reader = XmlReaderFactory.create(streamForString(s.toCharArray())); Document doc = builder.build(reader); String text = doc.getRootElement().getText(); assertEquals('a', text.charAt(0)); assertEquals('\u00a9', text.charAt(1)); assertEquals('\u00bb', text.charAt(2)); assertEquals('b', text.charAt(3)); } /** * Check how source is read and if it matches destination. * * @param src source. * @param dst destination. */ private void check(char[] src, int[] dst) { Reader reader = createReader(src); check0(reader, dst); } /** * Tests how several source blocks are read into single stream. * * @param srcs source packet blocks. * @param dst destination. */ private void check(char[][] srcs, int[] dst) { Reader reader = createReader(srcs); check0(reader, dst); } /** * Checks if the reader will return exactly destination sequence. * * @param reader reader. * @param dst destination. */ private void check0(Reader reader, int[] dst) { try { int offset = 0; int ch; while ((ch = reader.read()) != -1) { if (LOG.isLoggable(Level.FINE)) { LOG.fine("Pos: " + offset + " Expected: " + Integer.toHexString(dst[offset]) + " Found: " + Integer.toHexString(ch)); } assertEquals("Pos: " + offset, dst[offset], ch); offset++; } assertEquals("Not all bytes read.", dst.length, offset); } catch (IOException e) { e.printStackTrace(); fail(); } } /** * Creates our reader for "reading" of sequence of chars. * * @param chars source sequence. * * @return reader. */ private UTF8Reader createReader(char[] chars) { return new UTF8Reader(streamForString(chars)); } /** * Creates our reader for "reading" of sequence of charse broken into packet blocks. * * @param charsList list of packets. * * @return reader. */ private UTF8Reader createReader(char[][] charsList) { InputStream[] streams = new InputStream[charsList.length]; for (int i = 0; i < charsList.length; i++) { char[] chars = charsList[i]; streams[i] = streamForString(chars); } return new UTF8Reader(new CombinedInputStream(streams)); } /** * Simple configurer of FINE logging. */ private void setFineLogging() { LOG.setLevel(Level.FINE); ConsoleHandler handler = new ConsoleHandler(); handler.setLevel(Level.FINE); LOG.addHandler(handler); } /** * Sets how combined stream does its job. */ public void testCombinedInputStream() throws IOException { InputStream[] streams = new InputStream[] { streamForString("abcdefg".toCharArray()), streamForString("gfedcba".toCharArray()), streamForString("1".toCharArray()), streamForString("ab".toCharArray()) }; InputStream is = new CombinedInputStream(streams); byte[] buf = new byte[10]; try { assertEquals(7, is.read(buf)); assertEquals(7, is.read(buf)); assertEquals(1, is.read(buf)); assertEquals(2, is.read(buf)); assertEquals(-1, is.read(buf)); } catch (IOException e) { e.printStackTrace(); fail(); } finally { is.close(); } } /** * Simple combined stream for combining several packet blocks into single stream, * but through several reads. */ private static class CombinedInputStream extends InputStream { private InputStream[] streams; private int current; public CombinedInputStream(InputStream[] streams) { this.streams = streams; current = 0; } public int read() throws IOException { int ch = -1; while (ch == -1 && current < streams.length) { InputStream currentStream = streams[current]; ch = currentStream.read(); if (ch < 0) current++; } return ch; } public int read(byte b[]) throws IOException { int read = -1; // No checks here because it's only for testing purposes if (current < streams.length) { read = streams[current].read(b); if (read <= 0) { current++; if (current < streams.length) read = streams[current].read(b); } } return read; } public int read(byte b[], int off, int len) throws IOException { int read = -1; // No checks here because it's only for testing purposes if (current < streams.length) { read = streams[current].read(b, off, len); if (read <= 0) { current++; if (current < streams.length) read = streams[current].read(b, off, len); } } return read; } } /** Creates stream for string. */ private InputStream streamForString(char[] text) { byte[] bytes = new byte[text.length]; for (int i = 0; i < text.length; i++) bytes[i] = (byte)text[i]; return new ByteArrayInputStream(bytes); } }