/* * Copyright 2015 Google Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package javaemul.internal; import java.nio.charset.Charset; /** * Provides Charset implementations. */ public abstract class EmulatedCharset extends Charset { public static final EmulatedCharset UTF_8 = new UtfCharset("UTF-8"); public static final EmulatedCharset ISO_LATIN_1 = new LatinCharset("ISO-LATIN-1"); public static final EmulatedCharset ISO_8859_1 = new LatinCharset("ISO-8859-1"); private static class LatinCharset extends EmulatedCharset { public LatinCharset(String name) { super(name); } @Override public byte[] getBytes(String str) { int n = str.length(); byte[] bytes = new byte[n]; for (int i = 0; i < n; ++i) { bytes[i] = (byte) (str.charAt(i) & 255); } return bytes; } @Override public char[] decodeString(byte[] bytes, int ofs, int len) { char[] chars = new char[len]; for (int i = 0; i < len; ++i) { chars[i] = (char) (bytes[ofs + i] & 255); } return chars; } } private static class UtfCharset extends EmulatedCharset { public UtfCharset(String name) { super(name); } @Override public char[] decodeString(byte[] bytes, int ofs, int len) { // TODO(jat): consider using decodeURIComponent(escape(bytes)) instead int charCount = 0; for (int i = 0; i < len; ) { ++charCount; byte ch = bytes[ofs + i]; if ((ch & 0xC0) == 0x80) { throw new IllegalArgumentException("Invalid UTF8 sequence"); } else if ((ch & 0x80) == 0) { ++i; } else if ((ch & 0xE0) == 0xC0) { i += 2; } else if ((ch & 0xF0) == 0xE0) { i += 3; } else if ((ch & 0xF8) == 0xF0) { i += 4; } else { // no 5+ byte sequences since max codepoint is less than 2^21 throw new IllegalArgumentException("Invalid UTF8 sequence"); } if (i > len) { throw new IndexOutOfBoundsException("Invalid UTF8 sequence"); } } char[] chars = new char[charCount]; int outIdx = 0; int count = 0; for (int i = 0; i < len; ) { int ch = bytes[ofs + i++]; if ((ch & 0x80) == 0) { count = 1; ch &= 127; } else if ((ch & 0xE0) == 0xC0) { count = 2; ch &= 31; } else if ((ch & 0xF0) == 0xE0) { count = 3; ch &= 15; } else if ((ch & 0xF8) == 0xF0) { count = 4; ch &= 7; } else if ((ch & 0xFC) == 0xF8) { count = 5; ch &= 3; } while (--count > 0) { byte b = bytes[ofs + i++]; if ((b & 0xC0) != 0x80) { throw new IllegalArgumentException("Invalid UTF8 sequence at " + (ofs + i - 1) + ", byte=" + Integer.toHexString(b)); } ch = (ch << 6) | (b & 63); } outIdx += Character.toChars(ch, chars, outIdx); } return chars; } @Override public byte[] getBytes(String str) { // TODO(jat): consider using unescape(encodeURIComponent(bytes)) instead int n = str.length(); int byteCount = 0; for (int i = 0; i < n;) { int ch = str.codePointAt(i); i += Character.charCount(ch); if (ch < (1 << 7)) { byteCount++; } else if (ch < (1 << 11)) { byteCount += 2; } else if (ch < (1 << 16)) { byteCount += 3; } else if (ch < (1 << 21)) { byteCount += 4; } else if (ch < (1 << 26)) { byteCount += 5; } } byte[] bytes = new byte[byteCount]; int out = 0; for (int i = 0; i < n;) { int ch = str.codePointAt(i); i += Character.charCount(ch); out += encodeUtf8(bytes, out, ch); } return bytes; } /** * Encode a single character in UTF8. * * @param bytes byte array to store character in * @param ofs offset into byte array to store first byte * @param codePoint character to encode * @return number of bytes consumed by encoding the character * @throws IllegalArgumentException if codepoint >= 2^26 */ private int encodeUtf8(byte[] bytes, int ofs, int codePoint) { if (codePoint < (1 << 7)) { bytes[ofs] = (byte) (codePoint & 127); return 1; } else if (codePoint < (1 << 11)) { // 110xxxxx 10xxxxxx bytes[ofs++] = (byte) (((codePoint >> 6) & 31) | 0xC0); bytes[ofs] = (byte) ((codePoint & 63) | 0x80); return 2; } else if (codePoint < (1 << 16)) { // 1110xxxx 10xxxxxx 10xxxxxx bytes[ofs++] = (byte) (((codePoint >> 12) & 15) | 0xE0); bytes[ofs++] = (byte) (((codePoint >> 6) & 63) | 0x80); bytes[ofs] = (byte) ((codePoint & 63) | 0x80); return 3; } else if (codePoint < (1 << 21)) { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx bytes[ofs++] = (byte) (((codePoint >> 18) & 7) | 0xF0); bytes[ofs++] = (byte) (((codePoint >> 12) & 63) | 0x80); bytes[ofs++] = (byte) (((codePoint >> 6) & 63) | 0x80); bytes[ofs] = (byte) ((codePoint & 63) | 0x80); return 4; } else if (codePoint < (1 << 26)) { // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx bytes[ofs++] = (byte) (((codePoint >> 24) & 3) | 0xF8); bytes[ofs++] = (byte) (((codePoint >> 18) & 63) | 0x80); bytes[ofs++] = (byte) (((codePoint >> 12) & 63) | 0x80); bytes[ofs++] = (byte) (((codePoint >> 6) & 63) | 0x80); bytes[ofs] = (byte) ((codePoint & 63) | 0x80); return 5; } throw new IllegalArgumentException("Character out of range: " + codePoint); } } public EmulatedCharset(String name) { super(name, null); } public abstract byte[] getBytes(String string); public abstract char[] decodeString(byte[] bytes, int ofs, int len); }