package com.intellectualcrafters.json; /** * Kim makes immutable eight bit Unicode strings. If the MSB of a byte is set, then the next byte is a continuation * byte. The last byte of a character never has the MSB reset. Every byte that is not the last byte has the MSB set. Kim * stands for "Keep it minimal". A Unicode character is never longer than 3 bytes. Every byte contributes 7 bits to the * character. ASCII is unmodified. * * Kim UTF-8 one byte U+007F U+007F two bytes U+3FFF U+07FF three bytes U+10FFF U+FFFF four bytes U+10FFFF * * Characters in the ranges U+0800..U+3FFF and U+10000..U+10FFFF will be one byte smaller when encoded in Kim compared * to UTF-8. * * Kim is beneficial when using scripts such as Old South Arabian, Aramaic, Avestan, Balinese, Batak, Bopomofo, * Buginese, Buhid, Carian, Cherokee, Coptic, Cyrillic, Deseret, Egyptian Hieroglyphs, Ethiopic, Georgian, Glagolitic, * Gothic, Hangul Jamo, Hanunoo, Hiragana, Kanbun, Kaithi, Kannada, Katakana, Kharoshthi, Khmer, Lao, Lepcha, Limbu, * Lycian, Lydian, Malayalam, Mandaic, Meroitic, Miao, Mongolian, Myanmar, New Tai Lue, Ol Chiki, Old Turkic, Oriya, * Osmanya, Pahlavi, Parthian, Phags-Pa, Phoenician, Samaritan, Sharada, Sinhala, Sora Sompeng, Tagalog, Tagbanwa, * Takri, Tai Le, Tai Tham, Tamil, Telugu, Thai, Tibetan, Tifinagh, UCAS. * * A kim object can be constructed from an ordinary UTF-16 string, or from a byte array. A kim object can produce a * UTF-16 string. * * As with UTF-8, it is possible to detect character boundaries within a byte sequence. UTF-8 is one of the world's * great inventions. While Kim is more efficient, it is not clear that it is worth the expense of transition. * * @version 2013-04-18 */ public class Kim { /** * The number of bytes in the kim. The number of bytes can be as much as three times the number of characters. */ public int length = 0; /** * The byte array containing the kim's content. */ private byte[] bytes = null; /** * The kim's hashcode, conforming to Java's hashcode conventions. */ private int hashcode = 0; /** * The memoization of toString(). */ private String string = null; /** * Make a kim from a portion of a byte array. * * @param bytes A byte array. * @param from The index of the first byte. * @param thru The index of the last byte plus one. */ public Kim(final byte[] bytes, final int from, final int thru) { // As the bytes are copied into the new kim, a hashcode is computed // using a // modified Fletcher code. int sum = 1; int value; hashcode = 0; length = thru - from; if (length > 0) { this.bytes = new byte[length]; for (int at = 0; at < length; at += 1) { value = bytes[at + from] & 0xFF; sum += value; hashcode += sum; this.bytes[at] = (byte) value; } hashcode += sum << 16; } } /** * Make a kim from a byte array. * * @param bytes The byte array. * @param length The number of bytes. */ public Kim(final byte[] bytes, final int length) { this(bytes, 0, length); } /** * Make a new kim from a substring of an existing kim. The coordinates are in byte units, not character units. * * @param kim The source of bytes. * @param from The point at which to take bytes. * @param thru The point at which to stop taking bytes. */ public Kim(final Kim kim, final int from, final int thru) { this(kim.bytes, from, thru); } /** * Make a kim from a string. * * @param string The string. * * @throws JSONException if surrogate pair mismatch. */ public Kim(final String string) throws JSONException { final int stringLength = string.length(); hashcode = 0; length = 0; // First pass: Determine the length of the kim, allowing for the UTF-16 // to UTF-32 conversion, and then the UTF-32 to Kim conversion. if (stringLength > 0) { for (int i = 0; i < stringLength; i += 1) { final int c = string.charAt(i); if (c <= 0x7F) { length += 1; } else if (c <= 0x3FFF) { length += 2; } else { if ((c >= 0xD800) && (c <= 0xDFFF)) { i += 1; final int d = string.charAt(i); if ((c > 0xDBFF) || (d < 0xDC00) || (d > 0xDFFF)) { throw new JSONException("Bad UTF16"); } } length += 3; } } // Second pass: Allocate a byte array and fill that array with the // conversion // while computing the hashcode. bytes = new byte[length]; int at = 0; int b; int sum = 1; for (int i = 0; i < stringLength; i += 1) { int character = string.charAt(i); if (character <= 0x7F) { bytes[at] = (byte) character; sum += character; hashcode += sum; at += 1; } else if (character <= 0x3FFF) { b = 0x80 | (character >>> 7); bytes[at] = (byte) b; sum += b; hashcode += sum; at += 1; b = character & 0x7F; bytes[at] = (byte) b; sum += b; hashcode += sum; at += 1; } else { if ((character >= 0xD800) && (character <= 0xDBFF)) { i += 1; character = (((character & 0x3FF) << 10) | (string.charAt(i) & 0x3FF)) + 65536; } b = 0x80 | (character >>> 14); bytes[at] = (byte) b; sum += b; hashcode += sum; at += 1; b = 0x80 | ((character >>> 7) & 0xFF); bytes[at] = (byte) b; sum += b; hashcode += sum; at += 1; b = character & 0x7F; bytes[at] = (byte) b; sum += b; hashcode += sum; at += 1; } } hashcode += sum << 16; } } /** * Returns the number of bytes needed to contain the character in Kim format. * * @param character a Unicode character between 0 and 0x10FFFF. * * @return 1, 2, or 3 * * @throws JSONException if the character is not representable in a kim. */ public static int characterSize(final int character) throws JSONException { if ((character < 0) || (character > 0x10FFFF)) { throw new JSONException("Bad character " + character); } return character <= 0x7F ? 1 : character <= 0x3FFF ? 2 : 3; } /** * Returns the character at the specified index. The index refers to byte values and ranges from 0 to length - 1. * The index of the next character is at index + Kim.characterSize(kim.characterAt(index)). * * @param at the index of the char value. The first character is at 0. * * @throws JSONException if at does not point to a valid character. * @return a Unicode character between 0 and 0x10FFFF. */ public int characterAt(final int at) throws JSONException { final int c = get(at); if ((c & 0x80) == 0) { return c; } int character; final int c1 = get(at + 1); if ((c1 & 0x80) == 0) { character = ((c & 0x7F) << 7) | c1; if (character > 0x7F) { return character; } } else { final int c2 = get(at + 2); character = ((c & 0x7F) << 14) | ((c1 & 0x7F) << 7) | c2; if (((c2 & 0x80) == 0) && (character > 0x3FFF) && (character <= 0x10FFFF) && ((character < 0xD800) || (character > 0xDFFF))) { return character; } } throw new JSONException("Bad character at " + at); } /** * Copy the contents of this kim to a byte array. * * @param bytes A byte array of sufficient size. * @param at The position within the byte array to take the byes. * * @return The position immediately after the copy. */ public int copy(final byte[] bytes, final int at) { System.arraycopy(this.bytes, 0, bytes, at, length); return at + length; } /** * Two kim objects containing exactly the same bytes in the same order are equal to each other. * * @param obj the other kim with which to compare. * * @return true if this and obj are both kim objects containing identical byte sequences. */ @Override public boolean equals(final Object obj) { if (!(obj instanceof Kim)) { return false; } final Kim that = (Kim) obj; if (this == that) { return true; } if (hashcode != that.hashcode) { return false; } return java.util.Arrays.equals(bytes, that.bytes); } /** * Get a byte from a kim. * * @param at The position of the byte. The first byte is at 0. * * @return The byte. * * @throws JSONException if there is no byte at that position. */ public int get(final int at) throws JSONException { if ((at < 0) || (at > length)) { throw new JSONException("Bad character at " + at); } return (bytes[at]) & 0xFF; } /** * Returns a hash code value for the kim. */ @Override public int hashCode() { return hashcode; } /** * Produce a UTF-16 String from this kim. The number of codepoints in the string will not be greater than the number * of bytes in the kim, although it could be less. * * @return The string. A kim memoizes its string representation. * * @throws JSONException if the kim is not valid. */ @Override public String toString() throws JSONException { if (string == null) { int c; int length = 0; final char chars[] = new char[this.length]; for (int at = 0; at < this.length; at += characterSize(c)) { c = characterAt(at); if (c < 0x10000) { chars[length] = (char) c; length += 1; } else { chars[length] = (char) (0xD800 | ((c - 0x10000) >>> 10)); length += 1; chars[length] = (char) (0xDC00 | (c & 0x03FF)); length += 1; } } string = new String(chars, 0, length); } return string; } }