package com.intellectualcrafters.json;
/**
* Kim makes immutable eight bit Unicode strings. If the MSB of a byte is set, then the next byte is a continuation
* byte. The last byte of a character never has the MSB reset. Every byte that is not the last byte has the MSB set. Kim
* stands for "Keep it minimal". A Unicode character is never longer than 3 bytes. Every byte contributes 7 bits to the
* character. ASCII is unmodified.
*
* Kim UTF-8 one byte U+007F U+007F two bytes U+3FFF U+07FF three bytes U+10FFF U+FFFF four bytes U+10FFFF
*
* Characters in the ranges U+0800..U+3FFF and U+10000..U+10FFFF will be one byte smaller when encoded in Kim compared
* to UTF-8.
*
* Kim is beneficial when using scripts such as Old South Arabian, Aramaic, Avestan, Balinese, Batak, Bopomofo,
* Buginese, Buhid, Carian, Cherokee, Coptic, Cyrillic, Deseret, Egyptian Hieroglyphs, Ethiopic, Georgian, Glagolitic,
* Gothic, Hangul Jamo, Hanunoo, Hiragana, Kanbun, Kaithi, Kannada, Katakana, Kharoshthi, Khmer, Lao, Lepcha, Limbu,
* Lycian, Lydian, Malayalam, Mandaic, Meroitic, Miao, Mongolian, Myanmar, New Tai Lue, Ol Chiki, Old Turkic, Oriya,
* Osmanya, Pahlavi, Parthian, Phags-Pa, Phoenician, Samaritan, Sharada, Sinhala, Sora Sompeng, Tagalog, Tagbanwa,
* Takri, Tai Le, Tai Tham, Tamil, Telugu, Thai, Tibetan, Tifinagh, UCAS.
*
* A kim object can be constructed from an ordinary UTF-16 string, or from a byte array. A kim object can produce a
* UTF-16 string.
*
* As with UTF-8, it is possible to detect character boundaries within a byte sequence. UTF-8 is one of the world's
* great inventions. While Kim is more efficient, it is not clear that it is worth the expense of transition.
*
* @version 2013-04-18
*/
public class Kim {
/**
* The number of bytes in the kim. The number of bytes can be as much as three times the number of characters.
*/
public int length = 0;
/**
* The byte array containing the kim's content.
*/
private byte[] bytes = null;
/**
* The kim's hashcode, conforming to Java's hashcode conventions.
*/
private int hashcode = 0;
/**
* The memoization of toString().
*/
private String string = null;
/**
* Make a kim from a portion of a byte array.
*
* @param bytes A byte array.
* @param from The index of the first byte.
* @param thru The index of the last byte plus one.
*/
public Kim(final byte[] bytes, final int from, final int thru) {
// As the bytes are copied into the new kim, a hashcode is computed
// using a
// modified Fletcher code.
int sum = 1;
int value;
hashcode = 0;
length = thru - from;
if (length > 0) {
this.bytes = new byte[length];
for (int at = 0; at < length; at += 1) {
value = bytes[at + from] & 0xFF;
sum += value;
hashcode += sum;
this.bytes[at] = (byte) value;
}
hashcode += sum << 16;
}
}
/**
* Make a kim from a byte array.
*
* @param bytes The byte array.
* @param length The number of bytes.
*/
public Kim(final byte[] bytes, final int length) {
this(bytes, 0, length);
}
/**
* Make a new kim from a substring of an existing kim. The coordinates are in byte units, not character units.
*
* @param kim The source of bytes.
* @param from The point at which to take bytes.
* @param thru The point at which to stop taking bytes.
*/
public Kim(final Kim kim, final int from, final int thru) {
this(kim.bytes, from, thru);
}
/**
* Make a kim from a string.
*
* @param string The string.
*
* @throws JSONException if surrogate pair mismatch.
*/
public Kim(final String string) throws JSONException {
final int stringLength = string.length();
hashcode = 0;
length = 0;
// First pass: Determine the length of the kim, allowing for the UTF-16
// to UTF-32 conversion, and then the UTF-32 to Kim conversion.
if (stringLength > 0) {
for (int i = 0; i < stringLength; i += 1) {
final int c = string.charAt(i);
if (c <= 0x7F) {
length += 1;
} else if (c <= 0x3FFF) {
length += 2;
} else {
if ((c >= 0xD800) && (c <= 0xDFFF)) {
i += 1;
final int d = string.charAt(i);
if ((c > 0xDBFF) || (d < 0xDC00) || (d > 0xDFFF)) {
throw new JSONException("Bad UTF16");
}
}
length += 3;
}
}
// Second pass: Allocate a byte array and fill that array with the
// conversion
// while computing the hashcode.
bytes = new byte[length];
int at = 0;
int b;
int sum = 1;
for (int i = 0; i < stringLength; i += 1) {
int character = string.charAt(i);
if (character <= 0x7F) {
bytes[at] = (byte) character;
sum += character;
hashcode += sum;
at += 1;
} else if (character <= 0x3FFF) {
b = 0x80 | (character >>> 7);
bytes[at] = (byte) b;
sum += b;
hashcode += sum;
at += 1;
b = character & 0x7F;
bytes[at] = (byte) b;
sum += b;
hashcode += sum;
at += 1;
} else {
if ((character >= 0xD800) && (character <= 0xDBFF)) {
i += 1;
character = (((character & 0x3FF) << 10) | (string.charAt(i) & 0x3FF)) + 65536;
}
b = 0x80 | (character >>> 14);
bytes[at] = (byte) b;
sum += b;
hashcode += sum;
at += 1;
b = 0x80 | ((character >>> 7) & 0xFF);
bytes[at] = (byte) b;
sum += b;
hashcode += sum;
at += 1;
b = character & 0x7F;
bytes[at] = (byte) b;
sum += b;
hashcode += sum;
at += 1;
}
}
hashcode += sum << 16;
}
}
/**
* Returns the number of bytes needed to contain the character in Kim format.
*
* @param character a Unicode character between 0 and 0x10FFFF.
*
* @return 1, 2, or 3
*
* @throws JSONException if the character is not representable in a kim.
*/
public static int characterSize(final int character) throws JSONException {
if ((character < 0) || (character > 0x10FFFF)) {
throw new JSONException("Bad character " + character);
}
return character <= 0x7F ? 1 : character <= 0x3FFF ? 2 : 3;
}
/**
* Returns the character at the specified index. The index refers to byte values and ranges from 0 to length - 1.
* The index of the next character is at index + Kim.characterSize(kim.characterAt(index)).
*
* @param at the index of the char value. The first character is at 0.
*
* @throws JSONException if at does not point to a valid character.
* @return a Unicode character between 0 and 0x10FFFF.
*/
public int characterAt(final int at) throws JSONException {
final int c = get(at);
if ((c & 0x80) == 0) {
return c;
}
int character;
final int c1 = get(at + 1);
if ((c1 & 0x80) == 0) {
character = ((c & 0x7F) << 7) | c1;
if (character > 0x7F) {
return character;
}
} else {
final int c2 = get(at + 2);
character = ((c & 0x7F) << 14) | ((c1 & 0x7F) << 7) | c2;
if (((c2 & 0x80) == 0) && (character > 0x3FFF) && (character <= 0x10FFFF) && ((character < 0xD800) || (character > 0xDFFF))) {
return character;
}
}
throw new JSONException("Bad character at " + at);
}
/**
* Copy the contents of this kim to a byte array.
*
* @param bytes A byte array of sufficient size.
* @param at The position within the byte array to take the byes.
*
* @return The position immediately after the copy.
*/
public int copy(final byte[] bytes, final int at) {
System.arraycopy(this.bytes, 0, bytes, at, length);
return at + length;
}
/**
* Two kim objects containing exactly the same bytes in the same order are equal to each other.
*
* @param obj the other kim with which to compare.
*
* @return true if this and obj are both kim objects containing identical byte sequences.
*/
@Override
public boolean equals(final Object obj) {
if (!(obj instanceof Kim)) {
return false;
}
final Kim that = (Kim) obj;
if (this == that) {
return true;
}
if (hashcode != that.hashcode) {
return false;
}
return java.util.Arrays.equals(bytes, that.bytes);
}
/**
* Get a byte from a kim.
*
* @param at The position of the byte. The first byte is at 0.
*
* @return The byte.
*
* @throws JSONException if there is no byte at that position.
*/
public int get(final int at) throws JSONException {
if ((at < 0) || (at > length)) {
throw new JSONException("Bad character at " + at);
}
return (bytes[at]) & 0xFF;
}
/**
* Returns a hash code value for the kim.
*/
@Override
public int hashCode() {
return hashcode;
}
/**
* Produce a UTF-16 String from this kim. The number of codepoints in the string will not be greater than the number
* of bytes in the kim, although it could be less.
*
* @return The string. A kim memoizes its string representation.
*
* @throws JSONException if the kim is not valid.
*/
@Override
public String toString() throws JSONException {
if (string == null) {
int c;
int length = 0;
final char chars[] = new char[this.length];
for (int at = 0; at < this.length; at += characterSize(c)) {
c = characterAt(at);
if (c < 0x10000) {
chars[length] = (char) c;
length += 1;
} else {
chars[length] = (char) (0xD800 | ((c - 0x10000) >>> 10));
length += 1;
chars[length] = (char) (0xDC00 | (c & 0x03FF));
length += 1;
}
}
string = new String(chars, 0, length);
}
return string;
}
}