package yuku.alkitab.io; import yuku.bintex.BintexReader; import java.io.IOException; /** * This class is an intentionally incomplete UTF-8 decoder that only supports up to U+FFFF. */ public class Utf8Decoder { public static char[] buf = new char[1000]; static ThreadLocal<byte[]> byte_buf_ = new ThreadLocal<byte[]>() { @Override protected byte[] initialValue() { return new byte[1000]; } }; static ThreadLocal<char[]> char_buf_ = new ThreadLocal<char[]>() { @Override protected char[] initialValue() { return new char[8000]; } }; public static String toString(byte[] ba) { return toString(ba, 0, ba.length); } public static String toString(byte[] ba, int start, int length) { if (buf.length < ba.length) { buf = new char[ba.length + 1000]; } int pos = 0; try { for (int i = start; i < start + length; i++) { int c0 = ba[i] & 0xff; if (c0 < 0x80) { // input 1 byte, output 7 bit buf[pos++] = (char) c0; continue; } i++; int c1 = ba[i] & 0xff; if (c0 < 0xe0) { // input 2 byte, output 5+6 = 11 bit buf[pos++] = (char) (((c0 & 0x1f) << 6) | (c1 & 0x3f)); continue; } i++; int c2 = ba[i] & 0xff; // input 3 byte, output 4+6+6 = 16 bit buf[pos++] = (char) (((c0 & 0x0f) << 12) | ((c1 & 0x3f) << 6) | (c2 & 0x3f)); } } catch (ArrayIndexOutOfBoundsException e) { // biarin } return new String(buf, 0, pos); } public static String toStringLowerCase(byte[] ba) { return toStringLowerCase(ba, 0, ba.length); } public static String toStringLowerCase(byte[] ba, int start, int length) { if (buf.length < ba.length) { buf = new char[ba.length + 1000]; } int pos = 0; try { for (int i = start; i < start + length; i++) { int c0 = ba[i] & 0xff; if (c0 < 0x80) { // input 1 byte, output 7 bit if (c0 >= 'A' && c0 <= 'Z') { buf[pos++] = (char) (c0 | 0x20); } else { buf[pos++] = (char) c0; } continue; } i++; int c1 = ba[i] & 0xff; if (c0 < 0xe0) { // input 2 byte, output 5+6 = 11 bit final int c = ((c0 & 0x1f) << 6) | (c1 & 0x3f); buf[pos++] = (char) Character.toLowerCase(c); continue; } i++; int c2 = ba[i] & 0xff; { // input 3 byte, output 4+6+6 = 16 bit final int c = ((c0 & 0x0f) << 12) | ((c1 & 0x3f) << 6) | (c2 & 0x3f); buf[pos++] = (char) Character.toLowerCase(c); } } } catch (ArrayIndexOutOfBoundsException e) { // biarin } return new String(buf, 0, pos); } public static String toStringFromVersesWithPrependedLengths(BintexReader br, int verse_count, boolean lowercased) throws IOException { byte[] byte_buf = byte_buf_.get(); char[] char_buf = char_buf_.get(); int char_pos = 0; for (int v = 0; v < verse_count; v++) { int verse_len = br.readVarUint(); if (verse_len > byte_buf.length) { byte_buf = new byte[verse_len + 100]; byte_buf_.set(byte_buf); } int will_need_char_len = char_pos + verse_len + 1 /*for separator*/; if (will_need_char_len > char_buf.length) { final char[] new_char_buf = new char[will_need_char_len + 1000]; System.arraycopy(char_buf, 0, new_char_buf, 0, char_buf.length); char_buf = new_char_buf; char_buf_.set(char_buf); } br.readRaw(byte_buf, 0, verse_len); for (int i = 0; i < verse_len; i++) { int c0 = byte_buf[i] & 0xff; if (c0 < 0x80) { // input 1 byte, output 7 bit if (lowercased && (c0 >= 'A' && c0 <= 'Z')) { char_buf[char_pos++] = (char) (c0 | 0x20); } else { char_buf[char_pos++] = (char) c0; } continue; } i++; int c1 = byte_buf[i] & 0xff; if (c0 < 0xe0) { // input 2 byte, output 5+6 = 11 bit final int c = (((c0 & 0x1f) << 6) | (c1 & 0x3f)); if (!lowercased) { char_buf[char_pos++] = (char) c; } else { char_buf[char_pos++] = (char) Character.toLowerCase(c); } continue; } i++; int c2 = byte_buf[i] & 0xff; { // input 3 byte, output 4+6+6 = 16 bit final int c = ((c0 & 0x0f) << 12) | ((c1 & 0x3f) << 6) | (c2 & 0x3f); if (!lowercased) { char_buf[char_pos++] = (char) c; } else { char_buf[char_pos++] = (char) Character.toLowerCase(c); } } } // verse separator char_buf[char_pos++] = '\n'; } return new String(char_buf, 0, char_pos); } }