/*
* Copyright 2000 Finn Bock
*
* This program contains material copyrighted by:
* Copyright (c) Corporation for National Research Initiatives.
* Originally written by Marc-Andre Lemburg (mal@lemburg.com).
*/
package org.python.core;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Iterator;
import org.python.core.util.StringUtil;
/**
* Contains the implementation of the builtin codecs.
* @since Jython 2.0
*/
public class codecs {
/** Name of the error handler that substitutes backslash escapes for unencodable text. */
public static final String BACKSLASHREPLACE = "backslashreplace";
/** Name of the error handler that drops the offending input. */
public static final String IGNORE = "ignore";
/** Name of the error handler that substitutes a replacement character. */
public static final String REPLACE = "replace";
/** Name of the error handler that substitutes XML character references. */
public static final String XMLCHARREFREPLACE = "xmlcharrefreplace";
/** U+FFFD, appended by the "replace" handling when input cannot be decoded. */
private static char Py_UNICODE_REPLACEMENT_CHARACTER = 0xFFFD;
/** Registered codec search functions, consulted in order by lookup(). */
private static PyList searchPath;
/** Cache from normalized encoding name to the tuple a search function returned. */
private static PyStringMap searchCache;
/** Registered error handlers keyed by interned handler name. */
private static PyStringMap errorHandlers;
/** Used to synchronize registry_init. */
private static final Object INIT_LOCK = new Object();
/** Encoding used by decode() and encode() when the caller passes none; initially "ascii". */
private static String default_encoding = "ascii";
/**
 * Returns the name of the current default encoding.
 *
 * @return the value last stored by setDefaultEncoding, or "ascii" if it was never changed
 */
public static String getDefaultEncoding() {
return default_encoding;
}
/**
 * Changes the default encoding.
 *
 * @param encoding name of the new default encoding
 */
public static void setDefaultEncoding(String encoding) {
// lookup() throws when the encoding cannot be resolved, so the default is only
// replaced by a name the registry knows about.
lookup(encoding);
default_encoding = encoding;
}
/**
 * Returns the error handler registered under the given name.
 *
 * @param handlerName name of the handler; {@code null} selects "strict"
 * @return the registered handler object
 * @throws PyException LookupError when no handler is registered under that name
 */
public static PyObject lookup_error(String handlerName) {
    registry_init();
    String name = (handlerName != null) ? handlerName : "strict";
    PyObject handler = errorHandlers.__finditem__(name.intern());
    if (handler != null) {
        return handler;
    }
    throw new PyException(Py.LookupError,
            "unknown error handler name '" + name + "'");
}
/**
 * Registers (or replaces) an error handler.
 *
 * @param name  name under which the handler is stored
 * @param error the handler; must be callable
 * @throws PyException TypeError when {@code error} is not callable
 */
public static void register_error(String name, PyObject error) {
    registry_init();
    if (error.isCallable()) {
        errorHandlers.__setitem__(name.intern(), error);
        return;
    }
    throw Py.TypeError("argument must be callable");
}
/**
 * Appends a codec search function to the search path used by lookup().
 *
 * @param search_function the function to add; must be callable
 * @throws PyException TypeError when {@code search_function} is not callable
 */
public static void register(PyObject search_function) {
    registry_init();
    if (search_function.isCallable()) {
        searchPath.append(search_function);
        return;
    }
    throw Py.TypeError("argument must be callable");
}
/**
 * Resolves an encoding name to the 4-tuple supplied by a registered search function.
 * The name is normalized first; successful results are cached under the normalized name.
 *
 * @param encoding the encoding name as given by the caller
 * @return the cached or freshly found codec tuple
 * @throws PyException LookupError when no search function is registered or none
 *         recognises the encoding; TypeError when a search function returns
 *         something other than None or a 4-tuple
 */
public static PyTuple lookup(String encoding) {
    registry_init();
    PyString key = new PyString(normalizestring(encoding));

    PyObject hit = searchCache.__finditem__(key);
    if (hit != null) {
        return (PyTuple) hit;
    }

    if (searchPath.__len__() == 0) {
        throw new PyException(Py.LookupError,
                "no codec search functions registered: can't find encoding '" + encoding + "'");
    }

    for (PyObject searchFunction : searchPath.asIterable()) {
        PyObject candidate = searchFunction.__call__(key);
        if (candidate != Py.None) {
            boolean fourTuple = candidate instanceof PyTuple && candidate.__len__() == 4;
            if (!fourTuple) {
                throw Py.TypeError("codec search functions must return 4-tuples");
            }
            searchCache.__setitem__(key, candidate);
            return (PyTuple) candidate;
        }
    }
    throw new PyException(Py.LookupError, "unknown encoding '" + encoding + "'");
}
/**
 * Normalizes an encoding name: lower-cases it and replaces spaces with hyphens.
 *
 * @param string the encoding name to normalize
 * @return the normalized name
 */
private static String normalizestring(String string) {
    // Lower-case with a fixed locale: with the default locale, a Turkish environment
    // maps 'I' to dotless 'ı', turning e.g. "LATIN-1" into a name no codec matches.
    return string.toLowerCase(java.util.Locale.ENGLISH).replace(' ', '-');
}
/** Set once import_encodings() has been attempted, whether or not the import succeeded. */
private static boolean import_encodings_called;

/**
 * Loads the "encodings" module the first time it is called; later calls do nothing.
 * An ImportError from the load is swallowed, any other Python exception propagates.
 */
private static void import_encodings() {
    if (import_encodings_called) {
        return;
    }
    // Flag first, so a failing import is not retried on the next call.
    import_encodings_called = true;
    try {
        imp.load("encodings");
    } catch (PyException exc) {
        boolean importFailure = exc.type == Py.ImportError;
        if (!importFailure) {
            throw exc;
        }
    }
}
/**
 * Decodes a byte string with the named codec.
 *
 * @param v        the string to decode
 * @param encoding codec name, normalized before use; {@code null} selects the default encoding
 * @param errors   error handler name, or {@code null} to let the codec use its default
 * @return the decoded object (first element of the decoder's result tuple)
 * @throws PyException LookupError when the codec is unknown and has no built-in fallback;
 *         TypeError when the decoder does not return a 2-tuple
 */
public static PyObject decode(PyString v, String encoding, String errors) {
    String codecName = (encoding == null) ? getDefaultEncoding() : normalizestring(encoding);
    String handler = (errors == null) ? null : errors.intern();

    // ASCII never goes through the registry.
    if (codecName.equals("ascii")) {
        return wrapDecodeResult(PyUnicode_DecodeASCII(v.toString(), v.__len__(), handler));
    }

    PyObject decoder;
    try {
        decoder = lookup(codecName).__getitem__(1);
    } catch (PyException ex) {
        if (!ex.match(Py.LookupError)) {
            throw ex;
        }
        // The registry has no such codec: fall back on the built-in implementations.
        if (codecName.equals("utf-8")) {
            return wrapDecodeResult(PyUnicode_DecodeUTF8(v.toString(), handler));
        }
        if (codecName.equals("utf-7")) {
            return wrapDecodeResult(PyUnicode_DecodeUTF7(v.toString(), handler));
        }
        if (codecName.equals("latin-1")) {
            return wrapDecodeResult(PyUnicode_DecodeLatin1(v.toString(), v.__len__(),
                    handler));
        }
        throw ex;
    }

    PyObject result = (handler == null)
            ? decoder.__call__(v)
            : decoder.__call__(v, new PyString(handler));
    boolean pair = result instanceof PyTuple && result.__len__() == 2;
    if (!pair) {
        throw Py.TypeError("decoder must return a tuple (object,integer)");
    }
    return result.__getitem__(0);
}
/**
 * Wraps the text produced by one of the built-in decoders in a PyUnicode.
 *
 * @param result the decoded text
 * @return a new PyUnicode built with the two-argument (String, boolean) constructor
 */
private static PyUnicode wrapDecodeResult(String result) {
return new PyUnicode(result, true);
}
/**
 * Encodes a string with the named codec.
 *
 * @param v        the string to encode
 * @param encoding codec name, normalized before use; {@code null} selects the default encoding
 * @param errors   error handler name, or {@code null} to let the codec use its default
 * @return the encoded text
 * @throws PyException LookupError when the codec is unknown and has no built-in fallback;
 *         TypeError when the encoder's result is not a 2-tuple starting with a string
 */
public static String encode(PyString v, String encoding,
        String errors) {
    String codecName = (encoding == null) ? getDefaultEncoding() : normalizestring(encoding);
    String handler = (errors == null) ? null : errors.intern();

    /* Shortcuts for common default encodings. latin-1 must not use the
     * lookup registry for the encodings module to work correctly */
    if (codecName.equals("latin-1")) {
        return PyUnicode_EncodeLatin1(v.toString(), v.__len__(), handler);
    }
    if (codecName.equals("ascii")) {
        return PyUnicode_EncodeASCII(v.toString(), v.__len__(), handler);
    }

    PyObject encoder;
    try {
        encoder = lookup(codecName).__getitem__(0);
    } catch (PyException ex) {
        if (ex.match(Py.LookupError)) {
            // The registry has no such codec: fall back on the built-in implementations.
            if (codecName.equals("utf-8")) {
                return PyUnicode_EncodeUTF8(v.toString(), handler);
            }
            if (codecName.equals("utf-7")) {
                return codecs.PyUnicode_EncodeUTF7(v.toString(), false, false, handler);
            }
        }
        throw ex;
    }

    PyObject result = (handler == null)
            ? encoder.__call__(v)
            : encoder.__call__(v, new PyString(handler));
    if (!(result instanceof PyTuple) || result.__len__() != 2) {
        throw Py.TypeError("encoder must return a tuple (object,integer)");
    }

    PyObject encoded = result.__getitem__(0);
    if (!(encoded instanceof PyString)) {
        throw Py.TypeError("encoder did not return a string/unicode object (type="
                + encoded.getType().fastGetName() + ")");
    }
    return encoded.toString();
}
/**
 * The "strict" error handler: re-raises the Unicode error it is handed.
 *
 * @param args positional arguments; the first is the exception instance ("exc")
 * @param kws  keyword names
 * @return never returns normally
 * @throws PyException the matching Unicode error, or TypeError for any other object
 */
public static PyObject strict_errors(PyObject[] args, String[] kws) {
    PyObject exc = new ArgParser("strict_errors", args, kws, "exc").getPyObject(0);
    if (Py.isInstance(exc, Py.UnicodeDecodeError)) {
        throw new PyException(Py.UnicodeDecodeError, exc);
    }
    if (Py.isInstance(exc, Py.UnicodeEncodeError)) {
        throw new PyException(Py.UnicodeEncodeError, exc);
    }
    if (Py.isInstance(exc, Py.UnicodeTranslateError)) {
        throw new PyException(Py.UnicodeTranslateError, exc);
    }
    throw wrong_exception_type(exc);
}
/**
 * The "ignore" error handler: replaces the offending span with nothing.
 *
 * @param args positional arguments; the first is the exception instance ("exc")
 * @param kws  keyword names
 * @return a tuple of the empty string and the exception's "end" attribute
 * @throws PyException TypeError when the argument is not a Unicode error
 */
public static PyObject ignore_errors(PyObject[] args, String[] kws) {
    PyObject exc = new ArgParser("ignore_errors", args, kws, "exc").getPyObject(0);
    if (isUnicodeError(exc)) {
        PyObject resumeAt = exc.__getattr__("end");
        return new PyTuple(Py.java2py(""), resumeAt);
    }
    throw wrong_exception_type(exc);
}
/**
 * Tells whether the object is an instance of one of the three Unicode error types
 * (decode, encode or translate).
 */
private static boolean isUnicodeError(PyObject exc) {
    if (Py.isInstance(exc, Py.UnicodeDecodeError)) {
        return true;
    }
    if (Py.isInstance(exc, Py.UnicodeEncodeError)) {
        return true;
    }
    return Py.isInstance(exc, Py.UnicodeTranslateError);
}
/**
 * The "replace" error handler: substitutes "?" for encode errors and U+FFFD for
 * decode and translate errors.
 *
 * @param args positional arguments; the first is the exception instance ("exc")
 * @param kws  keyword names
 * @return a tuple of the replacement text and the position reported by exceptions.getEnd
 * @throws PyException TypeError when the argument is not a Unicode error
 */
public static PyObject replace_errors(PyObject[] args, String[] kws) {
    PyObject exc = new ArgParser("replace_errors", args, kws, "exc").getPyObject(0);

    final int end;
    final PyUnicode substitute;
    if (Py.isInstance(exc, Py.UnicodeEncodeError)) {
        end = exceptions.getEnd(exc, true);
        substitute = new PyUnicode("?");
    } else if (Py.isInstance(exc, Py.UnicodeDecodeError)) {
        end = exceptions.getEnd(exc, false);
        substitute = new PyUnicode(Py_UNICODE_REPLACEMENT_CHARACTER);
    } else if (Py.isInstance(exc, Py.UnicodeTranslateError)) {
        end = exceptions.getEnd(exc, true);
        substitute = new PyUnicode(Py_UNICODE_REPLACEMENT_CHARACTER);
    } else {
        throw wrong_exception_type(exc);
    }
    return new PyTuple(substitute, Py.newInteger(end));
}
/**
 * The "xmlcharrefreplace" error handler; only valid for encode errors.
 *
 * @param args positional arguments; the first is the exception instance ("exc")
 * @param kws  keyword names
 * @return a tuple of the character-reference text for the span start..end of the
 *         exception's object, and the exception's "end" attribute
 * @throws PyException TypeError when the argument is not a UnicodeEncodeError
 */
public static PyObject xmlcharrefreplace_errors(PyObject[] args, String[] kws) {
    PyObject exc = new ArgParser("xmlcharrefreplace_errors", args, kws, "exc").getPyObject(0);
    if (!Py.isInstance(exc, Py.UnicodeEncodeError)) {
        throw wrong_exception_type(exc);
    }
    int from = ((PyInteger) exc.__getattr__("start")).getValue();
    int to = ((PyInteger) exc.__getattr__("end")).getValue();
    String text = exc.__getattr__("object").toString();

    StringBuilder references = new StringBuilder();
    xmlcharrefreplace_internal(from, to, text, references);
    return new PyTuple(Py.java2py(references.toString()), exc.__getattr__("end"));
}
/**
 * Builds XML numeric character references ({@code &#NNN;}) for a span of a string.
 *
 * @param start     index of the first UTF-16 code unit to replace
 * @param end       index just past the last code unit to replace
 * @param toReplace the string containing the span
 * @return a new builder holding one decimal reference per code point in the span
 */
public static StringBuilder xmlcharrefreplace(int start, int end, String toReplace) {
    StringBuilder replacement = new StringBuilder();
    xmlcharrefreplace_internal(start, end, toReplace, replacement);
    return replacement;
}

/**
 * Appends a decimal XML character reference for every code point in
 * {@code object[start:end]} to {@code replacement}.
 */
private static void xmlcharrefreplace_internal(int start, int end, String object, StringBuilder replacement) {
    int i = start;
    while (i < end) {
        char unit = object.charAt(i++);
        int codePoint = unit;
        // A surrogate pair lying wholly inside the span is one character and gets one
        // reference; an unpaired surrogate is emitted under its own value.
        if (Character.isHighSurrogate(unit) && i < end) {
            char next = object.charAt(i);
            if (Character.isLowSurrogate(next)) {
                codePoint = Character.toCodePoint(unit, next);
                i++;
            }
        }
        replacement.append("&#").append(codePoint).append(';');
    }
}
/**
 * Builds (but does not throw) the TypeError reported when an error callback receives
 * an object it cannot handle; the message names the object's class.
 */
private static PyException wrong_exception_type(PyObject exc) {
    PyObject className = exc.__getattr__("__class__").__getattr__("__name__");
    String message = "Don't know how to handle " + className + " in error callback";
    return new PyException(Py.TypeError, message);
}
/** Lower-case hexadecimal digits, indexed by nibble value; used by the backslashreplace output. */
static char hexdigits[] = {
'0', '1', '2', '3', '4', '5', '6', '7',
'8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
};
/**
 * The "backslashreplace" error handler; only valid for encode errors.
 *
 * @param args positional arguments; the first is the exception instance ("exc")
 * @param kws  keyword names
 * @return a tuple of the backslash-escaped text for the span start..end of the
 *         exception's object, and the exception's "end" attribute
 * @throws PyException TypeError when the argument is not a UnicodeEncodeError
 */
public static PyObject backslashreplace_errors(PyObject[] args, String[] kws) {
    PyObject exc = new ArgParser("backslashreplace_errors", args, kws, "exc").getPyObject(0);
    if (!Py.isInstance(exc, Py.UnicodeEncodeError)) {
        throw wrong_exception_type(exc);
    }
    int from = ((PyInteger) exc.__getattr__("start")).getValue();
    int to = ((PyInteger) exc.__getattr__("end")).getValue();
    String text = exc.__getattr__("object").toString();

    StringBuilder escapes = new StringBuilder();
    backslashreplace_internal(from, to, text, escapes);
    return new PyTuple(Py.java2py(escapes.toString()), exc.__getattr__("end"));
}
/**
 * Builds backslash escapes for a span of a string.
 *
 * @param start     start of the span, passed through to backslashreplace_internal
 * @param end       end of the span (exclusive), passed through likewise
 * @param toReplace the string containing the span
 * @return a new builder holding the escapes
 */
public static StringBuilder backslashreplace(int start, int end, String toReplace) {
StringBuilder replacement = new StringBuilder();
backslashreplace_internal(start, end, toReplace, replacement);
return replacement;
}
/**
 * Appends a backslash escape for every code point yielded by a
 * StringSubsequenceIterator over (object, start, end): {@code \Uxxxxxxxx} for values
 * of 0x10000 and above, {@code \\uxxxx} for values of 0x100 and above, otherwise
 * {@code \xNN}. Digits are lower-case hexadecimal.
 */
private static void backslashreplace_internal(int start, int end, String object, StringBuilder replacement) {
    Iterator<Integer> codePoints = new StringSubsequenceIterator(object, start, end, 1);
    while (codePoints.hasNext()) {
        int c = codePoints.next();

        final char marker;
        final int width;
        if (c >= 0x00010000) {
            marker = 'U';
            width = 8;
        } else if (c >= 0x100) {
            marker = 'u';
            width = 4;
        } else {
            marker = 'x';
            width = 2;
        }

        replacement.append('\\');
        replacement.append(marker);
        // Most significant nibble first.
        for (int shift = 4 * (width - 1); shift >= 0; shift -= 4) {
            replacement.append(hexdigits[(c >> shift) & 0xf]);
        }
    }
}
/**
 * Creates the search path, the lookup cache and the error-handler table on first use,
 * registers the built-in error handlers and then triggers the import of the
 * "encodings" module. Later calls return immediately.
 */
private static void registry_init() {
synchronized (INIT_LOCK) {
// searchPath doubles as the "already initialised" marker.
if (searchPath != null) {
return;
}
searchPath = new PyList();
searchCache = new PyStringMap();
errorHandlers = new PyStringMap();
String[] builtinErrorHandlers = new String[]{"strict",
IGNORE,
REPLACE,
XMLCHARREFREPLACE,
BACKSLASHREPLACE
};
// Each handler "<name>" is bound to the static method "<name>_errors" of this class.
// register_error() calls registry_init() again; that nested call returns at once
// because searchPath has already been assigned above.
for (String builtinErrorHandler : builtinErrorHandlers) {
register_error(builtinErrorHandler, Py.newJavaFunc(codecs.class,
builtinErrorHandler + "_errors"));
}
import_encodings();
}
}
/* --- UTF-7 Codec -------------------------------------------------------- */
/* see RFC2152 for details */
/**
 * Classification of the 128 ASCII characters for UTF-7, indexed by character code.
 * Consulted by SPECIAL() to decide whether a character must go into a shift sequence.
 */
public static char utf7_special[] = {
/*
* indicate whether a UTF-7 character is special i.e. cannot be directly
* encoded: 0 - not special 1 - special 2 - whitespace (optional) 3 -
* RFC2152 Set O (optional)
*/
1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1
,
};
/**
 * Tells whether a character is "special" for UTF-7, i.e. is not copied through as-is.
 *
 * @param c        the character to classify
 * @param encodeO  treat class 3 (RFC2152 Set O) characters as special
 * @param encodeWS treat class 2 (whitespace) characters as special
 * @return true for anything above 127 and for class 1 characters, and for classes
 *         2 and 3 when the corresponding flag is set
 */
private static boolean SPECIAL(char c, boolean encodeO, boolean encodeWS){
    if (c > 127) {
        return true;
    }
    switch (utf7_special[c]) {
        case 1:
            return true;
        case 2:
            return encodeWS;
        case 3:
            return encodeO;
        default:
            return false;
    }
}
/** The 64 base64 digits in value order: the character at index n encodes the 6-bit value n. */
private static final String B64_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/** Returns the base64 digit for the low six bits of n; higher bits are masked off. */
private static char B64(int n) {
return B64_CHARS.charAt(n & 0x3f);
}
/** Tells whether c is one of the 64 base64 digits. */
private static boolean B64CHAR(char c) {
    int position = B64_CHARS.indexOf(c);
    return position >= 0;
}
/**
 * Maps a base64 digit to its 6-bit value: 'A'-'Z' to 0-25, 'a'-'z' to 26-51,
 * '0'-'9' to 52-61, '+' to 62 and '/' to 63. The argument is not validated.
 */
private static int UB64(char c) {
    if (c == '+') {
        return 62;
    }
    if (c == '/') {
        return 63;
    }
    if (c >= 'a') {
        return c - 71;
    }
    if (c >= 'A') {
        return c - 65;
    }
    return c + 4;
}
// note that we follow CPython 2.5 exactly here - it does not support surrogates,
// but has to process as-if they are there for replacement purposes
// fortunately no one really cares about utf-7
/**
 * Decodes UTF-7 text (see RFC2152).
 *
 * @param str    the UTF-7 encoded input, one char per byte
 * @param errors error handler name, or {@code null}; passed on to
 *               insertReplacementAndGetResume for every malformed construct
 * @return the decoded text
 */
public static String PyUnicode_DecodeUTF7(String str, String errors) {
    int s = 0;
    int e = str.length();
    boolean inShift = false;
    int bitsInCharsleft = 0;
    long charsleft = 0;
    boolean surrogate = false;
    StringBuilder unicode = new StringBuilder(e);
    while (s < e) {
        // restart:
        char ch = str.charAt(s);
        if (inShift) {
            if ((ch == '-') || !B64CHAR(ch)) {
                // The shift sequence ends here: flush whatever whole units are buffered.
                inShift = false;
                s++;
                while (bitsInCharsleft >= 16) {
                    bitsInCharsleft -= 16;
                    char outCh = (char) ((charsleft >> bitsInCharsleft) & 0xffff);
                    if (surrogate) {
                        s = codecs.insertReplacementAndGetResume(unicode,
                                errors,
                                "utf-7",
                                str,
                                s,
                                s + 1,
                                "code pairs are not supported");
                        surrogate = false;
                    } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {
                        surrogate = true;
                    } else {
                        unicode.append(outCh);
                    }
                }
                if (bitsInCharsleft >= 6) {
                    /*
                     * The shift sequence has a partial character in it. If
                     * bitsleft < 6 then we could just classify it as
                     * padding but that is not the case here
                     */
                    s = insertReplacementAndGetResume(unicode,
                            errors,
                            "utf-7",
                            str,
                            s,
                            s + 1,
                            "partial character in shift sequence");
                }
                /*
                 * According to RFC2152 the remaining bits should be zero.
                 * We choose to signal an error/insert a replacement
                 * character here so indicate the potential of a misencoded
                 * character.
                 */
                if (bitsInCharsleft > 0 && ((charsleft << 5 - bitsInCharsleft) & 0x1f) > 0) {
                    s = insertReplacementAndGetResume(unicode,
                            errors,
                            "utf-7",
                            str,
                            s,
                            s + 1,
                            "non-zero padding bits in shift sequence");
                }
                if (ch == '-') {
                    if ((s < e) && (str.charAt(s) == '-')) {
                        unicode.append('-');
                        inShift = true;
                    }
                } else if (SPECIAL(ch, false, false)) {
                    s = insertReplacementAndGetResume(unicode,
                            errors,
                            "utf-7",
                            str,
                            s,
                            s + 1,
                            "unexpected special character");
                } else {
                    unicode.append(ch);
                }
            } else {
                // Another base64 digit: buffer its six bits and emit any complete units.
                charsleft = (charsleft << 6) | UB64(ch);
                bitsInCharsleft += 6;
                s++;
                while (bitsInCharsleft >= 16) {
                    bitsInCharsleft -= 16;
                    char outCh = (char) ((charsleft >> bitsInCharsleft) & 0xffff);
                    if (surrogate) {
                        s = codecs.insertReplacementAndGetResume(unicode,
                                errors,
                                "utf-7",
                                str,
                                s,
                                s + 1,
                                "code pairs are not supported");
                        // Clear the flag exactly as the end-of-sequence loop above does;
                        // otherwise every later unit of this shift sequence would be
                        // reported as an error as well.
                        surrogate = false;
                    } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {
                        surrogate = true;
                    } else {
                        unicode.append(outCh);
                    }
                }
            }
        } else if (ch == '+') {
            s++;
            if (s < e && str.charAt(s) == '-') {
                // "+-" is the escape for a literal plus sign.
                s++;
                unicode.append('+');
            } else {
                inShift = true;
                bitsInCharsleft = 0;
            }
        } else if (SPECIAL(ch, false, false)) {
            s = insertReplacementAndGetResume(unicode,
                    errors,
                    "utf-7",
                    str,
                    s,
                    s + 1,
                    "unexpected special character");
        } else {
            unicode.append(ch);
            s++;
        }
        if (inShift && s == e) {
            s = insertReplacementAndGetResume(unicode,
                    errors,
                    "utf-7",
                    str,
                    s,
                    s,
                    "unterminated shift sequence");
        }
    }
    return unicode.toString();
}
/**
 * Encodes text as UTF-7 (see RFC2152).
 *
 * @param str              the text to encode, processed one UTF-16 code unit at a time
 * @param encodeSetO       passed to SPECIAL(): also shift-encode RFC2152 Set O characters
 * @param encodeWhiteSpace passed to SPECIAL(): also shift-encode whitespace
 * @param errors           not consulted by this method
 * @return the UTF-7 text; the empty string for empty input
 */
public static String PyUnicode_EncodeUTF7(String str,
boolean encodeSetO,
boolean encodeWhiteSpace,
String errors) {
int size = str.length();
if (size == 0) {
return "";
}
boolean inShift = false;
// charsleft buffers bits not yet written; bitsleft says how many of them are pending.
int bitsleft = 0;
int charsleft = 0;
StringBuilder v = new StringBuilder();
for (int i = 0; i < size; ++i) {
char ch = str.charAt(i);
if (!inShift) {
if (ch == '+') {
// A literal plus sign is written as "+-".
v.append('+');
v.append('-');
} else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
// Open a shift sequence and emit as many whole base64 digits as are available.
charsleft = ch;
bitsleft = 16;
v.append('+');
while (bitsleft >= 6) {
v.append(B64(charsleft >> (bitsleft - 6)));
bitsleft -= 6;
}
inShift = bitsleft > 0;
} else {
v.append(ch);
}
} else {
if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
// Flush the pending bits, padded on the right to a full base64 digit.
v.append(B64(charsleft << (6 - bitsleft)));
charsleft = 0;
bitsleft = 0;
/* Characters not in the BASE64 set implicitly unshift the sequence
so no '-' is required, except if the character is itself a '-' */
if (B64CHAR(ch) || ch == '-') {
v.append('-');
}
inShift = false;
v.append(ch);
} else {
bitsleft += 16;
charsleft = (charsleft << 16) | ch;
while (bitsleft >= 6) {
v.append(B64(charsleft >> (bitsleft - 6)));
bitsleft -= 6;
}
/* If the next character is special then we don't need to terminate
the shift sequence. If the next character is not a BASE64 character
or '-' then the shift sequence will be terminated implicitly and we
don't have to insert a '-'. */
if (bitsleft == 0) {
if (i + 1 < size) {
char ch2 = str.charAt(i + 1);
if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
} else if (B64CHAR(ch2) || ch2 == '-') {
v.append('-');
inShift = false;
} else {
inShift = false;
}
} else {
v.append('-');
inShift = false;
}
}
}
}
}
// Input ended inside a shift sequence with bits still pending: flush and terminate.
if (bitsleft > 0) {
v.append(B64(charsleft << (6 - bitsleft)));
v.append('-');
}
return v.toString();
}
/* --- UTF-8 Codec ---------------------------------------------------- */
/** Sequence length implied by a UTF-8 lead byte, indexed by the byte value 0..255. */
private static byte utf8_code_length[] = {
/* Map UTF-8 encoded prefix byte to sequence length. zero means
illegal prefix. see RFC 2279 for details */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
// TODO: need to modify to use a codepoint approach (which is almost the case now,
// since ch is an int in PyUnicode_DecodeUTF8Stateful)
/**
 * Decodes a complete UTF-8 string; equivalent to PyUnicode_DecodeUTF8Stateful with
 * no "consumed" array, so a truncated trailing sequence is treated as an error.
 *
 * @param str    the UTF-8 encoded input, one char per byte
 * @param errors error handler name, or {@code null}
 * @return the decoded text
 */
public static String PyUnicode_DecodeUTF8(String str, String errors) {
return PyUnicode_DecodeUTF8Stateful(str, errors, null);
}
/**
 * Decodes UTF-8, optionally stopping before an incomplete trailing sequence.
 *
 * @param str      the UTF-8 encoded input, one char per byte
 * @param errors   error handler name, or {@code null}; passed on to
 *                 insertReplacementAndGetResume for every malformed sequence
 * @param consumed when non-null, decoding stops quietly at a sequence that runs past
 *                 the end of the input, and element 0 receives the index reached
 * @return the decoded text
 */
public static String PyUnicode_DecodeUTF8Stateful(String str, String errors, int[] consumed) {
int size = str.length();
StringBuilder unicode = new StringBuilder(size);
/* Unpack UTF-8 encoded data */
int i;
for (i = 0; i < size;) {
int ch = str.charAt(i);
// Plain ASCII is copied through unchanged.
if (ch < 0x80) {
unicode.append((char) ch);
i++;
continue;
}
// Values above 0xFF cannot index the length table and are reported as errors.
if (ch > 0xFF) {
i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 1, "ordinal not in range(255)");
continue;
}
int n = utf8_code_length[ch];
if (i + n > size) {
// The sequence is cut off by the end of the input.
if (consumed != null) {
break;
}
i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 1, "unexpected end of data");
continue;
}
// Every error path below 'continue's with i set by the handler; only a
// successfully decoded sequence falls out of the switch to 'i += n'.
switch (n) {
case 0:
i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 1, "unexpected code byte");
continue;
case 1:
i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 1, "internal error");
continue;
case 2:
char ch1 = str.charAt(i + 1);
if ((ch1 & 0xc0) != 0x80) {
i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 2, "invalid data");
continue;
}
ch = ((ch & 0x1f) << 6) + (ch1 & 0x3f);
// Reject values that would have fitted in a single byte.
if (ch < 0x80) {
i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 2, "illegal encoding");
continue;
} else {
unicode.appendCodePoint(ch);
}
break;
case 3:
ch1 = str.charAt(i + 1);
char ch2 = str.charAt(i + 2);
if ((ch1 & 0xc0) != 0x80 || (ch2 & 0xc0) != 0x80) {
i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 3, "invalid data");
continue;
}
ch = ((ch & 0x0f) << 12) + ((ch1 & 0x3f) << 6) + (ch2 & 0x3f);
// Reject values below 0x800 and anything in the surrogate range.
if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 3, "illegal encoding");
continue;
} else {
unicode.appendCodePoint(ch);
}
break;
case 4:
ch1 = str.charAt(i + 1);
ch2 = str.charAt(i + 2);
char ch3 = str.charAt(i + 3);
if ((ch1 & 0xc0) != 0x80 ||
(ch2 & 0xc0) != 0x80 ||
(ch3 & 0xc0) != 0x80) {
i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 4, "invalid data");
continue;
}
ch = ((ch & 0x7) << 18) + ((ch1 & 0x3f) << 12) +
((ch2 & 0x3f) << 6) + (ch3 & 0x3f);
/* validate and convert to UTF-16 */
if ((ch < 0x10000) || /* minimum value allowed for 4
byte encoding */
(ch > 0x10ffff)) { /* maximum value allowed for
UTF-16 */
i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 4, "illegal encoding");
continue;
}
unicode.appendCodePoint(ch);
break;
default:
// TODO: support
/* Other sizes are only needed for UCS-4 */
i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + n, "unsupported Unicode code range");
continue;
}
i += n;
}
if (consumed != null) {
consumed[0] = i;
}
return unicode.toString();
}
/**
 * Encodes text as UTF-8 using the JDK "UTF-8" charset and converts the resulting
 * bytes to a String with StringUtil.fromBytes.
 *
 * @param str    the text to encode
 * @param errors not consulted by this method
 * @return the encoded bytes as a String
 */
public static String PyUnicode_EncodeUTF8(String str, String errors) {
return StringUtil.fromBytes(Charset.forName("UTF-8").encode(str));
}
/**
 * Decodes ASCII: the first {@code size} chars of {@code str} are accepted while below
 * 128; anything else goes through the error handler.
 *
 * @param str    the input, one char per byte
 * @param size   number of chars of {@code str} to process
 * @param errors error handler name, or {@code null}
 * @return the decoded text
 */
public static String PyUnicode_DecodeASCII(String str, int size, String errors) {
return PyUnicode_DecodeIntLimited(str, size, errors, "ascii", 128);
}
/**
 * Decodes Latin-1: the first {@code size} chars of {@code str} are accepted while
 * below 256; anything else goes through the error handler.
 *
 * @param str    the input, one char per byte
 * @param size   number of chars of {@code str} to process
 * @param errors error handler name, or {@code null}
 * @return the decoded text
 */
public static String PyUnicode_DecodeLatin1(String str, int size, String errors) {
return PyUnicode_DecodeIntLimited(str, size, errors, "latin-1", 256);
}
/**
 * Shared decoder for encodings whose valid values are simply "below a limit".
 *
 * @param str      the input
 * @param size     number of chars of {@code str} to process
 * @param errors   error handler name, or {@code null}
 * @param encoding encoding name reported to the error handler
 * @param limit    first value that is out of range
 * @return the decoded text
 */
private static String PyUnicode_DecodeIntLimited(String str, int size, String errors, String encoding, int limit) {
    StringBuilder decoded = new StringBuilder(size);
    String reason = "ordinal not in range(" + limit + ")";
    int pos = 0;
    while (pos < size) {
        char unit = str.charAt(pos);
        if (unit >= limit) {
            // The handler decides where decoding resumes.
            pos = insertReplacementAndGetResume(decoded, errors,
                    encoding,
                    str,
                    pos,
                    pos + 1,
                    reason);
        } else {
            decoded.append(unit);
            pos++;
        }
    }
    return decoded.toString();
}
/**
 * Encodes as ASCII: chars below 128 are copied, anything else goes through the
 * error handler.
 *
 * @param str    the text to encode
 * @param size   number of chars of {@code str} to process
 * @param errors error handler name, or {@code null}
 * @return the encoded text
 */
public static String PyUnicode_EncodeASCII(String str, int size,
String errors) {
return PyUnicode_EncodeIntLimited(str, size, errors, "ascii", 128);
}
/**
 * Encodes as Latin-1: chars below 256 are copied, anything else goes through the
 * error handler.
 *
 * @param str    the text to encode
 * @param size   number of chars of {@code str} to process
 * @param errors error handler name, or {@code null}
 * @return the encoded text
 */
public static String PyUnicode_EncodeLatin1(String str, int size,
String errors) {
return PyUnicode_EncodeIntLimited(str, size, errors, "latin-1", 256);
}
/**
 * Shared encoder for encodings whose valid values are simply "below a limit".
 * Consecutive out-of-range chars are handled as one run. The four built-in handler
 * names are served inline; any other handler is looked up and called, and its
 * replacement text must itself be within range.
 *
 * @param str      the text to encode
 * @param size     number of chars of {@code str} to process
 * @param errors   error handler name, or {@code null}
 * @param encoding encoding name reported in errors
 * @param limit    first value that is out of range
 * @return the encoded text
 */
private static String PyUnicode_EncodeIntLimited(String str, int size,
        String errors, String encoding, int limit) {
    String reason = "ordinal not in range(" + limit + ")";
    StringBuilder out = new StringBuilder(size);
    int pos = 0;
    while (pos < size) {
        char ch = str.charAt(pos);
        if (ch < limit) {
            out.append(ch);
            pos++;
            continue;
        }

        // Find the end of the run of unencodable chars starting here.
        int runEnd = pos + 1;
        while (runEnd < size && str.charAt(runEnd) >= limit) {
            runEnd++;
        }

        if (IGNORE.equals(errors)) {
            pos = runEnd;
            continue;
        }
        if (REPLACE.equals(errors)) {
            for (int remaining = runEnd - pos; remaining > 0; remaining--) {
                out.append('?');
            }
            pos = runEnd;
            continue;
        }
        if (XMLCHARREFREPLACE.equals(errors)) {
            out.append(xmlcharrefreplace(pos, runEnd, str));
            pos = runEnd;
            continue;
        }
        if (BACKSLASHREPLACE.equals(errors)) {
            out.append(backslashreplace(pos, runEnd, str));
            pos = runEnd;
            continue;
        }

        PyObject replacement = encoding_error(errors,
                encoding,
                str,
                pos,
                runEnd,
                reason);
        String replStr = replacement.__getitem__(0).toString();
        for (int j = 0; j < replStr.length(); j++) {
            if (replStr.charAt(j) >= limit) {
                throw Py.UnicodeEncodeError(encoding, str, pos + j, pos + j + 1, reason);
            }
        }
        out.append(replStr);
        pos = calcNewPosition(size, replacement);
    }
    return out.toString();
}
/**
 * Extracts the resume position from an error handler's result tuple.
 *
 * @param size       length of the string being processed
 * @param errorTuple the handler's result; element 1 is the new position
 * @return the position, with negative values counted from the end
 * @throws PyException IndexError when the adjusted position lies outside 0..size
 */
public static int calcNewPosition(int size, PyObject errorTuple) {
    int requested = ((PyInteger) errorTuple.__getitem__(1)).getValue();
    int position = (requested < 0) ? size + requested : requested;
    boolean inBounds = position >= 0 && position <= size;
    if (!inBounds) {
        throw Py.IndexError(position + " out of bounds of encoded string");
    }
    return position;
}
/* --- RawUnicodeEscape Codec ---------------------------------------- */
/** Upper-case hexadecimal digits, indexed by nibble value; used by the raw-unicode-escape encoder. */
private static char[] hexdigit = "0123456789ABCDEF".toCharArray();
// The modified flag is used by cPickle.
/**
 * Encodes text with the raw-unicode-escape codec: supplementary code points become
 * {@code \Uxxxxxxxx}, other code points of 256 and above become {@code \\uxxxx},
 * everything else is copied.
 *
 * @param str     the text to encode
 * @param errors  not consulted by this method
 * @param modifed when true, backslash and newline are also written as {@code \\uxxxx}
 * @return the escaped text
 */
public static String PyUnicode_EncodeRawUnicodeEscape(String str, String errors,
        boolean modifed) {
    StringBuilder escaped = new StringBuilder(str.length());
    Iterator<Integer> codePoints = new PyUnicode(str).newSubsequenceIterator();
    while (codePoints.hasNext()) {
        int cp = codePoints.next();
        boolean supplementary = cp >= Character.MIN_SUPPLEMENTARY_CODE_POINT;
        boolean sixteenBit = cp >= 256 || (modifed && (cp == '\\' || cp == '\n'));
        if (!supplementary && !sixteenBit) {
            escaped.append((char) cp);
            continue;
        }
        // 32-bit characters map to '\\Uxxxxxxxx', 16-bit characters to '\\uxxxx'.
        int width = supplementary ? 8 : 4;
        escaped.append(supplementary ? "\\U" : "\\u");
        for (int shift = 4 * (width - 1); shift >= 0; shift -= 4) {
            escaped.append(hexdigit[(cp >> shift) & 0xF]);
        }
    }
    return escaped.toString();
}
/**
 * Decodes text written with the raw-unicode-escape codec.
 *
 * @param str    the escaped input
 * @param errors error handler name, or {@code null}; used for truncated escapes and
 *               for escapes whose value is not a valid code point
 * @return the decoded text
 */
public static String PyUnicode_DecodeRawUnicodeEscape(String str, String errors) {
    int size = str.length();
    StringBuilder v = new StringBuilder(size);
    for (int i = 0; i < size;) {
        char ch = str.charAt(i);
        // Non-escape characters are interpreted as Unicode ordinals
        if (ch != '\\') {
            v.append(ch);
            i++;
            continue;
        }
        // \\u-escapes are only interpreted if the number of leading backslashes is
        // odd
        int bs = i;
        while (i < size) {
            ch = str.charAt(i);
            if (ch != '\\') {
                break;
            }
            v.append(ch);
            i++;
        }
        if (((i - bs) & 1) == 0 || i >= size || (ch != 'u' && ch != 'U')) {
            continue;
        }
        // Drop the backslash that introduces the escape; it was copied by the loop above.
        v.setLength(v.length() - 1);
        int count = ch == 'u' ? 4 : 8;
        i++;
        // \\uXXXX with 4 hex digits, \Uxxxxxxxx with 8
        int codePoint = 0, asDigit = -1;
        for (int j = 0; j < count; i++, j++) {
            if (i == size) {
                // EOF in a truncated escape
                asDigit = -1;
                break;
            }
            ch = str.charAt(i);
            asDigit = Character.digit(ch, 16);
            if (asDigit == -1) {
                break;
            }
            codePoint = ((codePoint << 4) & ~0xF) + asDigit;
        }
        if (asDigit == -1) {
            i = codecs.insertReplacementAndGetResume(v, errors, "rawunicodeescape", str, bs, i,
                    "truncated \\uXXXX");
        } else if (!Character.isValidCodePoint(codePoint)) {
            // Eight hex digits can spell a value above U+10FFFF (or one that overflows
            // to a negative int). appendCodePoint would throw IllegalArgumentException
            // for those, so route them through the error handler instead.
            i = codecs.insertReplacementAndGetResume(v, errors, "rawunicodeescape", str, bs, i,
                    "\\Uxxxxxxxx out of range");
        } else {
            v.appendCodePoint(codePoint);
        }
    }
    return v.toString();
}
/** Parameters and helper functions of the punycode algorithm (RFC 3492). */
private static class Punycode {
    // specified by punycode, http://www.ietf.org/rfc/rfc3492.txt
    private static final int BASE = 36;
    private static final int TMIN = 1;
    private static final int TMAX = 26;
    private static final int SKEW = 38;
    private static final int DAMP = 700;
    private static final int INITIAL_BIAS = 72;
    private static final int INITIAL_N = 128;
    private static final int BASIC = 0x80;

    private Punycode() {
    }

    /**
     * Computes the next bias from the delta just processed.
     *
     * @param delta     the delta that was encoded or decoded
     * @param numpoints number of code points handled so far, including this one
     * @param firsttime true for the very first delta, which is damped by DAMP
     *                  instead of being halved
     */
    private static int adapt(int delta, int numpoints, boolean firsttime) {
        int scaled = firsttime ? delta / DAMP : delta >> 1;
        scaled += scaled / numpoints;
        final int threshold = ((BASE - TMIN) * TMAX) / 2;
        int k = 0;
        for (; scaled > threshold; k += BASE) {
            scaled /= BASE - TMIN;
        }
        return k + (((BASE - TMIN + 1) * scaled) / (scaled + SKEW));
    }

    /** Tells whether a code point lies below BASIC (0x80). */
    private static boolean isBasic(int codePoint) {
        return codePoint < BASIC;
    }
}
/**
 * Encodes a unicode string with punycode, following the encoding procedure of
 * RFC 3492 section 6.3.
 *
 * @param input  the text to encode
 * @param errors not consulted by this method
 * @return the punycode text: the basic code points, a '-' delimiter if there were
 *         any, then the generalized variable-length integers
 * @throws PyException UnicodeEncodeError with reason "overflow" when a delta does not
 *         fit in an int
 */
public static String PyUnicode_EncodePunycode(PyUnicode input,
        String errors) {
    int n = Punycode.INITIAL_N;
    int delta = 0;
    int bias = Punycode.INITIAL_BIAS;
    int b = 0;
    final StringBuilder buffer = new StringBuilder();
    // Basic code points are copied verbatim, in order.
    for (Iterator<Integer> iter = input.iterator(); iter.hasNext();) {
        int c = iter.next();
        if (Punycode.isBasic(c)) {
            buffer.appendCodePoint(c);
            b++;
        }
    }
    if (b > 0) {
        buffer.appendCodePoint('-');
    }
    int h = b;
    int size = input.getCodePointCount();
    while (h < size) {
        // m: the smallest code point >= n still to be handled. The comparison must
        // include n itself, otherwise a code point equal to n is never encoded.
        int m = Integer.MAX_VALUE;
        int i = 0;
        int codePointIndex = 0;
        for (Iterator<Integer> iter = input.iterator(); iter.hasNext(); i++) {
            int c = iter.next();
            if (c >= n && c < m) {
                m = c;
                codePointIndex = i;
            }
        }
        if (m == Integer.MAX_VALUE) {
            // Defensive: nothing left to encode although h < size.
            break;
        }
        // The guard has to be evaluated in long arithmetic to be able to see an overflow.
        long guard_delta = delta + ((long) (m - n)) * (h + 1);
        if (guard_delta > Integer.MAX_VALUE) {
            throw Py.UnicodeEncodeError("punycode", input.getString(), codePointIndex, codePointIndex + 1, "overflow");
        }
        delta = (int) guard_delta;
        n = m;
        i = 0;
        for (Iterator<Integer> iter = input.iterator(); iter.hasNext(); i++) {
            int c = iter.next();
            if (c < n) {
                if (delta == Integer.MAX_VALUE) {
                    throw Py.UnicodeEncodeError("punycode", input.getString(), i, i + 1, "overflow");
                }
                delta++;
            }
            if (c == n) {
                // Emit delta as a generalized variable-length integer.
                int q = delta;
                for (int k = Punycode.BASE;; k += Punycode.BASE) {
                    int t = k <= bias ? Punycode.TMIN : (k >= bias + Punycode.TMAX ? Punycode.TMAX : k - bias);
                    if (q < t) {
                        break;
                    }
                    buffer.append(punycodeDigitChar(t + ((q - t) % (Punycode.BASE - t))));
                    q = (q - t) / (Punycode.BASE - t);
                }
                buffer.append(punycodeDigitChar(q));
                bias = Punycode.adapt(delta, h + 1, h == b);
                delta = 0;
                h++;
            }
        }
        delta++;
        n++;
    }
    return buffer.toString();
}

/** Maps a punycode digit value 0..35 to its character: 0..25 to 'a'..'z', 26..35 to '0'..'9'. */
private static char punycodeDigitChar(int digit) {
    return (char) (digit < 26 ? 'a' + digit : '0' + (digit - 26));
}
/**
 * Decodes punycode text, following the decoding procedure of RFC 3492 section 6.2.
 * Everything before the last '-' is taken as the literal basic code points; the
 * rest is a sequence of generalized variable-length integers.
 *
 * @param input  the punycode text
 * @param errors not consulted by this method; malformed input always raises
 * @return the decoded unicode string
 * @throws PyException UnicodeDecodeError for a non-basic code point in the literal
 *         part, an invalid or missing digit, an arithmetic overflow, or a result
 *         beyond U+10FFFF
 */
public static PyUnicode PyUnicode_DecodePunycode(String input, String errors) {
    int input_size = input.length();
    ArrayList<Integer> ucs4 = new ArrayList<Integer>(input_size);

    // Copy the literal part: everything before the LAST delimiter (none if absent).
    int lastDelimiter = input.lastIndexOf('-');
    for (int p = 0; p < lastDelimiter; p++) {
        int c = input.charAt(p);
        if (!Punycode.isBasic(c)) {
            throw Py.UnicodeDecodeError("punycode", input, p, p + 1, "not basic");
        }
        ucs4.add(c);
    }
    // Skip the delimiter itself; with no delimiter this starts at index 0.
    int j = lastDelimiter + 1;

    int n = Punycode.INITIAL_N;
    int i = 0;
    int bias = Punycode.INITIAL_BIAS;
    while (j < input_size) {
        int integerStart = j;
        int old_i = i;
        long w = 1;
        for (int k = Punycode.BASE;; k += Punycode.BASE) {
            if (j >= input_size) {
                throw Py.UnicodeDecodeError("punycode", input, integerStart, input_size,
                        "incomplete punycode string");
            }
            int digit = punycodeDigitValue(input.charAt(j));
            if (digit < 0) {
                throw Py.UnicodeDecodeError("punycode", input, j, j + 1,
                        "invalid extended code point");
            }
            j++;
            long guard_i = i + digit * w;
            if (guard_i > Integer.MAX_VALUE) {
                throw Py.UnicodeDecodeError("punycode", input, j - 1, j, "overflow");
            }
            i = (int) guard_i;
            int t = k <= bias ? Punycode.TMIN : (k >= bias + Punycode.TMAX ? Punycode.TMAX : k - bias);
            if (digit < t) {
                break;
            }
            w = w * (Punycode.BASE - t);
            if (w > Integer.MAX_VALUE) {
                throw Py.UnicodeDecodeError("punycode", input, j - 1, j, "overflow");
            }
        }
        int outputLength = ucs4.size() + 1;
        bias = Punycode.adapt(i - old_i, outputLength, old_i == 0);
        long guard_n = (long) n + i / outputLength;
        if (guard_n > Character.MAX_CODE_POINT) {
            throw Py.UnicodeDecodeError("punycode", input, integerStart, j, "invalid character");
        }
        n = (int) guard_n;
        i %= outputLength;
        ucs4.add(i, n);
        // The next insertion position is counted from just after this one.
        i++;
    }
    return new PyUnicode(ucs4);
}

/**
 * Maps a punycode digit character to its value: letters of either case to 0..25,
 * '0'..'9' to 26..35; returns -1 for anything else.
 */
private static int punycodeDigitValue(char c) {
    if (c >= 'a' && c <= 'z') {
        return c - 'a';
    }
    if (c >= 'A' && c <= 'Z') {
        return c - 'A';
    }
    if (c >= '0' && c <= '9') {
        return c - '0' + 26;
    }
    return -1;
}
/**
 * IDNA encoding is not implemented: this method always throws
 * UnsupportedOperationException. The comments in the body list the steps an
 * implementation would have to perform.
 *
 * @param input  the text to encode (unused)
 * @param errors error handler name (unused)
 * @return never returns normally
 */
public static String PyUnicode_EncodeIDNA(PyUnicode input,
String errors) {
throw new UnsupportedOperationException();
// 1. If the sequence contains any code points outside the ASCII range
// (0..7F) then proceed to step 2, otherwise skip to step 3.
//
// 2. Perform the steps specified in [NAMEPREP] and fail if there is an
// error. The AllowUnassigned flag is used in [NAMEPREP].
// this basically entails changing out space, etc.
//
// 3. If the UseSTD3ASCIIRules flag is set, then perform these checks:
//
// (a) Verify the absence of non-LDH ASCII code points; that is, the
// absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
//
// (b) Verify the absence of leading and trailing hyphen-minus; that
// is, the absence of U+002D at the beginning and end of the
// sequence.
//
// 4. If the sequence contains any code points outside the ASCII range
// (0..7F) then proceed to step 5, otherwise skip to step 8.
//
// 5. Verify that the sequence does NOT begin with the ACE prefix.
//
// 6. Encode the sequence using the encoding algorithm in [PUNYCODE] and
// fail if there is an error.
//
// 7. Prepend the ACE prefix.
//
// 8. Verify that the number of code points is in the range 1 to 63
// inclusive.
}
/**
 * IDNA decoding is not implemented: this method always throws
 * UnsupportedOperationException.
 *
 * @param input  the text to decode (unused)
 * @param errors error handler name (unused)
 * @return never returns normally
 */
public static PyUnicode PyUnicode_DecodeIDNA(String input, String errors) {
throw new UnsupportedOperationException();
}
/* --- Utility methods -------------------------------------------- */
/**
 * Reports an encoding failure to the named error handler and returns its answer.
 *
 * @param errors   error handler name; {@code null} is treated as "strict" by lookup_error
 * @param encoding name of the encoding in use
 * @param toEncode the text being encoded
 * @param start    start of the offending span
 * @param end      end of the offending span
 * @param reason   description of the problem
 * @return the handler's (replacement, new position) tuple, already validated
 */
public static PyObject encoding_error(String errors,
        String encoding,
        String toEncode,
        int start,
        int end,
        String reason) {
    PyObject handler = lookup_error(errors);
    PyException failure = Py.UnicodeEncodeError(encoding, toEncode, start, end, reason);
    failure.normalize();
    PyObject outcome = handler.__call__(new PyObject[] {failure.value});
    checkErrorHandlerReturn(errors, outcome);
    return outcome;
}
/**
 * Handles a decoding error: appends whatever the error policy prescribes to the
 * partial result and says where decoding should continue. "ignore" and "replace"
 * are served inline; every other policy is delegated to the registered handler.
 *
 * @param partialDecode the text decoded so far; replacement text is appended to it
 * @param errors        error handler name, or {@code null}
 * @param encoding      name of the encoding in use
 * @param toDecode      the input being decoded
 * @param start         start of the offending span
 * @param end           end of the offending span
 * @param reason        description of the problem
 * @return the index in {@code toDecode} at which to resume
 */
public static int insertReplacementAndGetResume(StringBuilder partialDecode,
        String errors,
        String encoding,
        String toDecode,
        int start,
        int end,
        String reason) {
    if (IGNORE.equals(errors)) {
        return end;
    }
    if (REPLACE.equals(errors)) {
        // One replacement character per offending position.
        for (int position = start; position < end; position++) {
            partialDecode.appendCodePoint(Py_UNICODE_REPLACEMENT_CHARACTER);
        }
        return end;
    }
    PyObject outcome = decoding_error(errors, encoding, toDecode, start, end, reason);
    checkErrorHandlerReturn(errors, outcome);
    partialDecode.append(outcome.__getitem__(0).toString());
    return calcNewPosition(toDecode.length(), outcome);
}
/**
 * Reports a decoding failure to the named error handler and returns its answer
 * without validating it.
 *
 * @param errors   error handler name; {@code null} is treated as "strict" by lookup_error
 * @param encoding name of the encoding in use
 * @param toEncode the input being decoded
 * @param start    start of the offending span
 * @param end      end of the offending span
 * @param reason   description of the problem
 * @return whatever the handler returned
 */
public static PyObject decoding_error(String errors,
        String encoding,
        String toEncode,
        int start,
        int end,
        String reason) {
    PyObject handler = lookup_error(errors);
    PyException failure = Py.UnicodeDecodeError(encoding, toEncode, start, end, reason);
    failure.normalize();
    PyObject[] handlerArgs = {failure.value};
    return handler.__call__(handlerArgs);
}
/**
 * Verifies that an error handler returned a 2-tuple of (string, integer).
 *
 * @param errors      handler name, used in the error message
 * @param replacement the handler's return value
 * @throws PyException TypeError when the value has any other shape
 */
private static void checkErrorHandlerReturn(String errors,
        PyObject replacement) {
    boolean wellFormed = replacement instanceof PyTuple
            && replacement.__len__() == 2
            && replacement.__getitem__(0) instanceof PyBaseString
            && replacement.__getitem__(1) instanceof PyInteger;
    if (wellFormed) {
        return;
    }
    throw new PyException(Py.TypeError, "error_handler " + errors + " must return a tuple of (replacement, new position)");
}
}
/**
 * Iterates over the code points of a String between two code point indices,
 * optionally skipping with a step. Surrogate pairs are combined into one code point;
 * an unpaired surrogate is yielded as itself.
 */
class StringSubsequenceIterator implements Iterator<Integer> {

    private final String s;
    // current: code point index of the next element; k: char index of the next element.
    private int current, k, start, stop, step;

    /**
     * @param s     the string to iterate over
     * @param start code point index of the first element
     * @param stop  code point index just past the last element; clamped to the
     *              number of code points in {@code s}
     * @param step  distance in code points between successive elements
     */
    StringSubsequenceIterator(String s, int start, int stop, int step) {
        this.s = s;
        k = 0;
        current = start;
        this.start = start;
        this.stop = stop;
        this.step = step;

        // this bounds checking is necessary to convert between use of code units elsewhere, and codepoints here
        // it would be nice if it were unnecessary!
        int count = getCodePointCount(s);
        if (start >= count) {
            this.stop = -1;
        } else if (stop >= count) {
            this.stop = count;
        }

        // Advance to the start position, but never read past the end of the string
        // (start may exceed the number of code points).
        for (int i = 0; i < start && k < s.length(); i++) {
            nextCodePoint();
        }
    }

    /** Iterates over every code point of {@code s}. */
    StringSubsequenceIterator(String s) {
        this(s, 0, getCodePointCount(s), 1);
    }

    private static int getCodePointCount(String s) {
        return s.codePointCount(0, s.length());
    }

    public boolean hasNext() {
        return current < stop;
    }

    /**
     * Returns the next code point and advances by {@code step} code points.
     *
     * @throws java.util.NoSuchElementException when the iteration is exhausted
     */
    public Integer next() {
        if (!hasNext()) {
            throw new java.util.NoSuchElementException();
        }
        int codePoint = nextCodePoint();
        current += 1;
        for (int j = 1; j < step && hasNext(); j++) {
            nextCodePoint();
            current += 1;
        }
        return codePoint;
    }

    /**
     * Reads the code point at char index k and moves k past it. Uses the JDK's
     * pairing rules so the result agrees with codePointCount(): a high surrogate is
     * only combined with a following low surrogate, and a lone surrogate at the end
     * of the string is returned as-is instead of reading out of bounds.
     */
    private int nextCodePoint() {
        int codePoint = s.codePointAt(k);
        k += Character.charCount(codePoint);
        return codePoint;
    }

    public void remove() {
        throw new UnsupportedOperationException("Not supported on String objects (immutable)");
    }
}