codecs.java example

Explorer
jython-on-android-master
- src
  - org
/*
 * Copyright 2000 Finn Bock
 *
 * This program contains material copyrighted by:
 * Copyright (c) Corporation for National Research Initiatives.
 * Originally written by Marc-Andre Lemburg (mal@lemburg.com).
 */
package org.python.core;

import java.nio.charset.Charset;

import java.util.ArrayList;
import java.util.Iterator;

import org.python.core.util.StringUtil;

/**
 * Contains the implementation of the builtin codecs.
 * @since Jython 2.0
 */
public class codecs {

    /** Name under which the {@code backslashreplace_errors} handler is registered. */
    public static final String BACKSLASHREPLACE = "backslashreplace";
    /** Name under which the {@code ignore_errors} handler is registered. */
    public static final String IGNORE = "ignore";
    /** Name under which the {@code replace_errors} handler is registered. */
    public static final String REPLACE = "replace";
    /** Name under which the {@code xmlcharrefreplace_errors} handler is registered. */
    public static final String XMLCHARREFREPLACE = "xmlcharrefreplace";
    /** U+FFFD, substituted by {@code replace_errors} for decode and translate errors. */
    private static char Py_UNICODE_REPLACEMENT_CHARACTER = 0xFFFD;
    /** Codec search functions, in registration order; created by registry_init. */
    private static PyList searchPath;
    /** Maps a normalized encoding name to the 4-tuple a search function returned for it. */
    private static PyStringMap searchCache;
    /** Maps an (interned) error handler name to its callable. */
    private static PyStringMap errorHandlers;
    /** Used to synchronize registry_init. */
    private static final Object INIT_LOCK = new Object();
    /** Encoding used by decode/encode when the caller passes a null encoding. */
    private static String default_encoding = "ascii";

    /**
     * Reports the encoding that is applied when a caller supplies no encoding name.
     *
     * @return the current default encoding name
     */
    public static String getDefaultEncoding() {
        final String current = default_encoding;
        return current;
    }

    /**
     * Replaces the default encoding.
     *
     * @param encoding the new default; it must be resolvable by {@code lookup}, otherwise the
     *        exception raised by the lookup propagates and the default is left unchanged
     */
    public static void setDefaultEncoding(String encoding) {
        // The codec itself is not needed here; the lookup only validates the name.
        final PyTuple validated = lookup(encoding);
        default_encoding = encoding;
    }

    /**
     * Finds the error handler registered under a name.
     *
     * @param handlerName the handler name; {@code null} selects "strict"
     * @return the registered handler
     * @throws PyException a LookupError when no handler has that name
     */
    public static PyObject lookup_error(String handlerName) {
        registry_init();
        final String name = (handlerName == null) ? "strict" : handlerName;
        // Keys are stored interned (see register_error), so intern before probing.
        final PyObject found = errorHandlers.__finditem__(name.intern());
        if (found != null) {
            return found;
        }
        throw new PyException(Py.LookupError,
                "unknown error handler name '" + name + "'");
    }

    /**
     * Registers (or replaces) an error handler.
     *
     * @param name the name the handler is stored under
     * @param error the handler; must be callable
     * @throws PyException a TypeError when {@code error} is not callable
     */
    public static void register_error(String name, PyObject error) {
        registry_init();
        if (error.isCallable()) {
            errorHandlers.__setitem__(name.intern(), error);
            return;
        }
        throw Py.TypeError("argument must be callable");
    }

    /**
     * Appends a codec search function to the search path.
     *
     * @param search_function the function to add; must be callable
     * @throws PyException a TypeError when {@code search_function} is not callable
     */
    public static void register(PyObject search_function) {
        registry_init();
        if (search_function.isCallable()) {
            searchPath.append(search_function);
            return;
        }
        throw Py.TypeError("argument must be callable");
    }

    /**
     * Resolves an encoding name to its codec 4-tuple.
     * <p>
     * The normalized name is tried against the cache first; otherwise each registered search
     * function is asked in turn, and the first non-None answer is cached and returned.
     *
     * @param encoding the encoding name, in any case and with spaces or hyphens
     * @return the 4-tuple produced by a search function
     * @throws PyException a LookupError when nothing is registered or no function knows the
     *         encoding; a TypeError when a function returns something other than a 4-tuple
     */
    public static PyTuple lookup(String encoding) {
        registry_init();
        final PyString key = new PyString(normalizestring(encoding));
        final PyObject hit = searchCache.__finditem__(key);
        if (hit != null) {
            return (PyTuple) hit;
        }

        if (searchPath.__len__() == 0) {
            throw new PyException(Py.LookupError,
                "no codec search functions registered: can't find encoding '" + encoding + "'");
        }

        for (PyObject searchFunction : searchPath.asIterable()) {
            final PyObject entry = searchFunction.__call__(key);
            if (entry != Py.None) {
                final boolean wellFormed = entry instanceof PyTuple && entry.__len__() == 4;
                if (!wellFormed) {
                    throw Py.TypeError("codec search functions must return 4-tuples");
                }
                searchCache.__setitem__(key, entry);
                return (PyTuple) entry;
            }
        }
        throw new PyException(Py.LookupError, "unknown encoding '" + encoding + "'");
    }

    /**
     * Normalizes an encoding name for registry lookups: lower-cases it and replaces spaces
     * with hyphens.
     * <p>
     * Lower-casing uses a fixed locale. With the platform default locale, a Turkish or Azeri
     * default maps 'I' to a dotless 'ı', so a name such as "ISO-8859-1" would no longer match
     * the literal names compared against elsewhere in this class.
     *
     * @param string the encoding name as given by the caller
     * @return the normalized name
     */
    private static String normalizestring(String string) {
        return string.toLowerCase(java.util.Locale.ENGLISH).replace(' ', '-');
    }
    /** Set once import_encodings has been attempted, whether or not it succeeded. */
    private static boolean import_encodings_called;

    /**
     * Loads the "encodings" module the first time it is called; later calls do nothing.
     * An ImportError from the load is tolerated; any other exception is rethrown.
     */
    private static void import_encodings() {
        if (import_encodings_called) {
            return;
        }
        // Flag first, so a failed attempt is not retried on the next call.
        import_encodings_called = true;
        try {
            imp.load("encodings");
        } catch (PyException exc) {
            final boolean moduleMissing = exc.type == Py.ImportError;
            if (!moduleMissing) {
                throw exc;
            }
        }
    }

    /**
     * Decodes a byte string using the named encoding.
     * <p>
     * "ascii" is handled directly. Any other encoding goes through the codec registry; if the
     * registry raises a LookupError, the built-in utf-8, utf-7 and latin-1 decoders are used
     * as a fallback.
     *
     * @param v the string to decode
     * @param encoding the encoding name, or {@code null} for the default encoding
     * @param errors the error handler name, or {@code null} to call the decoder without one
     * @return the decoded object
     * @throws PyException the lookup failure when no codec or built-in applies; a TypeError
     *         when the decoder does not return a 2-tuple
     */
    public static PyObject decode(PyString v, String encoding, String errors) {
        encoding = (encoding == null) ? getDefaultEncoding() : normalizestring(encoding);

        if (errors != null) {
            errors = errors.intern();
        }

        // Fast path: ascii never touches the registry.
        if (encoding.equals("ascii")) {
            return wrapDecodeResult(PyUnicode_DecodeASCII(v.toString(), v.__len__(), errors));
        }

        PyObject decoder;
        try {
            decoder = lookup(encoding).__getitem__(1);
        } catch (PyException ex) {
            if (!ex.match(Py.LookupError)) {
                throw ex;
            }
            // No registered codec: fall back to the decoders built into this class.
            if (encoding.equals("utf-8")) {
                return wrapDecodeResult(PyUnicode_DecodeUTF8(v.toString(), errors));
            }
            if (encoding.equals("utf-7")) {
                return wrapDecodeResult(PyUnicode_DecodeUTF7(v.toString(), errors));
            }
            if (encoding.equals("latin-1")) {
                return wrapDecodeResult(PyUnicode_DecodeLatin1(v.toString(), v.__len__(),
                    errors));
            }
            throw ex;
        }

        final PyObject result = (errors != null)
                ? decoder.__call__(v, new PyString(errors))
                : decoder.__call__(v);

        final boolean isPair = result instanceof PyTuple && result.__len__() == 2;
        if (!isPair) {
            throw Py.TypeError("decoder must return a tuple (object,integer)");
        }
        return result.__getitem__(0);
    }

    /**
     * Wraps decoded text in a PyUnicode, passing {@code true} as the constructor's second
     * argument.
     */
    private static PyUnicode wrapDecodeResult(String result) {
        final PyUnicode wrapped = new PyUnicode(result, true);
        return wrapped;
    }

    /**
     * Encodes a string using the named encoding.
     * <p>
     * "latin-1" and "ascii" are handled directly. Any other encoding goes through the codec
     * registry; if the registry raises a LookupError, the built-in utf-8 and utf-7 encoders
     * are used as a fallback.
     *
     * @param v the string to encode
     * @param encoding the encoding name, or {@code null} for the default encoding
     * @param errors the error handler name, or {@code null} to call the encoder without one
     * @return the encoded text
     * @throws PyException the lookup failure when no codec or built-in applies; a TypeError
     *         when the encoder does not return a 2-tuple whose first item is a PyString
     */
    public static String encode(PyString v, String encoding,
            String errors) {
        encoding = (encoding == null) ? getDefaultEncoding() : normalizestring(encoding);

        if (errors != null) {
            errors = errors.intern();
        }

        // latin-1 must bypass the lookup registry for the encodings module to work correctly.
        if (encoding.equals("latin-1")) {
            return PyUnicode_EncodeLatin1(v.toString(), v.__len__(), errors);
        }
        if (encoding.equals("ascii")) {
            return PyUnicode_EncodeASCII(v.toString(), v.__len__(), errors);
        }

        PyObject encoder;
        try {
            encoder = lookup(encoding).__getitem__(0);
        } catch (PyException ex) {
            if (!ex.match(Py.LookupError)) {
                throw ex;
            }
            // No registered codec: fall back to the encoders built into this class.
            if (encoding.equals("utf-8")) {
                return PyUnicode_EncodeUTF8(v.toString(), errors);
            }
            if (encoding.equals("utf-7")) {
                return codecs.PyUnicode_EncodeUTF7(v.toString(), false, false, errors);
            }
            throw ex;
        }

        final PyObject result = (errors != null)
                ? encoder.__call__(v, new PyString(errors))
                : encoder.__call__(v);

        final boolean isPair = result instanceof PyTuple && result.__len__() == 2;
        if (!isPair) {
            throw Py.TypeError("encoder must return a tuple (object,integer)");
        }
        final PyObject encoded = result.__getitem__(0);
        if (!(encoded instanceof PyString)) {
            throw Py.TypeError("encoder did not return a string/unicode object (type="
                    + encoded.getType().fastGetName() + ")");
        }
        return encoded.toString();
    }

    /**
     * The "strict" error handler: rethrows the Unicode error it is handed.
     *
     * @param args positional arguments; the first is the exception ("exc")
     * @param kws keyword names
     * @return never returns normally
     * @throws PyException the matching Unicode error, or a TypeError for any other object
     */
    public static PyObject strict_errors(PyObject[] args, String[] kws) {
        final PyObject exc = new ArgParser("strict_errors", args, kws, "exc").getPyObject(0);
        if (Py.isInstance(exc, Py.UnicodeDecodeError)) {
            throw new PyException(Py.UnicodeDecodeError, exc);
        }
        if (Py.isInstance(exc, Py.UnicodeEncodeError)) {
            throw new PyException(Py.UnicodeEncodeError, exc);
        }
        if (Py.isInstance(exc, Py.UnicodeTranslateError)) {
            throw new PyException(Py.UnicodeTranslateError, exc);
        }
        throw wrong_exception_type(exc);
    }

    /**
     * The "ignore" error handler: substitutes nothing and resumes at the error's end.
     *
     * @param args positional arguments; the first is the exception ("exc")
     * @param kws keyword names
     * @return a tuple of an empty replacement and the exception's {@code end} attribute
     * @throws PyException a TypeError when {@code exc} is not a Unicode error
     */
    public static PyObject ignore_errors(PyObject[] args, String[] kws) {
        final PyObject exc = new ArgParser("ignore_errors", args, kws, "exc").getPyObject(0);
        if (isUnicodeError(exc)) {
            final PyObject resumeAt = exc.__getattr__("end");
            return new PyTuple(Py.java2py(""), resumeAt);
        }
        throw wrong_exception_type(exc);
    }

    /** Tells whether {@code exc} is a Unicode decode, encode or translate error instance. */
    private static boolean isUnicodeError(PyObject exc) {
        if (Py.isInstance(exc, Py.UnicodeDecodeError)) {
            return true;
        }
        if (Py.isInstance(exc, Py.UnicodeEncodeError)) {
            return true;
        }
        return Py.isInstance(exc, Py.UnicodeTranslateError);
    }

    /**
     * The "replace" error handler: substitutes "?" for encode errors and U+FFFD for decode and
     * translate errors.
     *
     * @param args positional arguments; the first is the exception ("exc")
     * @param kws keyword names
     * @return a tuple of the replacement and the position at which to resume
     * @throws PyException a TypeError when {@code exc} is not a Unicode error
     */
    public static PyObject replace_errors(PyObject[] args, String[] kws) {
        final PyObject exc = new ArgParser("replace_errors", args, kws, "exc").getPyObject(0);

        if (Py.isInstance(exc, Py.UnicodeEncodeError)) {
            final int resumeAt = exceptions.getEnd(exc, true);
            return new PyTuple(new PyUnicode("?"), Py.newInteger(resumeAt));
        }
        if (Py.isInstance(exc, Py.UnicodeDecodeError)) {
            final int resumeAt = exceptions.getEnd(exc, false);
            return new PyTuple(new PyUnicode(Py_UNICODE_REPLACEMENT_CHARACTER),
                               Py.newInteger(resumeAt));
        }
        if (Py.isInstance(exc, Py.UnicodeTranslateError)) {
            final int resumeAt = exceptions.getEnd(exc, true);
            return new PyTuple(new PyUnicode(Py_UNICODE_REPLACEMENT_CHARACTER),
                               Py.newInteger(resumeAt));
        }
        throw wrong_exception_type(exc);
    }

    /**
     * The "xmlcharrefreplace" error handler: substitutes XML numeric character references
     * for the characters an encode error covers.
     *
     * @param args positional arguments; the first is the exception ("exc")
     * @param kws keyword names
     * @return a tuple of the replacement text and the exception's {@code end} attribute
     * @throws PyException a TypeError when {@code exc} is not a UnicodeEncodeError
     */
    public static PyObject xmlcharrefreplace_errors(PyObject[] args, String[] kws) {
        final PyObject exc =
                new ArgParser("xmlcharrefreplace_errors", args, kws, "exc").getPyObject(0);
        if (!Py.isInstance(exc, Py.UnicodeEncodeError)) {
            throw wrong_exception_type(exc);
        }
        final int from = ((PyInteger) exc.__getattr__("start")).getValue();
        final int to = ((PyInteger) exc.__getattr__("end")).getValue();
        final String text = exc.__getattr__("object").toString();
        final StringBuilder references = xmlcharrefreplace(from, to, text);
        return new PyTuple(Py.java2py(references.toString()), exc.__getattr__("end"));
    }

    /**
     * Builds XML numeric character references ({@code &#NNN;}) for a slice of a string.
     *
     * @param start index of the first UTF-16 unit to replace (inclusive)
     * @param end index just past the last UTF-16 unit to replace (exclusive)
     * @param toReplace the string containing the characters
     * @return a new builder holding one decimal reference per code point in the slice
     */
    public static StringBuilder xmlcharrefreplace(int start, int end, String toReplace) {
        StringBuilder replacement = new StringBuilder();
        xmlcharrefreplace_internal(start, end, toReplace, replacement);
        return replacement;
    }

    /**
     * Appends one {@code &#NNN;} reference per code point of {@code object[start:end]}.
     * <p>
     * A high surrogate immediately followed, inside the range, by a low surrogate is
     * combined into a single supplementary code point, so U+1F600 becomes
     * {@code &#128512;} rather than two surrogate references. An unpaired surrogate, or a
     * pair cut by {@code end}, is emitted as its own 16-bit value.
     */
    private static void xmlcharrefreplace_internal(int start, int end, String object, StringBuilder replacement) {
        for (int i = start; i < end; i++) {
            int codePoint = object.charAt(i);
            if (Character.isHighSurrogate((char) codePoint) && i + 1 < end) {
                char low = object.charAt(i + 1);
                if (Character.isLowSurrogate(low)) {
                    codePoint = Character.toCodePoint((char) codePoint, low);
                    i++; // the low surrogate has been consumed as part of the pair
                }
            }
            // StringBuilder.append(int) writes the decimal form without leading zeros.
            replacement.append("&#").append(codePoint).append(';');
        }
    }

    /**
     * Builds (but does not throw) the TypeError used when an error callback receives an
     * object it cannot handle; the message names the object's class.
     */
    private static PyException wrong_exception_type(PyObject exc) {
        final PyObject className = exc.__getattr__("__class__").__getattr__("__name__");
        final String message = "Don't know how to handle " + className + " in error callback";
        return new PyException(Py.TypeError, message);
    }
    /** Lower-case hexadecimal digits, indexed by nibble value (0-15). */
    static char hexdigits[] = {
        '0', '1', '2', '3', '4', '5', '6', '7',
        '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
    };

    /**
     * The "backslashreplace" error handler: substitutes backslash escapes for the characters
     * an encode error covers.
     *
     * @param args positional arguments; the first is the exception ("exc")
     * @param kws keyword names
     * @return a tuple of the replacement text and the exception's {@code end} attribute
     * @throws PyException a TypeError when {@code exc} is not a UnicodeEncodeError
     */
    public static PyObject backslashreplace_errors(PyObject[] args, String[] kws) {
        final PyObject exc =
                new ArgParser("backslashreplace_errors", args, kws, "exc").getPyObject(0);
        if (!Py.isInstance(exc, Py.UnicodeEncodeError)) {
            throw wrong_exception_type(exc);
        }
        final int from = ((PyInteger) exc.__getattr__("start")).getValue();
        final int to = ((PyInteger) exc.__getattr__("end")).getValue();
        final String text = exc.__getattr__("object").toString();
        final StringBuilder escapes = backslashreplace(from, to, text);
        return new PyTuple(Py.java2py(escapes.toString()), exc.__getattr__("end"));
    }

    /**
     * Builds backslash escapes for a slice of a string.
     *
     * @param start start of the slice
     * @param end end of the slice
     * @param toReplace the string containing the characters
     * @return a new builder holding the escapes
     */
    public static StringBuilder backslashreplace(int start, int end, String toReplace) {
        final StringBuilder escapes = new StringBuilder();
        backslashreplace_internal(start, end, toReplace, escapes);
        return escapes;
    }

    /**
     * Appends a backslash escape for each value yielded by a StringSubsequenceIterator over
     * {@code object} from {@code start} to {@code end}: {@code \\xNN} below 0x100,
     * {@code \\uNNNN} below 0x10000 and {@code \\UNNNNNNNN} otherwise.
     */
    private static void backslashreplace_internal(int start, int end, String object, StringBuilder replacement) {
        Iterator<Integer> values = new StringSubsequenceIterator(object, start, end, 1);
        while (values.hasNext()) {
            int c = values.next();
            final char marker;
            final int topShift; // bit offset of the most significant nibble to print
            if (c >= 0x00010000) {
                marker = 'U';
                topShift = 28;
            } else if (c >= 0x100) {
                marker = 'u';
                topShift = 12;
            } else {
                marker = 'x';
                topShift = 4;
            }
            replacement.append('\\');
            replacement.append(marker);
            for (int shift = topShift; shift >= 0; shift -= 4) {
                replacement.append(hexdigits[(c >> shift) & 0xf]);
            }
        }
    }

    /**
     * Creates the search path, the cache and the error-handler table on first use, registers
     * the built-in error handlers and then imports the encodings module. Later calls return
     * immediately. The whole body runs while holding INIT_LOCK.
     */
    private static void registry_init() {
        synchronized (INIT_LOCK) {
            if (searchPath == null) {
                // searchPath is assigned first: register_error re-enters this method and
                // must see the registry as already initialised.
                searchPath = new PyList();
                searchCache = new PyStringMap();
                errorHandlers = new PyStringMap();
                final String[] handlerNames = {
                    "strict", IGNORE, REPLACE, XMLCHARREFREPLACE, BACKSLASHREPLACE
                };
                for (int i = 0; i < handlerNames.length; i++) {
                    final String handlerName = handlerNames[i];
                    // Each handler is the static method "<name>_errors" of this class.
                    register_error(handlerName,
                            Py.newJavaFunc(codecs.class, handlerName + "_errors"));
                }
                import_encodings();
            }
        }
    }
    /* --- UTF-7 Codec -------------------------------------------------------- */

    /* see RFC2152 for details */
    /**
     * Classification of the 128 ASCII characters for UTF-7, indexed by character code.
     */
    public static char utf7_special[] = {
        /*
         * Indicates whether a UTF-7 character is special, i.e. cannot be directly
         * encoded:
         *   0 - not special
         *   1 - special
         *   2 - whitespace (optional)
         *   3 - RFC2152 Set O (optional)
         * Each row below covers 16 consecutive character codes.
         */
        1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
        3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
        3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1
    ,


           };

    /**
     * Tells whether a character must be base64-encoded in UTF-7.
     *
     * @param c the character to classify
     * @param encodeO whether class-3 (RFC2152 Set O) characters count as special
     * @param encodeWS whether class-2 (whitespace) characters count as special
     * @return true for any character above 127, any class-1 character, and the optional
     *         classes when enabled
     */
    private static boolean SPECIAL(char c, boolean encodeO, boolean encodeWS){
        if (c > 127) {
            return true; // outside the table: always encoded
        }
        final char kind = utf7_special[c];
        return kind == 1 || (encodeWS && kind == 2) || (encodeO && kind == 3);
    }
    /** The base64 alphabet, indexed by 6-bit value. */
    private static final String B64_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

    /** Returns the base64 digit for the low six bits of {@code n}; higher bits are ignored. */
    private static char B64(int n) {
        final int sextet = n & 0x3f;
        return B64_CHARS.charAt(sextet);
    }

    /** Tells whether {@code c} is one of the 64 characters of the base64 alphabet. */
    private static boolean B64CHAR(char c) {
        final int position = B64_CHARS.indexOf(c);
        return position >= 0;
    }

    /**
     * Maps a base64 digit back to its 6-bit value. The result is only meaningful for
     * characters of the base64 alphabet; nothing is validated here.
     */
    private static int UB64(char c) {
        if (c == '+') {
            return 62;
        }
        if (c == '/') {
            return 63;
        }
        if (c >= 'a') {
            return c - 71; // 'a'..'z' -> 26..51
        }
        if (c >= 'A') {
            return c - 65; // 'A'..'Z' -> 0..25
        }
        return c + 4;      // '0'..'9' -> 52..61
    }

    // note that we follow CPython 2.5 exactly here - it does not support surrogates,
    // but has to process as-if they are there for replacement purposes
    // fortunately no one really cares about utf-7
    /**
     * Decodes UTF-7 (RFC 2152) text.
     * <p>
     * Characters outside a shift sequence are copied through unless SPECIAL flags them. A
     * '+' opens a base64 shift sequence ("+-" is a literal '+'); inside it, base64 digits
     * are accumulated six bits at a time and emitted as 16-bit units. Problems are passed to
     * insertReplacementAndGetResume, whose return value is used as the index to continue from.
     *
     * @param str the UTF-7 encoded text
     * @param errors the error handler name, forwarded unchanged to
     *        insertReplacementAndGetResume
     * @return the decoded text
     */
    public static String PyUnicode_DecodeUTF7(String str, String errors) {
        int s = 0;
        int e = str.length();
        boolean inShift = false;
        int bitsInCharsleft = 0;   // number of not-yet-emitted bits held in charsleft
        long charsleft = 0;        // bit buffer for the current shift sequence
        boolean surrogate = false; // a unit in 0xDC00..0xDFFF was seen; reject the next unit
        StringBuilder unicode = new StringBuilder(e);
        while (s < e) {
            // restart:
            char ch = str.charAt(s);
            if (inShift) {
                if ((ch == '-') || !B64CHAR(ch)) {
                    // End of the shift sequence: flush whatever complete units remain.
                    inShift = false;
                    s++;
                    while (bitsInCharsleft >= 16) {
                        bitsInCharsleft -= 16;
                        char outCh = (char) ((charsleft >> bitsInCharsleft) & 0xffff);
                        if (surrogate) {
                            s = codecs.insertReplacementAndGetResume(unicode,
                                    errors,
                                    "utf-7",
                                    str,
                                    s,
                                    s + 1,
                                    "code pairs are not supported");
                            surrogate = false;
                        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {
                            surrogate = true;
                        } else {
                            unicode.append(outCh);
                        }
                    }
                    if (bitsInCharsleft >= 6) {
                        /*
                         * The shift sequence has a partial character in it. If
                         * bitsleft < 6 then we could just classify it as
                         * padding but that is not the case here
                         */
                        s = insertReplacementAndGetResume(unicode,
                                errors,
                                "utf-7",
                                str,
                                s,
                                s + 1,
                                "partial character in shift sequence");
                    }
                    /*
                     * According to RFC2152 the remaining bits should be zero.
                     * We choose to signal an error/insert a replacement
                     * character here to indicate the potential of a misencoded
                     * character.
                     */
                    if (bitsInCharsleft > 0 && ((charsleft << 5 - bitsInCharsleft) & 0x1f) > 0) {
                        s = insertReplacementAndGetResume(unicode,
                                errors,
                                "utf-7",
                                str,
                                s,
                                s + 1,
                                "non-zero padding bits in shift sequence");
                    }
                    if (ch == '-') {
                        if ((s < e) && (str.charAt(s) == '-')) {
                            unicode.append('-');
                            inShift = true;
                        }
                    } else if (SPECIAL(ch, false, false)) {
                        s = insertReplacementAndGetResume(unicode,
                                errors,
                                "utf-7",
                                str,
                                s,
                                s + 1,
                                "unexpected special character");
                    } else {
                        unicode.append(ch);
                    }
                } else {
                    // Another base64 digit: add its six bits and emit any complete units.
                    charsleft = (charsleft << 6) | UB64(ch);
                    bitsInCharsleft += 6;
                    s++;
                    while (bitsInCharsleft >= 16) {
                        bitsInCharsleft -= 16;
                        char outCh = (char) ((charsleft >> bitsInCharsleft) & 0xffff);
                        if (surrogate) {
                            s = codecs.insertReplacementAndGetResume(unicode,
                                    errors,
                                    "utf-7",
                                    str,
                                    s,
                                    s + 1,
                                    "code pairs are not supported");
                            // Clear the flag, as the flush loop above does. Leaving it set
                            // made every later unit fail under a non-strict error handler.
                            surrogate = false;
                        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {
                            surrogate = true;
                        } else {
                            unicode.append(outCh);
                        }
                    }
                }
            } else if (ch == '+') {
                s++;
                if (s < e && str.charAt(s) == '-') {
                    // "+-" is the escape for a literal plus sign.
                    s++;
                    unicode.append('+');
                } else {
                    inShift = true;
                    bitsInCharsleft = 0;
                }
            } else if (SPECIAL(ch, false, false)) {
                s = insertReplacementAndGetResume(unicode,
                        errors,
                        "utf-7",
                        str,
                        s,
                        s + 1,
                        "unexpected special character");
            } else {
                unicode.append(ch);
                s++;
            }
            if (inShift && s == e) {
                s = insertReplacementAndGetResume(unicode,
                        errors,
                        "utf-7",
                        str,
                        s,
                        s,
                        "unterminated shift sequence");
            }
        }
        return unicode.toString();
    }

    /**
     * Encodes text as UTF-7.
     * <p>
     * Characters that SPECIAL reports as not special are copied through; '+' is written as
     * "+-"; every other character is written as base64 inside a '+' ... shift sequence, 16
     * bits per UTF-16 unit.
     *
     * @param str the text to encode
     * @param encodeSetO forwarded to SPECIAL: whether RFC2152 Set O characters are encoded
     * @param encodeWhiteSpace forwarded to SPECIAL: whether whitespace is encoded
     * @param errors not read by this method
     * @return the UTF-7 text; the empty string for empty input
     */
    public static String PyUnicode_EncodeUTF7(String str,
                                              boolean encodeSetO,
                                              boolean encodeWhiteSpace,
                                              String errors) {
        int size = str.length();

        if (size == 0) {
            return "";
        }
        boolean inShift = false;
        int bitsleft = 0;  // number of pending bits in charsleft not yet written out
        int charsleft = 0; // bit buffer; only its low bitsleft bits are still pending

        StringBuilder v = new StringBuilder();

        for (int i = 0; i < size; ++i) {
            char ch = str.charAt(i);

            if (!inShift) {
                if (ch == '+') {
                    // A literal plus sign is escaped as "+-".
                    v.append('+');
                    v.append('-');
                } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
                    // Open a shift sequence and write as many full sextets as are available.
                    charsleft = ch;
                    bitsleft = 16;
                    v.append('+');
                    while (bitsleft >= 6) {
                        v.append(B64(charsleft >> (bitsleft - 6)));
                        bitsleft -= 6;
                    }
                    inShift = bitsleft > 0;
                } else {
                    v.append(ch);
                }
            } else {
                if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
                    // Leaving the shift sequence: write the pending bits, zero-padded on the
                    // right to a full sextet.
                    v.append(B64(charsleft << (6 - bitsleft)));
                    charsleft = 0;
                    bitsleft = 0;
                    /* Characters not in the BASE64 set implicitly unshift the sequence
                    so no '-' is required, except if the character is itself a '-' */
                    if (B64CHAR(ch) || ch == '-') {
                        v.append('-');
                    }
                    inShift = false;
                    v.append(ch);
                } else {
                    bitsleft += 16;
                    charsleft = (charsleft << 16) | ch;
                    while (bitsleft >= 6) {
                        v.append(B64(charsleft >> (bitsleft - 6)));
                        bitsleft -= 6;
                    }
                    /* If the next character is special then we don't need to terminate
                    the shift sequence. If the next character is not a BASE64 character
                    or '-' then the shift sequence will be terminated implicitly and we
                    don't have to insert a '-'. */

                    if (bitsleft == 0) {
                        if (i + 1 < size) {
                            char ch2 = str.charAt(i + 1);

                            if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
                                // Next character is encoded too: stay in the shift sequence.

                            } else if (B64CHAR(ch2) || ch2 == '-') {
                                v.append('-');
                                inShift = false;
                            } else {
                                inShift = false;
                            }

                        } else {
                            // End of input on a sextet boundary: close the sequence.
                            v.append('-');
                            inShift = false;
                        }
                    }
                }
            }
        }
        // Input ended with bits still pending: pad them to a sextet and close the sequence.
        if (bitsleft > 0) {
            v.append(B64(charsleft << (6 - bitsleft)));
            v.append('-');
        }
        return v.toString();
    }
    /* --- UTF-8 Codec ---------------------------------------------------- */
    /**
     * Sequence length implied by a UTF-8 lead byte, indexed by byte value (0-255).
     */
    private static byte utf8_code_length[] = {
        /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
        illegal prefix.  See RFC 2279 for details.  Each row covers 16 byte values:
        the first eight rows are single-byte (ASCII), the next four are
        continuation bytes (illegal as a prefix). */
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
    };


    // TODO: need to modify to use a codepoint approach (which is almost the case now,
    // since ch is already an int)
    /**
     * Decodes UTF-8 data in one shot; equivalent to the stateful decoder called with no
     * {@code consumed} array.
     *
     * @param str the UTF-8 data
     * @param errors the error handler name
     * @return the decoded text
     */
    public static String PyUnicode_DecodeUTF8(String str, String errors) {
        final int[] noConsumedCount = null;
        return PyUnicode_DecodeUTF8Stateful(str, errors, noConsumedCount);
    }

    /**
     * Decodes UTF-8 data held one byte value per char in {@code str}.
     * <p>
     * Malformed input is passed to insertReplacementAndGetResume, whose return value is used
     * as the index to continue from.
     * NOTE(review): that helper is defined outside this view; presumably it applies the
     * {@code errors} policy (raise, skip or substitute) — confirm.
     *
     * @param str the UTF-8 data; a char above 0xFF is reported as an error
     * @param errors the error handler name, forwarded to insertReplacementAndGetResume
     * @param consumed if non-null, decoding stops silently at a sequence that runs past the
     *        end of the input and {@code consumed[0]} receives the index reached; if null,
     *        such a sequence is reported as "unexpected end of data"
     * @return the text decoded so far
     */
    public static String PyUnicode_DecodeUTF8Stateful(String str, String errors, int[] consumed) {
        int size = str.length();
        StringBuilder unicode = new StringBuilder(size);

        /* Unpack UTF-8 encoded data */
        int i;
        for (i = 0; i < size;) {
            int ch = str.charAt(i);

            // ASCII fast path.
            if (ch < 0x80) {
                unicode.append((char) ch);
                i++;
                continue;
            }
            // Not a byte value at all, so it cannot index utf8_code_length.
            if (ch > 0xFF) {
                i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 1, "ordinal not in range(255)");
                continue;
            }

            // Sequence length implied by the lead byte (0 = illegal lead byte).
            int n = utf8_code_length[ch];

            if (i + n > size) {
                // Truncated sequence: a stateful caller gets to retry with more data.
                if (consumed != null) {
                    break;
                }
                i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 1, "unexpected end of data");
                continue;
            }


            switch (n) {
                case 0:
                    i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 1, "unexpected code byte");
                    continue;
                case 1:
                    // Length 1 means ch < 0x80, which was already handled above.
                    i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 1, "internal error");
                    continue;
                case 2:
                    char ch1 = str.charAt(i + 1);
                    // Continuation bytes must have the form 10xxxxxx.
                    if ((ch1 & 0xc0) != 0x80) {
                        i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 2, "invalid data");
                        continue;
                    }
                    ch = ((ch & 0x1f) << 6) + (ch1 & 0x3f);
                    // Reject overlong forms: values below 0x80 fit in one byte.
                    if (ch < 0x80) {
                        i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 2, "illegal encoding");
                        continue;
                    } else {
                        unicode.appendCodePoint(ch);
                    }
                    break;

                case 3:
                    ch1 = str.charAt(i + 1);
                    char ch2 = str.charAt(i + 2);
                    if ((ch1 & 0xc0) != 0x80 || (ch2 & 0xc0) != 0x80) {
                        i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 3, "invalid data");
                        continue;
                    }
                    ch = ((ch & 0x0f) << 12) + ((ch1 & 0x3f) << 6) + (ch2 & 0x3f);
                    // Reject overlong forms and encoded surrogates (0xD800..0xDFFF).
                    if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
                        i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 3, "illegal encoding");
                        continue;
                    } else {
                        unicode.appendCodePoint(ch);
                    }
                    break;

                case 4:
                    ch1 = str.charAt(i + 1);
                    ch2 = str.charAt(i + 2);
                    char ch3 = str.charAt(i + 3);
                    if ((ch1 & 0xc0) != 0x80 ||
                            (ch2 & 0xc0) != 0x80 ||
                            (ch3 & 0xc0) != 0x80) {
                        i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 4, "invalid data");
                        continue;
                    }
                    ch = ((ch & 0x7) << 18) + ((ch1 & 0x3f) << 12) +
                            ((ch2 & 0x3f) << 6) + (ch3 & 0x3f);
                    /* validate and convert to UTF-16 */
                    if ((ch < 0x10000) || /* minimum value allowed for 4
                            byte encoding */
                            (ch > 0x10ffff)) {  /* maximum value allowed for
                        UTF-16 */
                        i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 4, "illegal encoding");
                        continue;
                    }

                    // appendCodePoint writes a surrogate pair for supplementary code points.
                    unicode.appendCodePoint(ch);
                    break;

                default:
                    // TODO: support
                /* Other sizes (5 and 6 byte sequences) are only needed for UCS-4 */
                    i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + n, "unsupported Unicode code range");
                    continue;
            }
            // A well-formed sequence was decoded: step over all of its bytes.
            i += n;
        }

        if (consumed != null) {
            consumed[0] = i;
        }

        return unicode.toString();
    }

    /**
     * Encodes text as UTF-8 with the JDK charset encoder and converts the resulting bytes
     * with {@code StringUtil.fromBytes}.
     *
     * @param str the text to encode
     * @param errors not read by this method
     * @return the encoded bytes in string form
     */
    public static String PyUnicode_EncodeUTF8(String str, String errors) {
        final Charset utf8 = Charset.forName("UTF-8");
        return StringUtil.fromBytes(utf8.encode(str));
    }

    /**
     * Decodes ASCII data: every char of the first {@code size} chars must be below 128.
     *
     * @param str the data to decode
     * @param size the number of leading chars of {@code str} to decode
     * @param errors the error handler name
     * @return the decoded text
     */
    public static String PyUnicode_DecodeASCII(String str, int size, String errors) {
        final int asciiLimit = 128;
        return PyUnicode_DecodeIntLimited(str, size, errors, "ascii", asciiLimit);
    }

    /**
     * Decodes the first {@code size} characters of {@code str} as Latin-1, i.e. with
     * every ordinal required to be below 256.
     *
     * @param str the encoded input, one byte value per char
     * @param size number of leading characters of {@code str} to decode
     * @param errors error-handler name forwarded to the shared decoder
     * @return the decoded text
     */
    public static String PyUnicode_DecodeLatin1(String str, int size, String errors) {
        final int latin1Limit = 256;
        return PyUnicode_DecodeIntLimited(str, size, errors, "latin-1", latin1Limit);
    }

    /**
     * Shared decoder for single-byte encodings whose valid ordinals are exactly the
     * values below {@code limit}.
     *
     * @param str the encoded input
     * @param size number of leading characters of {@code str} to decode
     * @param errors error-handler name passed to insertReplacementAndGetResume
     * @param encoding encoding name reported to the error handler
     * @param limit exclusive upper bound for acceptable ordinals
     * @return the decoded text
     */
    private static String PyUnicode_DecodeIntLimited(String str, int size, String errors, String encoding, int limit) {
        final StringBuilder decoded = new StringBuilder(size);
        final String outOfRange = "ordinal not in range(" + limit + ")";

        int pos = 0;
        while (pos < size) {
            final char unit = str.charAt(pos);
            if (unit >= limit) {
                // Let the error handler contribute its replacement and tell us
                // where decoding resumes.
                pos = insertReplacementAndGetResume(decoded, errors, encoding, str,
                        pos, pos + 1, outOfRange);
                continue;
            }
            decoded.append(unit);
            pos++;
        }

        return decoded.toString();
    }

    /**
     * Encodes the first {@code size} characters of {@code str} as ASCII; characters
     * with ordinal 128 or above are routed through the error handler.
     *
     * @param str the text to encode
     * @param size number of leading characters of {@code str} to encode
     * @param errors error-handler name forwarded to the shared encoder
     * @return the encoded text
     */
    public static String PyUnicode_EncodeASCII(String str, int size,
            String errors) {
        final int asciiLimit = 128;
        return PyUnicode_EncodeIntLimited(str, size, errors, "ascii", asciiLimit);
    }

    /**
     * Encodes the first {@code size} characters of {@code str} as Latin-1; characters
     * with ordinal 256 or above are routed through the error handler.
     *
     * @param str the text to encode
     * @param size number of leading characters of {@code str} to encode
     * @param errors error-handler name forwarded to the shared encoder
     * @return the encoded text
     */
    public static String PyUnicode_EncodeLatin1(String str, int size,
            String errors) {
        final int latin1Limit = 256;
        return PyUnicode_EncodeIntLimited(str, size, errors, "latin-1", latin1Limit);
    }

    /**
     * Shared encoder for single-byte encodings whose valid ordinals are exactly the
     * values below {@code limit}. Each maximal run of unencodable characters is
     * handled as a unit: the built-in handlers are applied inline, any other handler
     * name is resolved through encoding_error.
     *
     * @param str the text to encode
     * @param size number of leading characters of {@code str} to encode
     * @param errors error-handler name (may be null)
     * @param encoding encoding name reported in errors
     * @param limit exclusive upper bound for encodable ordinals
     * @return the encoded text
     */
    private static String PyUnicode_EncodeIntLimited(String str, int size,
            String errors, String encoding, int limit) {
        final String outOfRange = "ordinal not in range(" + limit + ")";
        final StringBuilder encoded = new StringBuilder(size);

        int pos = 0;
        while (pos < size) {
            final char unit = str.charAt(pos);
            if (unit < limit) {
                encoded.append(unit);
                pos++;
                continue;
            }

            // Extend over the whole run of consecutive unencodable characters.
            int runEnd = pos + 1;
            while (runEnd < size && str.charAt(runEnd) >= limit) {
                runEnd++;
            }

            if (IGNORE.equals(errors)) {
                pos = runEnd;
            } else if (REPLACE.equals(errors)) {
                for (int k = pos; k < runEnd; k++) {
                    encoded.append('?');
                }
                pos = runEnd;
            } else if (XMLCHARREFREPLACE.equals(errors)) {
                encoded.append(xmlcharrefreplace(pos, runEnd, str));
                pos = runEnd;
            } else if (BACKSLASHREPLACE.equals(errors)) {
                encoded.append(backslashreplace(pos, runEnd, str));
                pos = runEnd;
            } else {
                // Any other handler (including the null default) goes through the
                // registry; its replacement must itself be encodable.
                final PyObject handlerResult = encoding_error(errors, encoding, str,
                        pos, runEnd, outOfRange);
                final String substitute = handlerResult.__getitem__(0).toString();
                for (int k = 0; k < substitute.length(); k++) {
                    if (substitute.charAt(k) >= limit) {
                        throw Py.UnicodeEncodeError(encoding, str, pos + k, pos + k + 1, outOfRange);
                    }
                }
                encoded.append(substitute);
                pos = calcNewPosition(size, handlerResult);
            }
        }
        return encoded.toString();
    }

    /**
     * Extracts the resume position from an error handler's result tuple.
     *
     * @param size length that positions are measured against; a negative position
     *            is interpreted relative to it
     * @param errorTuple handler result whose item 1 is a PyInteger position
     * @return the resume position, between 0 and {@code size} inclusive
     * @throws PyException an IndexError when the resolved position falls outside
     *             that range
     */
    public static int calcNewPosition(int size, PyObject errorTuple) {
        final PyInteger requested = (PyInteger) errorTuple.__getitem__(1);
        final int raw = requested.getValue();
        final int resolved = raw < 0 ? size + raw : raw;
        if (resolved < 0 || resolved > size) {
            throw Py.IndexError(resolved + " out of bounds of encoded string");
        }
        return resolved;
    }
    /* --- RawUnicodeEscape Codec ---------------------------------------- */
    // Upper-case hexadecimal digit characters, indexed by nibble value (0-15).
    private static char[] hexdigit = "0123456789ABCDEF".toCharArray();

    // The modified flag is used by cPickle.
    /**
     * Encodes text with the raw-unicode-escape scheme: supplementary code points
     * become a backslash-U escape with eight hex digits, code points of 256 and above
     * become a backslash-u escape with four hex digits, and everything else is copied
     * through unchanged.
     *
     * @param str the text to encode
     * @param errors error-handler name; not consulted by this implementation
     * @param modifed when true, backslash and newline are also written as four-digit
     *            escapes
     * @return the escaped text
     */
    public static String PyUnicode_EncodeRawUnicodeEscape(String str, String errors,
                                                          boolean modifed) {
        final StringBuilder out = new StringBuilder(str.length());
        final Iterator<Integer> codePoints = new PyUnicode(str).newSubsequenceIterator();

        while (codePoints.hasNext()) {
            final int cp = codePoints.next();

            final int hexWidth;
            if (cp >= Character.MIN_SUPPLEMENTARY_CODE_POINT) {
                // 32-bit form: eight hex digits
                out.append("\\U");
                hexWidth = 8;
            } else if (cp >= 256 || (modifed && (cp == '\\' || cp == '\n'))) {
                // 16-bit form: four hex digits
                out.append("\\u");
                hexWidth = 4;
            } else {
                out.append((char) cp);
                continue;
            }

            // Emit the nibbles from most to least significant.
            for (int shift = (hexWidth - 1) * 4; shift >= 0; shift -= 4) {
                out.append(hexdigit[(cp >> shift) & 0xF]);
            }
        }

        return out.toString();
    }

    /**
     * Decodes raw-unicode-escape text. Characters other than backslash are copied
     * through as-is. A run of backslashes is copied too; if the run has odd length
     * and is followed by 'u' (four hex digits) or 'U' (eight hex digits), the final
     * backslash plus the escape are replaced by the code point they denote.
     * <p>
     * Malformed escapes (too few hex digits, or an eight-digit value that is not a
     * valid Unicode code point) are reported through insertReplacementAndGetResume
     * rather than surfacing as a Java runtime exception.
     *
     * @param str the escaped text
     * @param errors error-handler name passed to insertReplacementAndGetResume
     * @return the decoded text
     */
    public static String PyUnicode_DecodeRawUnicodeEscape(String str, String errors) {
        int size = str.length();
        StringBuilder v = new StringBuilder(size);

        for (int i = 0; i < size;) {
            char ch = str.charAt(i);
            // Non-escape characters are interpreted as Unicode ordinals
            if (ch != '\\') {
                v.append(ch);
                i++;
                continue;
            }

            // \\u-escapes are only interpreted if the number of leading backslashes is
            // odd
            int bs = i;
            while (i < size) {
                ch = str.charAt(i);
                if (ch != '\\') {
                    break;
                }
                v.append(ch);
                i++;
            }
            if (((i - bs) & 1) == 0 || i >= size || (ch != 'u' && ch != 'U')) {
                continue;
            }
            // Drop the backslash that introduces the escape; it was copied above.
            v.setLength(v.length() - 1);
            int count = ch == 'u' ? 4 : 8;
            i++;

            // \\uXXXX with 4 hex digits, \Uxxxxxxxx with 8
            int codePoint = 0, asDigit = -1;
            for (int j = 0; j < count; i++, j++) {
                if (i == size) {
                    // EOF in a truncated escape
                    asDigit = -1;
                    break;
                }

                ch = str.charAt(i);
                asDigit = Character.digit(ch, 16);
                if (asDigit == -1) {
                    break;
                }
                codePoint = ((codePoint << 4) & ~0xF) + asDigit;
            }
            if (asDigit == -1) {
                i = codecs.insertReplacementAndGetResume(v, errors, "rawunicodeescape", str, bs, i,
                                                         "truncated \\uXXXX");
            } else if (!Character.isValidCodePoint(codePoint)) {
                // Eight hex digits can spell a value above 0x10FFFF (or a negative
                // int); appendCodePoint would throw IllegalArgumentException for it.
                i = codecs.insertReplacementAndGetResume(v, errors, "rawunicodeescape", str, bs, i,
                                                         "\\Uxxxxxxxx out of range");
            } else {
                v.appendCodePoint(codePoint);
            }
        }

        return v.toString();
    }

    /**
     * Holder for the Punycode parameter values and helper functions; see
     * http://www.ietf.org/rfc/rfc3492.txt. Not instantiable.
     */
    private static class Punycode {
        private static final int BASE = 36;
        private static final int TMIN = 1;
        private static final int TMAX = 26;
        private static final int SKEW = 38;
        private static final int DAMP = 700;
        private static final int INITIAL_BIAS = 72;
        private static final int INITIAL_N = 128;
        /** Code points below this value count as basic. */
        private static final int BASIC = 0x80;

        private Punycode() {
        }

        /**
         * Computes the next bias value from the most recent delta.
         *
         * @param delta the delta just encoded or decoded
         * @param numpoints divisor applied to the scaled delta
         * @param firsttime true to scale by DAMP, false to halve
         * @return the adapted bias
         */
        private static int adapt(int delta, int numpoints, boolean firsttime) {
            int scaled;
            if (firsttime) {
                scaled = delta / DAMP;
            } else {
                scaled = delta >> 1;
            }
            scaled += scaled / numpoints;

            final int range = BASE - TMIN;
            final int threshold = (range * TMAX) / 2;
            int offset = 0;
            for (; scaled > threshold; offset += BASE) {
                scaled /= range;
            }
            return offset + (((range + 1) * scaled) / (scaled + SKEW));
        }

        /** Tells whether a code point lies below the BASIC boundary. */
        private static boolean isBasic(int codePoint) {
            return BASIC > codePoint;
        }
    }

    /**
     * Encodes a unicode object with the Punycode algorithm of RFC 3492, section 6.3.
     * Basic (below 0x80) code points are copied first, followed by a '-' delimiter if
     * there were any; the remaining code points are then emitted in increasing order
     * as variable-length integers written with the digits a-z (0..25) and 0-9
     * (26..35).
     *
     * @param input the text to encode
     * @param errors error-handler name; not consulted by this implementation
     * @return the Punycode form of {@code input}
     * @throws PyException a UnicodeEncodeError with reason "overflow" if a delta does
     *             not fit in an int
     */
    public static String PyUnicode_EncodePunycode(PyUnicode input,
            String errors) {
        int n = Punycode.INITIAL_N;
        int delta = 0;
        int bias = Punycode.INITIAL_BIAS;
        int b = 0;
        final StringBuilder buffer = new StringBuilder();
        for (Iterator<Integer> iter = input.iterator(); iter.hasNext();) {
            int c = iter.next();
            if (Punycode.isBasic(c)) {
                buffer.appendCodePoint(c);
                b++;
            }
        }
        if (b > 0) {
            buffer.append('-');
        }
        int h = b;
        int size = input.getCodePointCount();
        while (h < size) {
            // Find the smallest code point >= n that has not been handled yet. The
            // comparison must be inclusive: n is one past the last handled value, so
            // a code point equal to n is the next candidate.
            int m = Integer.MAX_VALUE;
            int i = 0;
            int codePointIndex = 0;
            for (Iterator<Integer> iter = input.iterator(); iter.hasNext(); i++) {
                int c = iter.next();
                if (c >= n && c < m) {
                    m = c;
                    codePointIndex = i;
                }
            }
            // Widen before multiplying so the overflow check can actually trigger.
            long guard_delta = delta + ((long) m - n) * (h + 1);
            if (guard_delta > Integer.MAX_VALUE) {
                throw Py.UnicodeEncodeError("punycode", input.getString(), codePointIndex, codePointIndex + 1, "overflow");
            }
            delta = (int) guard_delta;

            n = m;
            i = 0;
            for (Iterator<Integer> iter = input.iterator(); iter.hasNext(); i++) {
                int c = iter.next();
                if (c < n) {
                    if (delta == Integer.MAX_VALUE) {
                        throw Py.UnicodeEncodeError("punycode", input.getString(), i, i + 1, "overflow");
                    }
                    delta++;
                }
                if (c == n) {
                    // Write delta as a generalized variable-length integer.
                    int q = delta;
                    for (int k = Punycode.BASE;; k += Punycode.BASE) {
                        int t = k <= bias ? Punycode.TMIN : (k >= bias + Punycode.TMAX ? Punycode.TMAX : k - bias);
                        if (q < t) {
                            break;
                        }
                        buffer.append(punycodeEncodeDigit(t + ((q - t) % (Punycode.BASE - t))));
                        q = (q - t) / (Punycode.BASE - t);
                    }
                    buffer.append(punycodeEncodeDigit(q));
                    bias = Punycode.adapt(delta, h + 1, h == b);
                    delta = 0;
                    h++;
                }
            }
            delta++;
            n++;
        }
        return buffer.toString();
    }

    /** Maps a Punycode digit value 0..35 to its character: a-z for 0..25, 0-9 for 26..35. */
    private static char punycodeEncodeDigit(int digit) {
        return (char) (digit < 26 ? 'a' + digit : '0' + (digit - 26));
    }

    /**
     * Decodes Punycode text following RFC 3492, section 6.2. Everything before the
     * last '-' is copied as basic code points; the remainder is read as a sequence of
     * variable-length integers (digits a-z or A-Z for 0..25, 0-9 for 26..35), each of
     * which determines a code point and the position at which it is inserted.
     *
     * @param input the Punycode text
     * @param errors error-handler name; not consulted by this implementation
     * @return the decoded unicode object
     * @throws PyException a UnicodeDecodeError if the basic part contains a non-basic
     *             code point, a digit is invalid, the input ends in the middle of an
     *             integer, or a value overflows
     */
    public static PyUnicode PyUnicode_DecodePunycode(String input, String errors) {

        int input_size = input.length();
        ArrayList<Integer> ucs4 = new ArrayList<Integer>(input_size);

        // The basic code points are those before the LAST delimiter; with no
        // delimiter the whole input is the extended part.
        int lastDelimiter = input.lastIndexOf('-');
        for (int p = 0; p < lastDelimiter; p++) {
            int c = input.charAt(p);
            if (!Punycode.isBasic(c)) {
                throw Py.UnicodeDecodeError("punycode", input, p, p + 1, "not basic");
            }
            ucs4.add(c);
        }
        // Step over the delimiter itself (lastIndexOf gives -1 when there is none).
        int j = lastDelimiter + 1;

        int n = Punycode.INITIAL_N;
        int i = 0;
        int bias = Punycode.INITIAL_BIAS;
        while (j < input_size) {
            int old_i = i;
            int w = 1;
            int numberStart = j;
            for (int k = Punycode.BASE;; k += Punycode.BASE) {
                if (j >= input_size) {
                    throw Py.UnicodeDecodeError("punycode", input, numberStart, input_size,
                            "incomplete punycode string");
                }
                int pos = j++;
                int c = input.charAt(pos);
                int digit = punycodeDecodeDigit(c);
                if (digit < 0) {
                    throw Py.UnicodeDecodeError("punycode", input, pos, pos + 1,
                            Punycode.isBasic(c) ? "invalid extended code point" : "not basic");
                }
                long guard_i = i + (long) digit * w;
                if (guard_i > Integer.MAX_VALUE) {
                    throw Py.UnicodeDecodeError("punycode", input, pos, pos + 1, "overflow");
                }
                i = (int) guard_i;
                int t = k <= bias ? Punycode.TMIN : (k >= bias + Punycode.TMAX ? Punycode.TMAX : k - bias);
                if (digit < t) {
                    break;
                }
                long guard_w = (long) w * (Punycode.BASE - t);
                if (guard_w > Integer.MAX_VALUE) {
                    throw Py.UnicodeDecodeError("punycode", input, pos, pos + 1, "overflow");
                }
                w = (int) guard_w;
            }
            int outputSizePlusOne = ucs4.size() + 1;
            bias = Punycode.adapt(i - old_i, outputSizePlusOne, old_i == 0);
            long guard_n = (long) n + i / outputSizePlusOne;
            if (guard_n > Character.MAX_CODE_POINT) {
                throw Py.UnicodeDecodeError("punycode", input, numberStart, j, "overflow");
            }
            n = (int) guard_n;
            i %= outputSizePlusOne;
            ucs4.add(i, n);
            // The next insertion position is measured from just after this one.
            i++;
        }
        return new PyUnicode(ucs4);
    }

    /**
     * Maps a Punycode digit character to its value: a-z and A-Z give 0..25, 0-9 give
     * 26..35. Returns -1 for any other character.
     */
    private static int punycodeDecodeDigit(int c) {
        if (c >= 'a' && c <= 'z') {
            return c - 'a';
        }
        if (c >= 'A' && c <= 'Z') {
            return c - 'A';
        }
        if (c >= '0' && c <= '9') {
            return c - '0' + 26;
        }
        return -1;
    }

    /**
     * IDNA encoding is not implemented: this method always throws
     * UnsupportedOperationException. The numbered steps in the comments inside the
     * body outline the procedure an implementation would have to follow.
     *
     * @param input the text to encode (unused)
     * @param errors error-handler name (unused)
     * @return never returns normally
     * @throws UnsupportedOperationException always
     */
    public static String PyUnicode_EncodeIDNA(PyUnicode input,
            String errors) {

        throw new UnsupportedOperationException();


//   1. If the sequence contains any code points outside the ASCII range
//      (0..7F) then proceed to step 2, otherwise skip to step 3.
//
//   2. Perform the steps specified in [NAMEPREP] and fail if there is an
//      error.  The AllowUnassigned flag is used in [NAMEPREP].
// this basically entails changing out space, etc.
//
//   3. If the UseSTD3ASCIIRules flag is set, then perform these checks:
//
//     (a) Verify the absence of non-LDH ASCII code points; that is, the
//         absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
//
//     (b) Verify the absence of leading and trailing hyphen-minus; that
//         is, the absence of U+002D at the beginning and end of the
//         sequence.
//
//   4. If the sequence contains any code points outside the ASCII range
//      (0..7F) then proceed to step 5, otherwise skip to step 8.
//
//   5. Verify that the sequence does NOT begin with the ACE prefix.
//
//   6. Encode the sequence using the encoding algorithm in [PUNYCODE] and
//      fail if there is an error.
//
//   7. Prepend the ACE prefix.
//
//   8. Verify that the number of code points is in the range 1 to 63
//      inclusive.

    }

    /**
     * IDNA decoding is not implemented: this method always throws
     * UnsupportedOperationException.
     *
     * @param input the text to decode (unused)
     * @param errors error-handler name (unused)
     * @return never returns normally
     * @throws UnsupportedOperationException always
     */
    public static PyUnicode PyUnicode_DecodeIDNA(String input, String errors) {
        throw new UnsupportedOperationException();
    }

    /* --- Utility methods -------------------------------------------- */
    /**
     * Builds a UnicodeEncodeError for the given span, passes it to the registered
     * error handler and returns the handler's validated result.
     *
     * @param errors name of the error handler to look up
     * @param encoding encoding name recorded in the exception
     * @param toEncode the text being encoded
     * @param start start index of the offending span
     * @param end end index of the offending span
     * @param reason explanation recorded in the exception
     * @return the handler's result, already checked by checkErrorHandlerReturn
     */
    public static PyObject encoding_error(String errors,
            String encoding,
            String toEncode,
            int start,
            int end,
            String reason) {
        final PyObject handler = lookup_error(errors);
        final PyException failure = Py.UnicodeEncodeError(encoding, toEncode, start, end, reason);
        failure.normalize();

        final PyObject[] handlerArgs = {failure.value};
        final PyObject result = handler.__call__(handlerArgs);
        checkErrorHandlerReturn(errors, result);
        return result;
    }

    /**
     * Handles a decoding error for the span {@code [start, end)}: appends whatever
     * the selected error policy calls for to {@code partialDecode} and reports where
     * decoding should continue.
     * <p>
     * The "ignore" policy appends nothing and "replace" appends one replacement
     * character per position in the span; both resume at {@code end}. Any other
     * policy (including null) is delegated to decoding_error.
     *
     * @param partialDecode output accumulated so far; receives the replacement
     * @param errors error-handler name (may be null)
     * @param encoding encoding name reported to the handler
     * @param toDecode the input being decoded
     * @param start start index of the offending span
     * @param end end index of the offending span
     * @param reason explanation reported to the handler
     * @return the index at which decoding resumes
     */
    public static int insertReplacementAndGetResume(StringBuilder partialDecode,
            String errors,
            String encoding,
            String toDecode,
            int start,
            int end,
            String reason) {
        if (IGNORE.equals(errors)) {
            return end;
        }
        if (REPLACE.equals(errors)) {
            for (int k = start; k < end; k++) {
                partialDecode.appendCodePoint(Py_UNICODE_REPLACEMENT_CHARACTER);
            }
            return end;
        }

        final PyObject handlerResult = decoding_error(errors, encoding, toDecode,
                start, end, reason);
        checkErrorHandlerReturn(errors, handlerResult);
        final String substitute = handlerResult.__getitem__(0).toString();
        partialDecode.append(substitute);
        return calcNewPosition(toDecode.length(), handlerResult);
    }

    /**
     * Builds a UnicodeDecodeError for the given span and passes it to the registered
     * error handler. Unlike encoding_error, the handler's result is returned without
     * being validated here.
     *
     * @param errors name of the error handler to look up
     * @param encoding encoding name recorded in the exception
     * @param toEncode the input being decoded
     * @param start start index of the offending span
     * @param end end index of the offending span
     * @param reason explanation recorded in the exception
     * @return whatever the handler returns
     */
    public static PyObject decoding_error(String errors,
            String encoding,
            String toEncode,
            int start,
            int end,
            String reason) {
        final PyObject handler = lookup_error(errors);
        final PyException failure = Py.UnicodeDecodeError(encoding, toEncode, start, end, reason);
        failure.normalize();

        final PyObject[] handlerArgs = {failure.value};
        return handler.__call__(handlerArgs);
    }

    /**
     * Verifies that an error handler returned a two-element tuple whose first item
     * is a string and whose second item is an integer.
     *
     * @param errors handler name, used only in the error message
     * @param replacement the handler's return value
     * @throws PyException a TypeError when the value does not have that shape
     */
    private static void checkErrorHandlerReturn(String errors,
            PyObject replacement) {
        // Evaluated left to right with short-circuiting, so items are only fetched
        // once the value is known to be a tuple of length two.
        final boolean wellFormed = replacement instanceof PyTuple
                && replacement.__len__() == 2
                && replacement.__getitem__(0) instanceof PyBaseString
                && replacement.__getitem__(1) instanceof PyInteger;
        if (wellFormed) {
            return;
        }
        throw new PyException(Py.TypeError, "error_handler " + errors + " must return a tuple of (replacement, new position)");
    }
}


/**
 * Iterates over the Unicode code points of a String, from code point index
 * {@code start} (inclusive) up to {@code stop} (exclusive), advancing {@code step}
 * code points per call to {@link #next()}.
 * <p>
 * Surrogate handling follows {@link String#codePointAt(int)}: a high surrogate
 * immediately followed by a low surrogate is one supplementary code point, and any
 * unpaired surrogate is returned as its own code point. This matches how
 * {@link String#codePointCount(int, int)} counts, which the bounds logic relies on.
 */
class StringSubsequenceIterator implements Iterator<Integer> {

    private final String s;
    private final int step;
    /** Code point index of the element the next call to next() returns. */
    private int current;
    /** UTF-16 index into s that corresponds to current. */
    private int k;
    /** Exclusive upper bound for current, clamped to the code point count. */
    private int stop;

    /**
     * @param s the string to iterate over
     * @param start code point index of the first element
     * @param stop code point index one past the last element
     * @param step number of code points to advance per element
     */
    StringSubsequenceIterator(String s, int start, int stop, int step) {
        this.s = s;
        this.k = 0;
        this.current = start;
        this.stop = stop;
        this.step = step;

        // this bounds checking is necessary to convert between use of code units elsewhere, and codepoints here
        // it would be nice if it were unnecessary!
        int count = getCodePointCount(s);
        if (start >= count) {
            // Nothing to iterate; do not try to skip past the end of the string.
            this.stop = -1;
        } else {
            if (stop >= count) {
                this.stop = count;
            }
            // Move the UTF-16 cursor forward to the first requested code point.
            for (int i = 0; i < start; i++) {
                nextCodePoint();
            }
        }
    }

    /** Iterates over every code point of {@code s}. */
    StringSubsequenceIterator(String s) {
        this(s, 0, getCodePointCount(s), 1);
    }

    private static int getCodePointCount(String s) {
        return s.codePointCount(0, s.length());
    }

    public boolean hasNext() {
        return current < stop;
    }

    /**
     * Returns the code point at the current position, then skips ahead so that the
     * following call yields the code point {@code step} positions later.
     *
     * @throws java.util.NoSuchElementException if the iteration has no more elements
     */
    public Integer next() {
        if (!hasNext()) {
            throw new java.util.NoSuchElementException();
        }
        int codePoint = nextCodePoint();
        current += 1;
        for (int j = 1; j < step && hasNext(); j++) {
            nextCodePoint();
            current += 1;
        }
        return codePoint;
    }

    /** Reads the code point at the UTF-16 cursor and advances past it. */
    private int nextCodePoint() {
        int codePoint = s.codePointAt(k);
        k += Character.charCount(codePoint);
        return codePoint;
    }

    public void remove() {
        throw new UnsupportedOperationException("Not supported on String objects (immutable)");
    }
}