// codecs.java example
//
// Explorer
// HBuilder-opensource-master
/*
 * Copyright 2000 Finn Bock
 *
 * This program contains material copyrighted by:
 * Copyright (c) Corporation for National Research Initiatives.
 * Originally written by Marc-Andre Lemburg (mal@lemburg.com).
 */

package org.python.core;

/**
 * Contains the implementation of the builtin codecs.
 *
 * <p>The class keeps a registry of codec search functions together with a
 * cache of the lookups already performed, and provides Java implementations
 * of the ASCII, UTF-8 and raw-unicode-escape codecs. Encoded data is carried
 * in ordinary {@code String}s; the UTF-8 routines treat each char of the
 * encoded form as one byte value.
 *
 * @since Jython 2.0
 */

public class codecs {
    /** Character appended to decoder output for undecodable input when errors is "replace". */
    private static final char Py_UNICODE_REPLACEMENT_CHARACTER = 0xFFFD;

    /** Registered codec search functions, consulted in registration order. */
    private static final PyList searchPath = new PyList();

    /** Results of earlier successful lookups, keyed by normalized encoding name. */
    private static final PyStringMap searchCache = new PyStringMap();

    private static String default_encoding = "ascii";

    /**
     * Returns the encoding used by {@link #decode} and {@link #encode} when
     * they are called with a {@code null} encoding.
     */
    public static String getDefaultEncoding() {
        return default_encoding;
    }

    /**
     * Changes the default encoding.
     *
     * <p>The name is passed through {@link #lookup} first, so an unknown
     * encoding raises before the default is replaced.
     *
     * @param encoding the new default encoding name
     */
    public static void setDefaultEncoding(String encoding) {
        lookup(encoding);
        default_encoding = encoding;
    }

    /**
     * Appends a codec search function to the registry.
     *
     * @param search_function a callable taking a normalized encoding name
     * @throws PyException TypeError if the argument is not callable
     */
    public static void register(PyObject search_function) {
        if (!search_function.isCallable()) {
            throw Py.TypeError("argument must be callable");
        }
        searchPath.append(search_function);
    }

    /**
     * Finds the codec tuple for an encoding.
     *
     * <p>The name is normalized, then looked up in the cache; on a miss each
     * registered search function is called in turn until one returns
     * something other than {@code None}. A successful result is cached.
     *
     * @param encoding the encoding name (any case, spaces allowed)
     * @return the 4-tuple returned by the search function
     * @throws PyException LookupError if no search function is registered or
     *         none knows the encoding; TypeError if a search function returns
     *         anything other than {@code None} or a 4-tuple
     */
    public static PyTuple lookup(String encoding) {
        import_encodings();
        PyString v = new PyString(normalizestring(encoding));
        PyObject result = searchCache.__finditem__(v);
        if (result != null) {
            return (PyTuple) result;
        }

        if (searchPath.__len__() == 0) {
            throw new PyException(Py.LookupError, "no codec search functions registered: " + "can't find encoding");
        }

        PyObject iter = searchPath.__iter__();
        PyObject func = null;
        while ((func = iter.__iternext__()) != null) {
            result = func.__call__(v);
            if (result == Py.None) {
                continue;
            }
            if (!(result instanceof PyTuple) || result.__len__() != 4) {
                throw Py.TypeError("codec search functions must " + "return 4-tuples");
            }
            break;
        }
        // func is null only when the iterator ran dry, i.e. every search
        // function answered None.
        if (func == null) {
            throw new PyException(Py.LookupError, "unknown encoding " + encoding);
        }
        searchCache.__setitem__(v, result);
        return (PyTuple) result;
    }

    /**
     * Normalizes an encoding name: lower case, spaces replaced by hyphens.
     *
     * <p>A fixed locale is used for the case conversion so the result does
     * not depend on the JVM default locale (under a Turkish locale "I" would
     * otherwise lower-case to a dotless i and the name would never match).
     */
    private static String normalizestring(String string) {
        return string.toLowerCase(java.util.Locale.ENGLISH).replace(' ', '-');
    }

    private static boolean import_encodings_called = false;

    /**
     * Imports the {@code encodings} package the first time it is called.
     *
     * <p>The flag is set before the import is attempted, so a failed import
     * is not retried. An ImportError is ignored; any other exception is
     * propagated.
     */
    private static void import_encodings() {
        if (!import_encodings_called) {
            import_encodings_called = true;
            try {
                __builtin__.__import__("encodings");
            } catch (PyException exc) {
                if (exc.type != Py.ImportError) {
                    throw exc;
                }
            }
        }
    }

    /**
     * Decodes a string using the named encoding.
     *
     * @param v the encoded string
     * @param encoding the encoding name, or {@code null} for the default encoding
     * @param errors the error handling scheme, or {@code null}
     * @return the string form of the first item of the decoder's result
     * @throws PyException TypeError if the decoder does not return a 2-tuple
     */
    public static String decode(PyString v, String encoding, String errors) {
        if (encoding == null) {
            encoding = getDefaultEncoding();
        } else {
            encoding = normalizestring(encoding);
        }

        // The error helpers in this class compare by value; interning is kept
        // only so the instance handed on stays the canonical one.
        if (errors != null) {
            errors = errors.intern();
        }

        /* Shortcut for the common default encoding; everything else goes
           through the registry. */
        if (encoding.equals("ascii")) {
            return PyUnicode_DecodeASCII(v.toString(), v.__len__(), errors);
        }

        /* Decode via the codec registry */
        PyObject decoder = getDecoder(encoding);
        PyObject result;
        if (errors != null) {
            result = decoder.__call__(v, new PyString(errors));
        } else {
            result = decoder.__call__(v);
        }

        if (!(result instanceof PyTuple) || result.__len__() != 2) {
            throw Py.TypeError("decoder must return a tuple " + "(object,integer)");
        }
        return result.__getitem__(0).toString();
    }

    /** Returns item 1 of the codec tuple found by {@link #lookup}. */
    private static PyObject getDecoder(String encoding) {
        PyObject codecs = lookup(encoding);
        return codecs.__getitem__(1);
    }

    /**
     * Encodes a string using the named encoding.
     *
     * @param v the string to encode
     * @param encoding the encoding name, or {@code null} for the default encoding
     * @param errors the error handling scheme, or {@code null}
     * @return the string form of the first item of the encoder's result
     * @throws PyException TypeError if the encoder does not return a 2-tuple
     */
    public static String encode(PyString v, String encoding, String errors) {
        if (encoding == null) {
            encoding = getDefaultEncoding();
        } else {
            encoding = normalizestring(encoding);
        }

        // See decode(): interning is no longer required by this class.
        if (errors != null) {
            errors = errors.intern();
        }

        /* Shortcut for the common default encoding; everything else goes
           through the registry. */
        if (encoding.equals("ascii")) {
            return PyUnicode_EncodeASCII(v.toString(), v.__len__(), errors);
        }

        /* Encode via the codec registry */
        PyObject encoder = getEncoder(encoding);
        PyObject result;
        if (errors != null) {
            result = encoder.__call__(v, new PyString(errors));
        } else {
            result = encoder.__call__(v);
        }

        if (!(result instanceof PyTuple) || result.__len__() != 2) {
            throw Py.TypeError("encoder must return a tuple " + "(object,integer)");
        }
        return result.__getitem__(0).toString();
    }

    /** Returns item 0 of the codec tuple found by {@link #lookup}. */
    private static PyObject getEncoder(String encoding) {
        PyObject codecs = lookup(encoding);
        return codecs.__getitem__(0);
    }

    /* --- UTF-8 Codec ---------------------------------------------------- */

    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    private static final byte utf8_code_length[] = {
            /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0xA0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0xB0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            /* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
            /* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
            /* 0xE0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
            /* 0xF0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 };

    /**
     * Tells whether a char is a UTF-8 continuation byte (binary 10xxxxxx).
     * The whole char is tested, so values above 0xFF are rejected instead of
     * being judged on their low byte only.
     */
    private static boolean isUtf8Continuation(char c) {
        return (c & 0xFFC0) == 0x80;
    }

    /**
     * Decodes UTF-8 data, held one byte per char in {@code str}, into a
     * UTF-16 string.
     *
     * <p>Every problem is reported through {@link #decoding_error}; when
     * that call returns (errors is "ignore" or "replace") decoding resumes
     * one char after the offending lead char. Code points above 0xFFFF are
     * emitted as a surrogate pair. Sequences of five or six bytes are
     * reported as unsupported.
     *
     * @param str the encoded data
     * @param errors the error handling scheme, or {@code null} for strict
     * @return the decoded string
     */
    public static String PyUnicode_DecodeUTF8(String str, String errors) {
        int size = str.length();
        StringBuffer unicode = new StringBuffer(size);

        /* Unpack UTF-8 encoded data */
        for (int i = 0; i < size;) {
            int ch = str.charAt(i);
            if (ch > 0xFF) {
                codecs.decoding_error("utf-8", unicode, errors, "ordinal not in range(255)");
                i++;
                continue;
            }

            if (ch < 0x80) {
                unicode.append((char) ch);
                i++;
                continue;
            }

            int n = utf8_code_length[ch];

            if (i + n > size) {
                codecs.decoding_error("utf-8", unicode, errors, "unexpected end of data");
                i++;
                continue;
            }

            switch (n) {
                case 0:
                    codecs.decoding_error("utf-8", unicode, errors, "unexpected code byte");
                    i++;
                    continue;
                case 1:
                    // Length 1 belongs to chars below 0x80, which were
                    // handled before the table lookup.
                    codecs.decoding_error("utf-8", unicode, errors, "internal error");
                    i++;
                    continue;
                case 2: {
                    char ch1 = str.charAt(i + 1);
                    if (!isUtf8Continuation(ch1)) {
                        codecs.decoding_error("utf-8", unicode, errors, "invalid data");
                        i++;
                        continue;
                    }
                    ch = ((ch & 0x1f) << 6) + (ch1 & 0x3f);
                    if (ch < 0x80) {
                        /* overlong form of a one-byte value */
                        codecs.decoding_error("utf-8", unicode, errors, "illegal encoding");
                        i++;
                        continue;
                    }
                    unicode.append((char) ch);
                    break;
                }
                case 3: {
                    char ch1 = str.charAt(i + 1);
                    char ch2 = str.charAt(i + 2);
                    if (!isUtf8Continuation(ch1) || !isUtf8Continuation(ch2)) {
                        codecs.decoding_error("utf-8", unicode, errors, "invalid data");
                        i++;
                        continue;
                    }
                    ch = ((ch & 0x0f) << 12) + ((ch1 & 0x3f) << 6) + (ch2 & 0x3f);
                    /* overlong form, or a surrogate code point */
                    if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
                        codecs.decoding_error("utf-8", unicode, errors, "illegal encoding");
                        i++;
                        continue;
                    }
                    unicode.append((char) ch);
                    break;
                }
                case 4: {
                    char ch1 = str.charAt(i + 1);
                    char ch2 = str.charAt(i + 2);
                    char ch3 = str.charAt(i + 3);
                    if (!isUtf8Continuation(ch1) || !isUtf8Continuation(ch2) || !isUtf8Continuation(ch3)) {
                        codecs.decoding_error("utf-8", unicode, errors, "invalid data");
                        i++;
                        continue;
                    }
                    ch = ((ch & 0x7) << 18) + ((ch1 & 0x3f) << 12) + ((ch2 & 0x3f) << 6) + (ch3 & 0x3f);
                    /* validate and convert to UTF-16:
                       0x10000 is the minimum value allowed for a 4 byte
                       encoding, 0x10ffff the maximum value allowed for
                       UTF-16 */
                    if (ch < 0x10000 || ch > 0x10ffff) {
                        codecs.decoding_error("utf-8", unicode, errors, "illegal encoding");
                        i++;
                        continue;
                    }
                    /*  compute and append the two surrogates: */

                    /*  translate from 10000..10FFFF to 0..FFFFF */
                    ch -= 0x10000;

                    /*  high surrogate = top 10 bits added to D800 */
                    unicode.append((char) (0xD800 + (ch >> 10)));

                    /*  low surrogate = bottom 10 bits added to DC00 */
                    unicode.append((char) (0xDC00 + (ch & 0x3FF)));
                    break;
                }
                default:
                    /* Other sizes are only needed for UCS-4 */
                    codecs.decoding_error("utf-8", unicode, errors, "unsupported Unicode code range");
                    i++;
                    // Resume right after the lead char like every other
                    // error path; falling through to "i += n" would also
                    // swallow the following n chars.
                    continue;
            }
            i += n;
        }

        return unicode.toString();
    }

    /**
     * Encodes a UTF-16 string as UTF-8, one byte per char of the result.
     *
     * <p>A high surrogate immediately followed by a low surrogate is
     * combined and written as a four-byte sequence. Any other surrogate
     * (unpaired, or a low surrogate in lead position) is written as a
     * three-byte sequence of its own value, so the output never contains
     * continuation bytes without a lead byte. Note that
     * {@link #PyUnicode_DecodeUTF8} reports such three-byte surrogate
     * sequences as "illegal encoding".
     *
     * @param str the string to encode
     * @param errors unused by this method
     * @return the encoded data
     */
    public static String PyUnicode_EncodeUTF8(String str, String errors) {
        int size = str.length();
        StringBuffer v = new StringBuffer(size * 3);

        for (int i = 0; i < size;) {
            int ch = str.charAt(i++);
            if (ch < 0x80) {
                v.append((char) ch);
            } else if (ch < 0x0800) {
                v.append((char) (0xc0 | (ch >> 6)));
                v.append((char) (0x80 | (ch & 0x3f)));
            } else {
                if (0xD800 <= ch && ch <= 0xDBFF && i < size) {
                    int ch2 = str.charAt(i);
                    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
                        /* combine the two values */
                        int cp = (((ch - 0xD800) << 10) | (ch2 - 0xDC00)) + 0x10000;

                        v.append((char) (0xf0 | (cp >> 18)));
                        v.append((char) (0x80 | ((cp >> 12) & 0x3f)));
                        v.append((char) (0x80 | ((cp >> 6) & 0x3f)));
                        v.append((char) (0x80 | (cp & 0x3f)));
                        i++; /* the low surrogate has been consumed */
                        continue;
                    }
                }
                v.append((char) (0xe0 | (ch >> 12)));
                v.append((char) (0x80 | ((ch >> 6) & 0x3f)));
                v.append((char) (0x80 | (ch & 0x3f)));
            }
        }
        return v.toString();
    }

    /* --- 7-bit ASCII Codec -------------------------------------------- */

    /**
     * Decodes the first {@code size} chars of {@code str} as ASCII.
     *
     * <p>Chars below 128 are copied; any other char is reported through
     * {@link #decoding_error}.
     *
     * @param str the encoded data
     * @param size the number of chars of {@code str} to process
     * @param errors the error handling scheme, or {@code null} for strict
     * @return the decoded string
     */
    public static String PyUnicode_DecodeASCII(String str, int size, String errors) {
        StringBuffer v = new StringBuffer(size);

        for (int i = 0; i < size; i++) {
            char ch = str.charAt(i);
            if (ch < 128) {
                v.append(ch);
            } else {
                decoding_error("ascii", v, errors, "ordinal not in range(128)");
            }
        }

        return v.toString();
    }

    /**
     * Encodes the first {@code size} chars of {@code str} as ASCII.
     *
     * <p>Chars below 128 are copied; any other char is reported through
     * {@link #encoding_error}.
     *
     * @param str the string to encode
     * @param size the number of chars of {@code str} to process
     * @param errors the error handling scheme, or {@code null} for strict
     * @return the encoded data
     */
    public static String PyUnicode_EncodeASCII(String str, int size, String errors) {
        StringBuffer v = new StringBuffer(size);

        for (int i = 0; i < size; i++) {
            char ch = str.charAt(i);
            if (ch >= 128) {
                encoding_error("ascii", v, errors, "ordinal not in range(128)");
            } else {
                v.append(ch);
            }
        }
        return v.toString();
    }

    /* --- RawUnicodeEscape Codec ---------------------------------------- */

    private static final char[] hexdigit = "0123456789ABCDEF".toCharArray();

    /**
     * Encodes a string with the raw-unicode-escape codec: chars of 256 and
     * above become {@code \}{@code uXXXX} (four upper-case hex digits), all
     * others are copied.
     *
     * @param str the string to encode
     * @param errors unused by this method
     * @param modifed when true, newline and backslash are escaped as well.
     *        The modified flag is used by cPickle.
     * @return the encoded data
     */
    public static String PyUnicode_EncodeRawUnicodeEscape(String str, String errors, boolean modifed) {

        int size = str.length();
        StringBuffer v = new StringBuffer(str.length());

        for (int i = 0; i < size; i++) {
            char ch = str.charAt(i);
            if (ch >= 256 || (modifed && (ch == '\n' || ch == '\\'))) {
                v.append("\\u");
                v.append(hexdigit[(ch >>> 12) & 0xF]);
                v.append(hexdigit[(ch >>> 8) & 0xF]);
                v.append(hexdigit[(ch >>> 4) & 0xF]);
                v.append(hexdigit[ch & 0xF]);
            } else {
                v.append(ch);
            }
        }

        return v.toString();
    }

    /**
     * Decodes raw-unicode-escape data.
     *
     * <p>All chars are copied except that a {@code u} preceded by an odd
     * number of backslashes starts an escape of exactly four hex digits. An
     * escape with fewer than four hex digits (bad digit or end of input) is
     * reported through {@link #decoding_error}; when that call returns, the
     * incomplete escape contributes no char of its own and decoding resumes
     * at the first char that was not a hex digit.
     *
     * @param str the encoded data
     * @param errors the error handling scheme, or {@code null} for strict
     * @return the decoded string
     */
    public static String PyUnicode_DecodeRawUnicodeEscape(String str, String errors) {
        int size = str.length();
        StringBuffer v = new StringBuffer(size);

        for (int i = 0; i < size;) {
            char ch = str.charAt(i);

            /* Non-escape characters are interpreted as Unicode ordinals */
            if (ch != '\\') {
                v.append(ch);
                i++;
                continue;
            }

            /* \\u-escapes are only interpreted iff the number of leading
               backslashes is odd */
            int bs = i;
            while (i < size) {
                ch = str.charAt(i);
                if (ch != '\\') {
                    break;
                }
                v.append(ch);
                i++;
            }
            if (((i - bs) & 1) == 0 || i >= size || ch != 'u') {
                continue;
            }
            /* drop the backslash that introduces the escape, skip the 'u' */
            v.setLength(v.length() - 1);
            i++;

            /* \\uXXXX with 4 hex digits; never read past the end of input */
            int x = 0;
            int digits = 0;
            while (digits < 4 && i + digits < size) {
                int d = Character.digit(str.charAt(i + digits), 16);
                if (d == -1) {
                    break;
                }
                x = (x << 4) + d;
                digits++;
            }
            i += digits;
            if (digits < 4) {
                codecs.decoding_error("unicode escape", v, errors, "truncated \\uXXXX");
                continue;
            }
            v.append((char) x);
        }
        return v.toString();
    }

    /* --- Utility methods -------------------------------------------- */

    /**
     * Applies the error handling scheme to an encoding problem.
     *
     * <p>{@code null} or "strict" raises UnicodeError, "ignore" does
     * nothing, "replace" appends {@code '?'} to {@code dest}. The scheme is
     * compared by value, so callers need not intern it.
     *
     * @param type the codec name used in the message
     * @param dest the output being built
     * @param errors the error handling scheme, or {@code null}
     * @param details the description of the problem
     * @throws PyException UnicodeError in strict mode; ValueError for an
     *         unknown scheme
     */
    public static void encoding_error(String type, StringBuffer dest, String errors, String details) {
        if (errors == null || "strict".equals(errors)) {
            throw Py.UnicodeError(type + " encoding error: " + details);
        } else if ("ignore".equals(errors)) {
            //ignore
        } else if ("replace".equals(errors)) {
            dest.append('?');
        } else {
            throw Py.ValueError(type + " encoding error; " + "unknown error handling code: " + errors);
        }
    }

    /**
     * Applies the error handling scheme to a decoding problem.
     *
     * <p>{@code null} or "strict" raises UnicodeError, "ignore" does
     * nothing, "replace" appends U+FFFD to {@code dest} when {@code dest} is
     * not {@code null}. The scheme is compared by value, so callers need not
     * intern it.
     *
     * @param type the codec name used in the message
     * @param dest the output being built; may be {@code null}
     * @param errors the error handling scheme, or {@code null}
     * @param details the description of the problem
     * @throws PyException UnicodeError in strict mode; ValueError for an
     *         unknown scheme
     */
    public static void decoding_error(String type, StringBuffer dest, String errors, String details) {
        if (errors == null || "strict".equals(errors)) {
            throw Py.UnicodeError(type + " decoding error: " + details);
        } else if ("ignore".equals(errors)) {
            //ignore
        } else if ("replace".equals(errors)) {
            if (dest != null) {
                dest.append(Py_UNICODE_REPLACEMENT_CHARACTER);
            }
        } else {
            throw Py.ValueError(type + " decoding error; " + "unknown error handling code: " + errors);
        }
    }
}