/* * Copyright 2000 Finn Bock * * This program contains material copyrighted by: * Copyright (c) Corporation for National Research Initiatives. * Originally written by Marc-Andre Lemburg (mal@lemburg.com). */ package org.python.modules; import java.nio.ByteBuffer; import java.nio.charset.Charset; import java.util.Iterator; import org.python.core.Py; import org.python.core.PyDictionary; import org.python.core.PyInteger; import org.python.core.PyNone; import org.python.core.PyObject; import org.python.core.PyString; import org.python.core.PySystemState; import org.python.core.PyTuple; import org.python.core.PyUnicode; import org.python.core.codecs; import org.python.expose.ExposedType; public class _codecs { public static void register(PyObject search_function) { codecs.register(search_function); } public static PyTuple lookup(String encoding) { return codecs.lookup(encoding); } public static PyObject lookup_error(String handlerName) { return codecs.lookup_error(handlerName); } public static void register_error(String name, PyObject errorHandler) { codecs.register_error(name, errorHandler); } public static PyObject charmap_build(PyUnicode map) { return EncodingMap.buildEncodingMap(map); } private static PyTuple decode_tuple(String s, int len) { return new PyTuple(new PyUnicode(s), Py.newInteger(len)); } private static PyTuple decode_tuple_str(String s, int len) { return new PyTuple(new PyString(s), Py.newInteger(len)); } private static PyTuple encode_tuple(String s, int len) { return new PyTuple(new PyString(s), Py.newInteger(len)); } /* --- UTF-8 Codec --------------------------------------------------- */ public static PyTuple utf_8_decode(String str) { return utf_8_decode(str, null); } public static PyTuple utf_8_decode(String str, String errors) { return utf_8_decode(str, errors, false); } public static PyTuple utf_8_decode(String str, String errors, boolean final_) { int[] consumed = final_ ? null : new int[1]; return decode_tuple(codecs.PyUnicode_DecodeUTF8Stateful(str, errors, consumed), final_ ? str.length() : consumed[0]); } public static PyTuple utf_8_encode(String str) { return utf_8_encode(str, null); } public static PyTuple utf_8_encode(String str, String errors) { int size = str.length(); return encode_tuple(codecs.PyUnicode_EncodeUTF8(str, errors), size); } /* --- UTF-7 Codec --------------------------------------------------- */ public static PyTuple utf_7_decode(String str) { return utf_7_decode(str, null); } public static PyTuple utf_7_decode(String str, String errors) { int size = str.length(); return decode_tuple(codecs.PyUnicode_DecodeUTF7(str, errors), size); } public static PyTuple utf_7_encode(String str) { return utf_7_encode(str, null); } public static PyTuple utf_7_encode(String str, String errors) { int size = str.length(); return encode_tuple(codecs.PyUnicode_EncodeUTF7(str, false, false, errors), size); } public static PyTuple escape_decode(String str) { return escape_decode(str, null); } public static PyTuple escape_decode(String str, String errors) { return decode_tuple_str(PyString.decode_UnicodeEscape(str, 0, str.length(), errors, true), str.length()); } public static PyTuple escape_encode(String str) { return escape_encode(str, null); } public static PyTuple escape_encode(String str, String errors) { return encode_tuple(PyString.encode_UnicodeEscape(str, false), str.length()); } /* --- Character Mapping Codec --------------------------------------- */ public static PyTuple charmap_decode(String str, String errors, PyObject mapping) { return charmap_decode(str, errors, mapping, false); } public static PyTuple charmap_decode(String str, String errors, PyObject mapping, boolean ignoreUnmapped) { int size = str.length(); StringBuilder v = new StringBuilder(size); for (int i = 0; i < size; i++) { char ch = str.charAt(i); if (ch > 0xFF) { i = codecs.insertReplacementAndGetResume(v, errors, "charmap", str, i, i + 1, "ordinal not in range(255)") - 1; continue; } PyObject w = Py.newInteger(ch); PyObject x = mapping.__finditem__(w); if (x == null) { if (ignoreUnmapped) { v.append(ch); } else { i = codecs.insertReplacementAndGetResume(v, errors, "charmap", str, i, i + 1, "no mapping found") - 1; } continue; } /* Apply mapping */ if (x instanceof PyInteger) { int value = ((PyInteger) x).getValue(); if (value < 0 || value > PySystemState.maxunicode) { throw Py.TypeError("character mapping must return " + "integer greater than 0 and less than sys.maxunicode"); } v.append((char) value); } else if (x == Py.None) { i = codecs.insertReplacementAndGetResume(v, errors, "charmap", str, i, i + 1, "character maps to <undefined>") - 1; } else if (x instanceof PyString) { v.append(x.toString()); } else { /* wrong return value */ throw Py.TypeError("character mapping must return " + "integer, None or str"); } } return decode_tuple(v.toString(), size); } // parallel to CPython's PyUnicode_TranslateCharmap public static PyObject translateCharmap(PyUnicode str, String errors, PyObject mapping) { StringBuilder buf = new StringBuilder(str.toString().length()); for (Iterator<Integer> iter = str.newSubsequenceIterator(); iter.hasNext();) { int codePoint = iter.next(); PyObject result = mapping.__finditem__(Py.newInteger(codePoint)); if (result == null) { // No mapping found means: use 1:1 mapping buf.appendCodePoint(codePoint); } else if (result == Py.None) { // XXX: We don't support the fancier error handling CPython does here of // capturing regions of chars removed by the None mapping to optionally // pass to an error handler. Though we don't seem to even use this // functionality anywhere either ; } else if (result instanceof PyInteger) { int value = result.asInt(); if (value < 0 || value > PySystemState.maxunicode) { throw Py.TypeError(String.format("character mapping must be in range(0x%x)", PySystemState.maxunicode + 1)); } buf.appendCodePoint(value); } else if (result instanceof PyUnicode) { buf.append(result.toString()); } else { // wrong return value throw Py.TypeError("character mapping must return integer, None or unicode"); } } return new PyUnicode(buf.toString()); } public static PyTuple charmap_encode(String str, String errors, PyObject mapping) { //Default to Latin-1 if (mapping == null) { return latin_1_encode(str, errors); } return charmap_encode_internal(str, errors, mapping, new StringBuilder(str.length()), true); } private static PyTuple charmap_encode_internal(String str, String errors, PyObject mapping, StringBuilder v, boolean letLookupHandleError) { EncodingMap encodingMap = mapping instanceof EncodingMap ? (EncodingMap)mapping : null; int size = str.length(); for (int i = 0; i < size; i++) { char ch = str.charAt(i); PyObject x; if (encodingMap != null) { int result = encodingMap.lookup(ch); if (result == -1) { x = null; } else { x = Py.newInteger(result); } } else { x = mapping.__finditem__(Py.newInteger(ch)); } if (x == null) { if (letLookupHandleError) { i = handleBadMapping(str, errors, mapping, v, size, i); } else { throw Py.UnicodeEncodeError("charmap", str, i, i + 1, "character maps to <undefined>"); } } else if (x instanceof PyInteger) { int value = ((PyInteger) x).getValue(); if (value < 0 || value > 255) { throw Py.TypeError("character mapping must be in range(256)"); } v.append((char) value); } else if (x instanceof PyString && !(x instanceof PyUnicode)) { v.append(x.toString()); } else if (x instanceof PyNone) { i = handleBadMapping(str, errors, mapping, v, size, i); } else { /* wrong return value */ throw Py.TypeError("character mapping must return " + "integer, None or str"); } } return encode_tuple(v.toString(), size); } private static int handleBadMapping(String str, String errors, PyObject mapping, StringBuilder v, int size, int i) { if (errors != null) { if (errors.equals(codecs.IGNORE)) { return i; } else if (errors.equals(codecs.REPLACE)) { charmap_encode_internal("?", errors, mapping, v, false); return i; } else if (errors.equals(codecs.XMLCHARREFREPLACE)) { charmap_encode_internal(codecs.xmlcharrefreplace(i, i + 1, str).toString(), errors, mapping, v, false); return i; } else if (errors.equals(codecs.BACKSLASHREPLACE)) { charmap_encode_internal(codecs.backslashreplace(i, i + 1, str).toString(), errors, mapping, v, false); return i; } } PyObject replacement = codecs.encoding_error(errors, "charmap", str, i, i + 1, "character maps to <undefined>"); String replStr = replacement.__getitem__(0).toString(); charmap_encode_internal(replStr, errors, mapping, v, false); return codecs.calcNewPosition(size, replacement) - 1; } public static PyTuple ascii_decode(String str) { return ascii_decode(str, null); } public static PyTuple ascii_decode(String str, String errors) { int size = str.length(); return decode_tuple(codecs.PyUnicode_DecodeASCII(str, size, errors), size); } public static PyTuple ascii_encode(String str) { return ascii_encode(str, null); } public static PyTuple ascii_encode(String str, String errors) { int size = str.length(); return encode_tuple(codecs.PyUnicode_EncodeASCII(str, size, errors), size); } /* --- Latin-1 Codec -------------------------------------------- */ public static PyTuple latin_1_decode(String str) { return latin_1_decode(str, null); } public static PyTuple latin_1_decode(String str, String errors) { int size = str.length(); return decode_tuple(codecs.PyUnicode_DecodeLatin1(str, size, errors), size); } public static PyTuple latin_1_encode(String str) { return latin_1_encode(str, null); } public static PyTuple latin_1_encode(String str, String errors) { int size = str.length(); return encode_tuple(codecs.PyUnicode_EncodeLatin1(str, size, errors), size); } /* --- UTF16 Codec -------------------------------------------- */ public static PyTuple utf_16_encode(String str) { return utf_16_encode(str, null); } public static PyTuple utf_16_encode(String str, String errors) { return encode_tuple(encode_UTF16(str, errors, 0), str.length()); } public static PyTuple utf_16_encode(String str, String errors, int byteorder) { return encode_tuple(encode_UTF16(str, errors, byteorder), str.length()); } public static PyTuple utf_16_le_encode(String str) { return utf_16_le_encode(str, null); } public static PyTuple utf_16_le_encode(String str, String errors) { return encode_tuple(encode_UTF16(str, errors, -1), str.length()); } public static PyTuple utf_16_be_encode(String str) { return utf_16_be_encode(str, null); } public static PyTuple utf_16_be_encode(String str, String errors) { return encode_tuple(encode_UTF16(str, errors, 1), str.length()); } public static String encode_UTF16(String str, String errors, int byteorder) { final Charset utf16; if (byteorder == 0) { utf16 = Charset.forName("UTF-16"); } else if (byteorder == -1) { utf16 = Charset.forName("UTF-16LE"); } else { utf16 = Charset.forName("UTF-16BE"); } final ByteBuffer bbuf = utf16.encode(str); final StringBuilder v = new StringBuilder(bbuf.limit()); while (bbuf.remaining() > 0) { int val = bbuf.get(); if (val < 0) { val = 256 + val; } v.appendCodePoint(val); } return v.toString(); } public static PyTuple utf_16_decode(String str) { return utf_16_decode(str, null); } public static PyTuple utf_16_decode(String str, String errors) { return utf_16_decode(str, errors, false); } public static PyTuple utf_16_decode(String str, String errors, boolean final_) { int[] bo = new int[] { 0 }; int[] consumed = final_ ? null : new int[1]; return decode_tuple(decode_UTF16(str, errors, bo, consumed), final_ ? str.length() : consumed[0]); } public static PyTuple utf_16_le_decode(String str) { return utf_16_le_decode(str, null); } public static PyTuple utf_16_le_decode(String str, String errors) { return utf_16_le_decode(str, errors, false); } public static PyTuple utf_16_le_decode(String str, String errors, boolean final_) { int[] bo = new int[] { -1 }; int[] consumed = final_ ? null : new int[1]; return decode_tuple(decode_UTF16(str, errors, bo, consumed), final_ ? str.length() : consumed[0]); } public static PyTuple utf_16_be_decode(String str) { return utf_16_be_decode(str, null); } public static PyTuple utf_16_be_decode(String str, String errors) { return utf_16_be_decode(str, errors, false); } public static PyTuple utf_16_be_decode(String str, String errors, boolean final_) { int[] bo = new int[] { 1 }; int[] consumed = final_ ? null : new int[1]; return decode_tuple(decode_UTF16(str, errors, bo, consumed), final_ ? str.length() : consumed[0]); } public static PyTuple utf_16_ex_decode(String str) { return utf_16_ex_decode(str, null); } public static PyTuple utf_16_ex_decode(String str, String errors) { return utf_16_ex_decode(str, errors, 0); } public static PyTuple utf_16_ex_decode(String str, String errors, int byteorder) { return utf_16_ex_decode(str, errors, byteorder, false); } public static PyTuple utf_16_ex_decode(String str, String errors, int byteorder, boolean final_) { int[] bo = new int[] { 0 }; int[] consumed = final_ ? null : new int[1]; String decoded = decode_UTF16(str, errors, bo, consumed); return new PyTuple(Py.newString(decoded), Py.newInteger(final_ ? str.length() : consumed[0]), Py.newInteger(bo[0])); } private static String decode_UTF16(String str, String errors, int[] byteorder) { return decode_UTF16(str, errors, byteorder, null); } private static String decode_UTF16(String str, String errors, int[] byteorder, int[] consumed) { int bo = 0; if (byteorder != null) { bo = byteorder[0]; } int size = str.length(); StringBuilder v = new StringBuilder(size / 2); int i; for (i = 0; i < size; i += 2) { char ch1 = str.charAt(i); if (i + 1 == size) { if (consumed != null) { break; } i = codecs.insertReplacementAndGetResume(v, errors, "utf-16", str, i, i + 1, "truncated data"); continue; } char ch2 = str.charAt(i + 1); if (ch1 == 0xFE && ch2 == 0xFF) { bo = 1; continue; } else if (ch1 == 0xFF && ch2 == 0xFE) { bo = -1; continue; } int W1; if (bo == -1) { W1 = (ch2 << 8 | ch1); } else { W1 = (ch1 << 8 | ch2); } if (W1 < 0xD800 || W1 > 0xDFFF) { v.appendCodePoint(W1); continue; } else if (W1 >= 0xD800 && W1 <= 0xDBFF && i < size - 1) { i += 2; char ch3 = str.charAt(i); char ch4 = str.charAt(i + 1); int W2; if (bo == -1) { W2 = (ch4 << 8 | ch3); } else { W2 = (ch3 << 8 | ch4); } if (W2 >= 0xDC00 && W2 <= 0xDFFF) { int U = (((W1 & 0x3FF) << 10) | (W2 & 0x3FF)) + 0x10000; v.appendCodePoint(U); continue; } i = codecs.insertReplacementAndGetResume(v, errors, "utf-16", str, i, i + 1, "illegal UTF-16 surrogate"); continue; } i = codecs.insertReplacementAndGetResume(v, errors, "utf-16", str, i, i + 1, "illegal encoding"); } if (byteorder != null) { byteorder[0] = bo; } if (consumed != null) { consumed[0] = i; } return v.toString(); } /* --- RawUnicodeEscape Codec ----------------------------------------- */ public static PyTuple raw_unicode_escape_encode(String str) { return raw_unicode_escape_encode(str, null); } public static PyTuple raw_unicode_escape_encode(String str, String errors) { return encode_tuple(codecs.PyUnicode_EncodeRawUnicodeEscape(str, errors, false), str.length()); } public static PyTuple raw_unicode_escape_decode(String str) { return raw_unicode_escape_decode(str, null); } public static PyTuple raw_unicode_escape_decode(String str, String errors) { return decode_tuple(codecs.PyUnicode_DecodeRawUnicodeEscape(str, errors), str.length()); } /* --- UnicodeEscape Codec -------------------------------------------- */ public static PyTuple unicode_escape_encode(String str) { return unicode_escape_encode(str, null); } public static PyTuple unicode_escape_encode(String str, String errors) { return encode_tuple(PyString.encode_UnicodeEscape(str, false), str.length()); } public static PyTuple unicode_escape_decode(String str) { return unicode_escape_decode(str, null); } public static PyTuple unicode_escape_decode(String str, String errors) { int n = str.length(); return decode_tuple(PyString.decode_UnicodeEscape(str, 0, n, errors, true), n); } /* --- UnicodeInternal Codec ------------------------------------------ */ public static PyTuple unicode_internal_encode(String str) { return unicode_internal_encode(str, null); } public static PyTuple unicode_internal_encode(String str, String errors) { return encode_tuple(str, str.length()); } public static PyTuple unicode_internal_decode(String str) { return unicode_internal_decode(str, null); } public static PyTuple unicode_internal_decode(String str, String errors) { return decode_tuple(str, str.length()); } /** * Optimized charmap encoder mapping. * * Uses a trie structure instead of a dictionary; the speedup primarily comes from not * creating integer objects in the process. The trie is created by inverting the * encoding map. */ @ExposedType(name = "EncodingMap", isBaseType = false) public static class EncodingMap extends PyObject { char[] level1; char[] level23; int count2; int count3; private EncodingMap(char[] level1, char[] level23, int count2, int count3) { this.level1 = level1; this.level23 = level23; this.count2 = count2; this.count3 = count3; } /** * Create and populate an EncodingMap from a 256 length PyUnicode char. Returns a * PyDictionary if the mapping isn't easily optimized. * * @param string a 256 length unicode mapping * @return an encoder mapping */ public static PyObject buildEncodingMap(PyObject string) { if (!(string instanceof PyUnicode) || string.__len__() != 256) { throw Py.TypeError("bad argument type for built-in operation"); } boolean needDict = false; char[] level1 = new char[32]; char[] level23 = new char[512]; int i; int count2 = 0; int count3 = 0; String decode = string.toString(); for (i = 0; i < level1.length; i++) { level1[i] = 0xFF; } for (i = 0; i < level23.length; i++) { level23[i] = 0xFF; } if (decode.charAt(0) != 0) { needDict = true; } for (i = 1; i < 256; i++) { int l1, l2; char charAt = decode.charAt(i); if (charAt == 0) { needDict = true; } if (charAt == 0xFFFE) { // unmapped character continue; } l1 = charAt >> 11; l2 = charAt >> 7; if (level1[l1] == 0xFF) { level1[l1] = (char)count2++; } if (level23[l2] == 0xFF) { level23[l2] = (char)count3++; } } if (count2 > 0xFF || count3 > 0xFF) { needDict = true; } if (needDict) { PyObject result = new PyDictionary(); for (i = 0; i < 256; i++) { result.__setitem__(Py.newInteger(decode.charAt(i)), Py.newInteger(i)); } return result; } // Create a three-level trie int length2 = 16 * count2; int length3 = 128 * count3; level23 = new char[length2 + length3]; PyObject result = new EncodingMap(level1, level23, count2, count3); for (i = 0; i < length2; i++) { level23[i] = 0xFF; } for (i = length2; i < length2 + length3; i++) { level23[i] = 0; } count3 = 0; for (i = 1; i < 256; i++) { int o1, o2, o3, i2, i3; char charAt = decode.charAt(i); if (charAt == 0xFFFE) { // unmapped character continue; } o1 = charAt >> 11; o2 = (charAt >> 7) & 0xF; i2 = 16 * level1[o1] + o2; if (level23[i2] == 0xFF) { level23[i2] = (char)count3++; } o3 = charAt & 0x7F; i3 = 128 * level23[i2] + o3; level23[length2 + i3] = (char)i; } return result; } /** * Lookup a char in the EncodingMap. * * @param c a char * @return an int, -1 for failure */ public int lookup(char c) { int l1 = c >> 11; int l2 = (c >> 7) & 0xF; int l3 = c & 0x7F; int i; if (c == 0) { return 0; } // level 1 i = level1[l1]; if (i == 0xFF) { return -1; } // level 2 i = level23[16 * i + l2]; if (i == 0xFF) { return -1; } // level 3 i = level23[16 * count2 + 128 * i + l3]; if (i == 0) { return -1; } return i; } } }