/* * Copyright (c) 2005-2011 Grameen Foundation USA * All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. * * See also http://www.apache.org/licenses/LICENSE-2.0.html for an * explanation of the license and how it is applied. */ package org.mifos.framework.util; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.PushbackInputStream; import java.io.UnsupportedEncodingException; import java.io.Writer; /** * Original public domain code from <a href= * "http://tripoverit.blogspot.com/2007/04/javas-utf-8-and-unicode-writing-is.html" * >Trip over IT</a>. */ public class UnicodeUtil { /** * Convert data to given character set. * * @return decoded bytes. Includes BOM unless "ASCII" is used for * desiredOutputEncoding. */ public static byte[] convert(final byte[] bytes, final String desiredOutputEncoding) throws IOException { // Workaround for bug that will not be fixed by SUN // http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4508058 UnicodeInputStream uis = new UnicodeInputStream(new ByteArrayInputStream(bytes), "ASCII"); boolean unicodeOutputReqd = (getBOM(desiredOutputEncoding).equals("")) ? false : true; String enc = uis.getEncoding(); String BOM = getBOM(enc); // get the BOM of the inputstream if ("".equals(BOM)) { // inputstream looks like ascii... // create a BOM based on the outputstream BOM = getBOM(desiredOutputEncoding); } uis.close(); ByteArrayOutputStream out = new ByteArrayOutputStream(); BufferedReader br = new BufferedReader(new InputStreamReader(new ByteArrayInputStream(bytes, uis.getBomOffset(), bytes.length), enc)); Writer w = new BufferedWriter(new OutputStreamWriter(out, desiredOutputEncoding)); // dont write a BOM for ascii(out) as the OutputStreamWriter // will not process it correctly. if (!"".equals(BOM) && unicodeOutputReqd) { w.write(BOM); } char[] buffer = new char[4096]; int len; while (true) { len = br.read(buffer); if (len == -1) { break; } w.write(buffer, 0, len); } br.close(); // Close the input. w.close(); // Flush and close output. return out.toByteArray(); } public static String getBOM(final String enc) throws UnsupportedEncodingException { String result; if ("US-ASCII".equals(enc)) { // no bom required for ASCII result = ""; } else if ("UTF-8".equals(enc)) { byte[] bom = new byte[3]; bom[0] = (byte) 0xEF; bom[1] = (byte) 0xBB; bom[2] = (byte) 0xBF; result = new String(bom, enc); } else if ("UTF-16BE".equals(enc)) { byte[] bom = new byte[2]; bom[0] = (byte) 0xFE; bom[1] = (byte) 0xFF; result = new String(bom, enc); } else if ("UTF-16LE".equals(enc)) { byte[] bom = new byte[2]; bom[0] = (byte) 0xFF; bom[1] = (byte) 0xFE; result = new String(bom, enc); } else if ("UTF-32BE".equals(enc)) { byte[] bom = new byte[4]; bom[0] = (byte) 0x00; bom[1] = (byte) 0x00; bom[2] = (byte) 0xFE; bom[3] = (byte) 0xFF; result = new String(bom, enc); } else if ("UTF-32LE".equals(enc)) { byte[] bom = new byte[4]; bom[0] = (byte) 0x00; bom[1] = (byte) 0x00; bom[2] = (byte) 0xFF; bom[3] = (byte) 0xFE; result = new String(bom, enc); } else { throw new UnsupportedEncodingException("unknown encoding: " + enc); } return result; } /* justification: lazy. No good reason, just don't want to rewrite this. */ @SuppressWarnings("PMD.CyclomaticComplexity") public static class UnicodeInputStream extends InputStream { private final PushbackInputStream internalIn; private boolean isInited = false; private int bomOffset = -1; private final String fallbackEncoding; private String encoding; public static final int BOM_SIZE = 4; /** * WARNING: {@link #read()} behaves differently after <b>init()</b> is * called! * * @param fallbackEncoding * This encoding will be used if encoding cannot be detected * from byte-order mark. */ public UnicodeInputStream(final InputStream in, final String fallbackEncoding) { super(); internalIn = new PushbackInputStream(in, BOM_SIZE); this.fallbackEncoding = fallbackEncoding; } public String getFallbackEncoding() { return fallbackEncoding; } public String getEncoding() { if (!isInited) { try { init(); } catch (IOException ex) { throw new IllegalStateException("Init method failed.", ex); } } return encoding; } /** * Read-ahead four bytes and check for BOM marks. Extra bytes are unread * back to the stream, only BOM bytes are skipped. */ protected void init() throws IOException { if (isInited) { return; } byte bom[] = new byte[BOM_SIZE]; int n, unread; n = internalIn.read(bom, 0, bom.length); if ((bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00) && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) { encoding = "UTF-32BE"; unread = n - 4; } else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE) && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) { encoding = "UTF-32LE"; unread = n - 4; } else if ((bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB) && (bom[2] == (byte) 0xBF)) { encoding = "UTF-8"; unread = n - 3; } else if ((bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) { encoding = "UTF-16BE"; unread = n - 2; } else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) { encoding = "UTF-16LE"; unread = n - 2; } else { // Unicode BOM mark not found, unread all bytes encoding = fallbackEncoding; unread = n; } bomOffset = BOM_SIZE - unread; if (unread > 0) { internalIn.unread(bom, (n - unread), unread); } isInited = true; } @Override public void close() throws IOException { init(); isInited = true; internalIn.close(); } @Override public int read() throws IOException { init(); isInited = true; return internalIn.read(); } public int getBomOffset() { return bomOffset; } } public static BufferedReader getUnicodeAwareBufferedReader(String file) throws IOException { return getUnicodeAwareBufferedReader(new FileInputStream(file)); } public static BufferedReader getUnicodeAwareBufferedReader(InputStream stream) throws IOException { UnicodeInputStream in = new UnicodeInputStream(stream, System.getProperty("file.encoding")); return new BufferedReader(new InputStreamReader(in, in.getEncoding())); } }