/*
* Encodings.java - provide UTF8 and latin1 encodings and decodings.
*
* Copyright (c) 2005-2007 Andrew Krizhanovsky /aka at mail.iias.spb.su/
* Distributed under GNU Public License.
*/
package wikokit.base.wikipedia.language;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
//import java.nio.charset.CharsetDecoder;
//import java.nio.charset.CharsetEncoder;
//import java.nio.charset.CharacterCodingException;
import java.io.UnsupportedEncodingException;
import java.util.Map;
public class Encodings {
private final static String EMPTY_STRING = "";
/** the database encoding, e.g.: ISO8859_1, Cp1251, UTF8 */
private String enc_db;
/** internal encoding, e.g.: UTF8, ISO8859_1, Cp1251 */
private String enc_int;
/** default internal encoding */
public static final String enc_int_default = "UTF8";
/** user encoding, e.g.: Cp1251, UTF8, ISO8859_1 */
private String enc_ui;
/** java source code encoding (it is used in junit tests), e.g.: Cp1251, UTF8, ISO8859_1 */
private String enc_java;
public static final String enc_java_default = "UTF8"; // Debian
//public static final String enc_java_default = "ISO8859_1"; // Debian
//public static final String enc_java_default = "Cp1251"; // Mandriva
public Encodings() {
enc_db = "ISO8859_1"; // Debian
//enc_db = "Cp1251"; // Mandriva
enc_int = enc_int_default;
//enc_ui = "UTF8";
enc_ui = "Cp1251"; // Mandriva ?
enc_java = enc_java_default;
}
/** Define the way of characters conversion via setting encoding of the database and encoding at the user side.
* @param database_encoding encoding of the database, e.g.: ISO8859_1 (default), Cp1251, UTF8
* @param internal_encoding internal encoding, e.g.: UTF8, ISO8859_1, Cp1251
* @param user_interface_encoding encoding at the user's side (user interface), e.g.: Cp1251 (default), ISO8859_1, UTF8
*/
public void SetEncodings (String database_encoding, String internal_encoding, String user_interface_encoding) {
enc_db = database_encoding;
enc_int = internal_encoding;
enc_ui = user_interface_encoding;
}
public void SetEncodingJavaSourceCode (String e) {
enc_java = e;
}
/** Define the way of characters conversion via setting encoding of the database and encoding at the user side.
* @param user_interface_encoding_source encoding at the user's side (user interface), e.g.: Cp1251 (default), ISO8859_1, UTF8
* @param database_encoding_dest encoding of the database, e.g.: ISO8859_1 (default), Cp1251, UTF8
*/ /*
public void SetEncodingUserToDB (String user_interface_encoding_source, String database_encoding_dest) {
ui_source = user_interface_encoding_source;
db_dest = database_encoding_dest;
}*/
/** Gets encoding of the database */
public String GetDBEnc (){ return enc_db; }
/** Gets internal encoding */
public String GetInternalEnc (){ return enc_int; };
/** Gets encoding at the user's side (user interface) */
public String GetUserEnc (){ return enc_ui; };
/** Gets java sources encoding */
public String GetJavaEnc (){ return enc_java; };
/** Convert string from database to internal encoding */
public String EncodeFromDB(String text) {
return FromTo(text, enc_db, enc_int);
}
/** Convert string from internal encoding to database */
public String EncodeToDB(String text) {
return FromTo(text, enc_int, enc_db);
}
/** Convert string from user to internal encoding */
public String EncodeFromUser(String text) {
return FromTo(text, enc_ui, enc_int);
}
/** Convert string from internal to user encoding */
public String EncodeToUser(String text) {
return FromTo(text, enc_int, enc_ui);
}
/** Convert string from Java sources to internal encoding */
public String EncodeFromJava(String text) {
return FromTo(text, enc_java, enc_int);
}
// Static functions
public static String bytesToUTF8(byte[] bytes) {
return bytesTo(bytes, "UTF8");
}
//FromTo("text", "UTF8", "ISO8859_1");
public static String FromTo(String text, String encode_from, String encode_to) {
try {
if(null == text || 0 == text.length()) { return EMPTY_STRING;
}
byte[] b = text.getBytes(encode_from);
return new String(b, encode_to);
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
return EMPTY_STRING;
}
}
// choose an encoding
/*private static final Charset latin1 = Charset.forName( "ISO-8859-1" );
private static final CharsetDecoder latin1_decoder = latin1.newDecoder(); // for byte to char
private static final CharsetEncoder latin1_encoder = latin1.newEncoder(); // for char to byte
private static final Charset cp1251 = Charset.forName( "Cp1251" );
private static final CharsetDecoder cp1251_decoder = latin1.newDecoder();
private static final CharsetEncoder cp1251_encoder = latin1.newEncoder();
private static final Charset utf8 = Charset.forName( "UTF8" );
private static final CharsetDecoder utf8_decoder = latin1.newDecoder();
private static final CharsetEncoder utf8_encoder = latin1.newEncoder();
*/
// effectively convert byte[] to char[] after a read
//CharBuffer charBuffer = decoder.decode( byteBuffer );
// effectively convert char[] to byte[] before a write
//ByteBuffer byteBuffer = encoder.encode( charBuffer );
private static int len = 1024;
private static CharBuffer cb = CharBuffer.allocate(len);
private static ByteBuffer bb = ByteBuffer.allocate(len);
//public static StringBuffer sb = new StringBuffer (1024);
//public static String FromToFast(String text, String encode_from, String encode_to)
/*
public static String FromToFast(String text, EncodingType et_from, EncodingType et_to)
{
try {
if(null == text) { return EMPTY_STRING;
}
if (text.length() >= len) {
throw new IndexOutOfBoundsException(
"invalid index");
}*/
/*
CharsetEncoder e = null;
if(encode_from.equals("ISO8859_1")) {
e = latin1_encoder;
} else if(encode_from.equals("Cp1251")) {
e = cp1251_encoder;
} else if(encode_from.equals("UTF8")) {
e = utf8_encoder;
}
CharsetDecoder d = null;
if(encode_to.equals("ISO8859_1")) {
d = latin1_decoder;
} else if(encode_to.equals("Cp1251")) {
d = cp1251_decoder;
} else if(encode_to.equals("UTF8")) {
d = utf8_decoder;
}*/
/*
//byte[] b = text.getBytes(encode_from);
bb.put( et_from.getEncoder().encode( cb.put(text) ) );
cb.rewind();
cb.put( et_to.getDecoder().decode( bb ) );
bb.rewind();
//return new String(b, encode_to);
return cb.toString();
//} catch (UnsupportedEncodingException e) {
} catch (CharacterCodingException e) {
e.printStackTrace();
return EMPTY_STRING;
}
}*/
public static String bytesTo(byte[] bytes, String encode) {
try {
return new String(bytes, encode);
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
return EMPTY_STRING;
}
}
public static String UTF8ToLatin1(String str) {
try {
if(null == str) { return EMPTY_STRING;
}
byte[] bytes = str.getBytes();
return new String(bytes, "ISO8859_1");
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
return EMPTY_STRING;
}
}
public static String UTF8ToCp1251(String str) {
try {
if(null == str) { return EMPTY_STRING;
}
byte[] bytes = str.getBytes();
return new String(bytes, "Cp1251");
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
return EMPTY_STRING;
}
}
public static String Latin1ToUTF8(String str) {
try {
if(null == str) { return EMPTY_STRING;
}
byte[] bytes = str.getBytes();
return new String(bytes, "UTF8");
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
return EMPTY_STRING;
}
}
/** Prints available encodings to stdout */
public static void printEncodings() {
Map availcs = Charset.availableCharsets();
for (Object o:availcs.keySet()) {
System.out.println(o);
}
}
/** Gets available encodings */
public static Map getEncodings() {
return Charset.availableCharsets();
}
}