package com.limegroup.gnutella.util;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.util.HashMap;
import java.util.Map;
import com.ibm.icu.text.Normalizer;
final class I18NConvertICU extends AbstractI18NConverter {

    /** Excluded code points (like accents); these are dropped during conversion. */
    private final java.util.BitSet _excluded;

    /** Code points to be replaced by an ASCII space (like commas, etc). */
    private final java.util.BitSet _replaceWithSpace;

    /**
     * Case map: keys are single-character Strings, values are the String to
     * substitute for that character (see the lookup in {@link #convert}).
     */
    private final Map _cMap;

    /**
     * Initializer.
     * This subclass of AbstractI18NConverter uses icu4j's packages to
     * normalize Strings.
     * _excluded and _replaceWithSpace (BitSet) and the case map are read in
     * from the serialized resources "excluded.dat", "replaceSpace.dat" and
     * "caseMap.dat" (per the original notes, files created by
     * UDataFileCreator). They are used to remove accents, etc. and to
     * replace certain code points with ascii space (\u0020).
     *
     * @throws IOException if a resource cannot be opened, read or closed
     * @throws ClassNotFoundException if a serialized class cannot be resolved
     */
    I18NConvertICU()
        throws IOException, ClassNotFoundException {
        // Read order is kept as before: exclusion set, case map, space set.
        _excluded = (java.util.BitSet) readResource("excluded.dat", false);
        _cMap = (HashMap) readResource("caseMap.dat", true);
        _replaceWithSpace = (java.util.BitSet) readResource("replaceSpace.dat", false);
    }

    /**
     * Deserializes the single object stored in the named resource, making
     * sure the underlying stream is released even when reading fails.
     *
     * @param name resource name handed to CommonUtils.getResourceStream
     * @param useConverterStream true to read through a
     *  ConverterObjectInputStream instead of a plain ObjectInputStream
     * @return the object read from the resource
     * @throws IOException if the resource cannot be opened, read or closed
     * @throws ClassNotFoundException if the serialized class cannot be resolved
     */
    private static Object readResource(String name, boolean useConverterStream)
        throws IOException, ClassNotFoundException {
        // Tracks whichever stream still has to be closed on the failure path.
        InputStream pending = CommonUtils.getResourceStream(name);
        try {
            InputStream buffered = new BufferedInputStream(pending);
            ObjectInputStream ois;
            if (useConverterStream) {
                ois = new ConverterObjectInputStream(buffered);
            } else {
                ois = new ObjectInputStream(buffered);
            }
            pending = ois;
            Object result = ois.readObject();
            // Success: close normally so a failing close() is still reported,
            // exactly as it was before.
            pending = null;
            ois.close();
            return result;
        } finally {
            if (pending != null) {
                try {
                    pending.close();
                } catch (IOException ignored) {
                    // Already failing with a more informative exception;
                    // do not let the cleanup error mask it.
                }
            }
        }
    }

    /**
     * Return the converted form of the string s.
     * This method will also split s into the different unicode blocks.
     *
     * @param s String to be converted
     * @return the converted string
     */
    public String getNorm(String s) {
        return convert(s);
    }

    /**
     * Simple composition of a String (canonical composition via icu4j's
     * Normalizer, without compatibility mappings).
     *
     * @param s String to compose
     * @return the composed String
     */
    public String compose(String s) {
        return Normalizer.compose(s, false);
    }

    /**
     * Convert the string into NFKC + removal of accents, symbols, etc.
     * Uses icu4j's Normalizer to first decompose to NFKD form, then
     * replaces every char flagged in _replaceWithSpace with a space, drops
     * every char flagged in _excluded, and maps the remaining chars through
     * the case map. Finally composes to NFC and hands the result to
     * blockSplit (defined outside this class), which per the original notes
     * adds spaces '\u0020' between different unicode blocks.
     *
     * @param s String to convert
     * @return converted String
     */
    private String convert(String s) {
        // decompose to NFKD
        String nfkd = Normalizer.decompose(s, true);
        int len = nfkd.length();
        StringBuffer buf = new StringBuffer(len);

        // Walk the decomposed string one char at a time: replace, drop,
        // or case-map each char.
        for (int i = 0; i < len; i++) {
            char c = nfkd.charAt(i);
            if (_replaceWithSpace.get(c)) {
                buf.append(' ');
            } else if (!_excluded.get(c)) {
                String lower = (String) _cMap.get(String.valueOf(c));
                if (lower != null) {
                    buf.append(lower);
                } else {
                    // no mapping for this char: keep it unchanged
                    buf.append(c);
                }
            }
        }

        // compose to nfc and split
        return blockSplit(Normalizer.compose(buf.toString(), false));
    }
}