I18NConvertICU.java example

Explorer
learning-bittorrent-master
package com.limegroup.gnutella.util;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.util.HashMap;
import java.util.Map;

import com.ibm.icu.text.Normalizer;

/**
 * AbstractI18NConverter implementation that normalizes strings with
 * icu4j's Normalizer.
 *
 * Three serialized lookup tables are loaded once at construction time:
 * a BitSet of excluded code points, a BitSet of code points that are
 * replaced by an ASCII space, and a Map used to lower-case single
 * characters.
 */
final class I18NConvertICU extends AbstractI18NConverter {

    /** excluded codepoints (like accents) */
    private final java.util.BitSet _excluded;
    /** certain chars to be replaced by space (like commas, etc) */
    private final java.util.BitSet _replaceWithSpace;
    /** maps a one-character String to its replacement String (the case map) */
    private final Map _cMap;

    /**
     * initializer:
     * this subclass of AbstractI18NConverter uses the icu4j's
     * packages to normalize Strings.
     * _excluded and _replaceWithSpace (BitSet) are read in from
     * files created by UDataFileCreator and are used to
     * remove accents, etc. and replace certain code points with
     * ascii space (\u0020)
     *
     * @throws IOException if a data resource is missing or cannot be read
     * @throws ClassNotFoundException if a serialized class cannot be resolved
     */
    I18NConvertICU()
        throws IOException, ClassNotFoundException {
        //read in the exclusion bitset
        _excluded = (java.util.BitSet)readResource("excluded.dat", false);
        //read in the case map (goes through ConverterObjectInputStream,
        //exactly as before)
        _cMap = (Map)readResource("caseMap.dat", true);
        //read in the replace-with-space bitset
        _replaceWithSpace =
            (java.util.BitSet)readResource("replaceSpace.dat", false);
    }

    /**
     * Deserializes the single object stored in the named resource and
     * always releases the underlying stream, even when opening the
     * object stream or reading the object fails.
     *
     * @param name resource name handed to CommonUtils.getResourceStream
     * @param useConverter true to read through ConverterObjectInputStream,
     *        false to read through a plain ObjectInputStream
     * @return the deserialized object
     * @throws IOException if the resource is missing or unreadable
     * @throws ClassNotFoundException if a serialized class cannot be resolved
     */
    private static Object readResource(String name, boolean useConverter)
        throws IOException, ClassNotFoundException {
        InputStream in = CommonUtils.getResourceStream(name);
        // NOTE(review): getResourceStream is defined elsewhere; guard in
        // case it signals a missing resource by returning null, so the
        // failure names the resource instead of surfacing later.
        if(in == null)
            throw new IOException("resource not found: " + name);

        try {
            InputStream buffered = new BufferedInputStream(in);
            ObjectInputStream ois;
            if(useConverter)
                ois = new ConverterObjectInputStream(buffered);
            else
                ois = new ObjectInputStream(buffered);
            return ois.readObject();
        } finally {
            // Closing the raw stream releases the resource on every path,
            // including a failure inside the ObjectInputStream constructor.
            try {
                in.close();
            } catch(IOException ignored) {
                // A failed close on a read-only stream must not mask the
                // result or a more relevant exception already propagating.
            }
        }
    }

    /**
     * Return the converted form of the string s
     * this method will also split the s into the different
     * unicode blocks
     * @param s String to be converted
     * @return the converted string
     */
    public String getNorm(String s) {
        return convert(s);
    }

    /**
     * Simple composition of a String
     * (Normalizer.compose with compat == false).
     *
     * @param s String to compose
     * @return the composed String
     */
    public String compose(String s) {
        return Normalizer.compose(s, false);
    }

    /**
     * convert the string into NFKC + removal of accents, symbols, etc.
     * uses icu4j's Normalizer to first decompose to NFKD form,
     * then replaces code points in the replace-with-space BitSet by
     * '\u0020', removes all codepoints in the exclusion BitSet and
     * maps the remaining chars through the case map.
     * finally composes to NFC and hands the result to blockSplit
     * (defined outside this class; per the original notes it adds
     * spaces '\u0020' between different unicode blocks)
     *
     * @param s String to convert
     * @return converted String
     */
    private String convert(String s) {
        //decompose to NFKD
        String nfkd = Normalizer.decompose(s, true);
        int len = nfkd.length();
        StringBuffer buf = new StringBuffer(len);

        //loop through the string and check for excluded chars
        //and lower case if necessary
        for(int i = 0; i < len; i++) {
            char c = nfkd.charAt(i);
            if(_replaceWithSpace.get(c)) {
                //replace-with-space takes precedence over exclusion
                buf.append(' ');
            }
            else if(!_excluded.get(c)) {
                String lower = (String)_cMap.get(String.valueOf(c));
                if(lower != null)
                    buf.append(lower);
                else
                    buf.append(c);
            }
        }

        //compose to nfc and split
        return blockSplit(Normalizer.compose(buf.toString(), false));
    }

}