UDataFileCreator.java example

Explorer
learning-bittorrent-master
package com.limegroup.gnutella.util;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectOutputStream;
import java.io.OutputStreamWriter;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;


/**
 * Builds the data files required by the I18NConvert and I18NData classes.
 * More details on codepoints, categories, etc. can be found at
 * www.unicode.org, esp. UAX#15 and the UCD (Unicode Character Database
 * Documentation).
 *
 * Input files (read from "data/"):
 *   CaseFolding.txt             - from unicode.org
 *   MnKeep.txt                  - created by hand
 *   UnicodeData.txt             - from unicode.org
 *   NormalizationTest-3.2.0.txt - from unicode.org
 *
 * Output files (written to "data/built/"):
 *   nudata.txt, excluded.dat, caseMap.dat, replaceSpace.dat
 */
public class UDataFileCreator {

    private static final String DATA_DIR = "data/";
    private static final String BUILD_DIR = DATA_DIR + "built/";

    //created file names
    private static final String NUDATA = "nudata.txt";
    private static final String EXCLUDED_DAT = "excluded.dat";
    private static final String CASE_MAP = "caseMap.dat";
    private static final String REPLACE_SPACE = "replaceSpace.dat";

    //data file names
    //case mapping
    private static final String CASE_FOLDING = "CaseFolding.txt";
    //characters we don't want to exclude/replace even if they are punctuation
    private static final String MN_KEEP = "MnKeep.txt";
    //source of all codepoints and their categories
    private static final String UNICODE_DATA = "UnicodeData.txt";
    //source of the full decomposition (NFKD)
    private static final String NORMALIZATION_TEST = "NormalizationTest-3.2.0.txt";

    /**
     * variable keeping track of number of excluded codepoints.
     */
    int numEx = 0;

    /**
     * Command line entry point; simply runs {@link #createFile()}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        UDataFileCreator ufc = new UDataFileCreator();
        ufc.createFile();
    }

    /**
     * Reads all the input files, derives the codepoint tables and writes
     * the output files.  An IOException raised by any step is printed to
     * stderr and aborts the remaining steps; the number of excluded
     * codepoints counted so far is printed to stdout in either case.
     */
    public void createFile() {
        java.util.BitSet dontExclude = new java.util.BitSet();
        Map codepoints = new TreeMap(new StringComparator());

        HashMap caseMap = new HashMap();
        java.util.BitSet excludedChars = new java.util.BitSet();
        java.util.BitSet replaceWithSpace = new java.util.BitSet();

        HashMap tempNFKC = new HashMap();

        try {
            readNonExclusion(dontExclude);
            dealWithUnicodeData(codepoints,
                                dontExclude,
                                excludedChars,
                                replaceWithSpace);
            readNTestPopKD(codepoints, tempNFKC);
            readCaseFolding(caseMap);
            replaceCase(codepoints, caseMap, excludedChars);
            writeOutObjects(codepoints,
                            caseMap,
                            excludedChars,
                            replaceWithSpace,
                            tempNFKC);
        }
        catch(IOException e) {
            e.printStackTrace();
        }

        System.out.println("number of excluded code points : " + numEx);
    }

    /**
     * Writes the output files.
     *
     * nudata.txt gets one line "code;deKomp;composition;" for every
     * codepoint whose deKomp is non-empty; composition is looked up in
     * nfkc by the deKomp string and left empty when there is no entry or
     * when it is identical to deKomp.  The two bit sets and the case map
     * are written with Java serialization.
     *
     * @param codepoint map of hex code String to udata
     * @param caseMap case folding map to serialize
     * @param excludedChars bit set of excluded codepoints to serialize
     * @param replaceWithSpace bit set to serialize
     * @param nfkc map consulted for the composition column
     * @throws IOException if any file cannot be written
     */
    private void writeOutObjects(Map codepoint,
                                 Map caseMap,
                                 java.util.BitSet excludedChars,
                                 java.util.BitSet replaceWithSpace,
                                 HashMap nfkc)
        throws IOException {

        BufferedWriter bufo =
            new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(new File(BUILD_DIR + NUDATA))));
        try {
            Iterator iter = codepoint.entrySet().iterator();
            while(iter.hasNext()) {
                Map.Entry entry = (Map.Entry)iter.next();
                String s = (String)entry.getKey();
                udata u = (udata)entry.getValue();

                if(u.deKomp.equals(""))
                    continue;

                String composition = (String)nfkc.get(u.deKomp);
                if(composition == null || composition.equals(u.deKomp))
                    composition = "";
                bufo.write(s + ";");
                bufo.write(u.deKomp + ";" + composition + ";\n");
            }
            bufo.flush();
        }
        finally {
            //always release the file handle, even if a write failed
            bufo.close();
        }

        writeObjectFile(BUILD_DIR + EXCLUDED_DAT, excludedChars);
        writeObjectFile(BUILD_DIR + CASE_MAP, caseMap);
        writeObjectFile(BUILD_DIR + REPLACE_SPACE, replaceWithSpace);
    }

    /**
     * Serializes a single object to the given file, making sure that the
     * stream is flushed and the file is closed.
     *
     * @param path file to (over)write
     * @param data object to serialize
     * @throws IOException if the file cannot be written
     */
    private void writeObjectFile(String path, Object data)
        throws IOException {
        FileOutputStream fo = new FileOutputStream(new File(path));
        try {
            ObjectOutputStream oo = new ObjectOutputStream(fo);
            oo.writeObject(data);
            //push anything still buffered by the object stream to the file
            oo.flush();
        }
        finally {
            fo.close();
        }
    }

    /**
     * Reads the file that lists code points that are to be kept even
     * though they might belong to a category that should be excluded
     * (ie. dakuon, etc).  Each line holds a hex codepoint or an inclusive
     * range (ie. 3099..309A), optionally followed by ';' and more text.
     * Blank lines and lines starting with '#' are skipped.
     *
     * @param ex bit set that receives a bit for every listed codepoint
     * @throws IOException if the file cannot be read
     */
    private void readNonExclusion(java.util.BitSet ex)
        throws IOException {
        //most Mn will be excluded but few will be kept like voiced marks
        //this list may change
        BufferedReader buf = getBR(DATA_DIR+MN_KEEP);
        try {
            String line;
            while((line = buf.readLine()) != null) {
                String trimmed = line.trim();
                if(trimmed.length() == 0 || trimmed.charAt(0) == '#')
                    continue;

                //everything before the first ';' (or the whole line if
                //there is none) is the codepoint field
                int dI = line.indexOf(';');
                String codepoint =
                    (dI < 0 ? line : line.substring(0, dI)).trim();

                //if the listed codepoint represents a range (ie. 3099..309A)
                if(codepoint.indexOf("..") > -1) {
                    String[] s = StringUtils.split(codepoint, "..");
                    int first = Integer.parseInt(s[0], 16);
                    int last = Integer.parseInt(s[1], 16);
                    //a reversed range sets nothing instead of running
                    //off the low end of the bit set
                    for(int c = first; c <= last; c++)
                        ex.set(c);
                }
                else
                    ex.set(Integer.parseInt(codepoint, 16));
            }
        }
        finally {
            buf.close();
        }
    }

    /**
     * Reads the unicode data file to get a list of all the codepoints
     * and to determine the category of these codepoints.  Reading stops
     * at the end of the file or as soon as processLine returns false.
     *
     * @param cp map that receives the codepoints that are kept
     * @param dontExclude codepoints that must never be excluded
     * @param excluded bit set that receives the excluded codepoints
     * @param replaceWithSpace bit set handed through to processLine
     * @throws IOException if the file cannot be read
     */
    private void dealWithUnicodeData(Map cp,
                                     java.util.BitSet dontExclude,
                                     java.util.BitSet excluded,
                                     java.util.BitSet replaceWithSpace)
        throws IOException {

        BufferedReader buf = getBR(DATA_DIR+UNICODE_DATA);
        try {
            String line;
            while((line = buf.readLine()) != null) {
                if(!processLine(cp, dontExclude, line,
                                excluded, replaceWithSpace))
                    break;
            }
        }
        finally {
            buf.close();
        }
    }

    /**
     * Classifies one line of the unicode data file.  The line is split
     * on ';'; field 0 is the hex codepoint, field 2 the category and
     * field 3 the combining class.
     *
     * @return false once the line for codepoint FFEE is reached (nothing
     *         at or after that line is processed), true otherwise
     */
    private boolean processLine(Map cp,
                                java.util.BitSet dontExclude,
                                String line,
                                java.util.BitSet excluded,
                                java.util.BitSet replaceWithSpace) {
        String[] parts = StringUtils.splitNoCoalesce(line, ";");
        //skip blank or malformed lines instead of failing on a
        //missing field
        if(parts.length < 4 || parts[2].length() == 0)
            return true;
        if(parts[0].equals("FFEE"))
            return false;

        if(parts[2].charAt(0) == 'P' || parts[2].equals("Zs")) {
            //punctuation should mostly be replaced with \u0020 (space),
            //except for apostrophes which are excluded outright
            if(excludedPClass(parts[0])) {
                numEx++;
                excluded.set(Integer.parseInt(parts[0], 16));
            }
            else if(!isExcluded(parts, dontExclude)) {
                addCodepoint(cp, parts);
            }
            else {
                //NOTE(review): isExcluded returns false for every 'P'
                //category and for "Zs", so as written this branch is
                //never taken and replaceWithSpace stays empty - confirm
                //whether that is intended.
                replaceWithSpace.set(Integer.parseInt(parts[0], 16));
                addCodepoint(cp, parts);
            }
        }
        else if(isExcluded(parts, dontExclude)) {
            //put the codepoint into the excluded list
            numEx++;
            excluded.set(Integer.parseInt(parts[0], 16));
        }
        else {
            //not excluded
            addCodepoint(cp, parts);
        }
        return true;
    }

    /**
     * Stores a new udata (category and combining class taken from the
     * split unicode data line) in the cp map under the hex codepoint.
     */
    private void addCodepoint(Map cp, String[] parts) {
        udata u = new udata();
        u.cat = parts[2];
        u.CC = parts[3];
        cp.put(parts[0], u);
    }

    /**
     * Checks if this punctuation codepoint should be excluded rather
     * than kept.  Currently only 0027 (apostrophe) is.
     */
    private boolean excludedPClass(String codepoint) {
        //TODO: may need to add to this list of one
        //for all the different languages
        return codepoint.equals("0027");
    }

    /**
     * Checks to see if the code point described by the split unicode
     * data line p is excluded.
     *
     * @param p split line: p[0] hex codepoint, p[2] category,
     *          p[3] combining class
     * @param ex codepoints that must never be excluded
     * @return false if the codepoint is set in ex, belongs to one of the
     *         kept categories, or is an Mn with combining class 0;
     *         true otherwise
     */
    private boolean isExcluded(String[] p, java.util.BitSet ex) {
        String cat = p[2];
        String cc = p[3];

        if(ex.get(Integer.parseInt(p[0].trim(), 16)))
            return false;

        boolean keptCategory =
            cat.equals("Lu") ||
            cat.equals("Ll") ||
            cat.equals("Lt") ||
            cat.equals("Lo") ||
            cat.equals("Lm") ||
            cat.equals("Nd") ||
            cat.equals("Mc") ||
            cat.equals("Cs") ||
            cat.equals("Co") ||
            cat.equals("Zs") ||
            cat.equals("So") ||
            cat.charAt(0) == 'P';
        if(keptCategory)
            return false;

        //don't exclude Mn category which has a combining class of 0
        if(cat.equals("Mn") && cc.equals("0"))
            return false;

        return true;
    }

    /**
     * Reads the case folding file to find the case mappings.  Only
     * entries with status C (common) or F (full) are used.  Both key and
     * value are stored as the actual characters, not as hex codes.
     * Comment lines, blank lines and lines with fewer than three fields
     * are skipped.
     *
     * @param c map that receives the mappings
     * @throws IOException if the file cannot be read
     */
    private void readCaseFolding(Map c)
        throws IOException {
        BufferedReader buf = getBR(DATA_DIR+CASE_FOLDING);
        try {
            String line;
            while((line = buf.readLine()) != null) {
                if(line.length() == 0 || line.charAt(0) == '#')
                    continue;

                //strip the trailing comment, if there is one
                int index = line.indexOf('#');
                if(index > -1)
                    line = line.substring(0, index);
                line = line.trim();

                String[] splitUp = StringUtils.split(line, ";");
                if(splitUp.length < 3)
                    continue;

                String status = splitUp[1].trim();
                //C - common case folding, F - full case folding
                if(status.equals("C") || status.equals("F")) {
                    c.put(code2char(splitUp[0].trim()),
                          code2char(splitUp[2].trim()));
                }
            }
        }
        finally {
            buf.close();
        }
    }

    /**
     * converts the hex representation of a String to a String
     * ie. 0020 -> " "
     *     0061 0062 -> "ab"
     * Code points above FFFF are encoded as a surrogate pair so that
     * they cannot collide with the BMP character that shares their low
     * 16 bits.
     * @param s String to convert
     * @return converted s
     */
    private String code2char(String s) {
        StringBuffer b = new StringBuffer();

        if(s.indexOf(" ") > -1) {
            String[] splitup = StringUtils.split(s, " ");
            for(int i = 0; i < splitup.length; i++)
                appendCodePoint(b, Integer.parseInt(splitup[i], 16));
        }
        else
            appendCodePoint(b, Integer.parseInt(s, 16));

        return b.toString();
    }

    /**
     * Appends the UTF-16 form of cp to b: a single char for values up
     * to FFFF, a high/low surrogate pair for anything larger.
     */
    private static void appendCodePoint(StringBuffer b, int cp) {
        if(cp > 0xFFFF) {
            int offset = cp - 0x10000;
            b.append((char)(0xD800 + (offset >> 10)));
            b.append((char)(0xDC00 + (offset & 0x3FF)));
        }
        else
            b.append((char)cp);
    }

    /**
     * reverse of code2char
     * converts from String to hex rep
     * ie. "ab" -> 0061 0062
     * Every code is left-padded with zeros to at least four digits (hex
     * letters are lower case); a surrogate pair is reported as the single
     * code point it encodes.
     * @param s String to convert, may be null
     * @return the space separated hex codes, or null if s is null
     */
    private String char2code(String s) {
        if(s == null) return s;
        StringBuffer b = new StringBuffer();
        for(int i = 0, n = s.length(); i < n; i++) {
            int cp = s.charAt(i);
            //fold a high/low surrogate pair back into one code point
            if(cp >= 0xD800 && cp <= 0xDBFF && i + 1 < n) {
                char low = s.charAt(i + 1);
                if(low >= 0xDC00 && low <= 0xDFFF) {
                    cp = 0x10000 + ((cp - 0xD800) << 10) + (low - 0xDC00);
                    i++;
                }
            }

            String hex = Integer.toString(cp, 16);
            for(int pad = hex.length(); pad < 4; pad++)
                b.append('0');
            b.append(hex);
            b.append(" ");
        }

        return b.toString().trim();
    }

    /**
     * Runs thru the codepoints and replaces case, or replaces with space
     * 0020, where necessary:
     *  - punctuation and Zs codepoints get deKomp "0020";
     *  - codepoints without a deKomp get their case folding (if any);
     *  - otherwise every element of the deKomp that is not excluded is
     *    replaced by its case folding, by 0020 if it is punctuation, or
     *    kept as is; excluded elements are dropped.
     *
     * @param codepoint map of hex code String to udata, updated in place
     * @param casF case folding map (character String to character String)
     * @param ex bit set of excluded codepoints
     */
    private void replaceCase(Map codepoint, Map casF, java.util.BitSet ex) {
        //this could probably be done in the write out process?
        Iterator iter = codepoint.entrySet().iterator();
        while(iter.hasNext()) {
            Map.Entry entry = (Map.Entry)iter.next();
            String code = (String)entry.getKey();
            udata u = (udata)entry.getValue();

            if(u.cat.indexOf("P") > -1 || u.cat.equals("Zs")) {
                //replace all punctuation and space (cat: Zs)
                //with 0020 (ascii space)
                u.deKomp = "0020";
                continue;
            }

            if(u.deKomp.equals("")) {
                String folded = (String)casF.get(code2char(code));
                if(folded != null)
                    u.deKomp = char2code(folded);
                continue;
            }

            StringBuffer dek = new StringBuffer();
            String[] splitUp = StringUtils.split(u.deKomp, " ");
            for(int i = 0; i < splitUp.length; i++) {
                //elements that are excluded are removed from the deKomp
                if(ex.get(Integer.parseInt(splitUp[i], 16)))
                    continue;

                String up =
                    char2code((String)casF.get(code2char(splitUp[i])));
                if(up == null) {
                    //no case folding: punctuation turns into a space,
                    //anything else is kept unchanged
                    udata ud = (udata)codepoint.get(splitUp[i]);
                    String cat = (ud == null) ? "" : ud.cat;
                    if(cat.indexOf("P") > -1)
                        up = "0020";
                    else
                        up = splitUp[i];
                }
                dek.append(up + " ");
            }
            u.deKomp = dek.toString().trim();
        }
    }

    /**
     * Reads through the NormalizationTest file provided by unicode.org
     * to get the full decomposition of codepoints.  UnicodeData file
     * doesn't provide the full decomposition but its used to get
     * the codepoints and combining classes... see NormalizationTest
     * file for more info.
     *
     * Lines of the section whose '@' header contains "Part0" are skipped
     * and reading stops at the header containing "Part2".  For every
     * other data line, field 4 becomes the deKomp of the codepoint in
     * field 0 (if that codepoint is in c) and kc gets the mapping
     * field 4 -> field 3.  Blank lines and lines with fewer than five
     * fields are skipped.
     *
     * @param c codepoints that weren't excluded
     * @param kc map that receives the field 4 -> field 3 entries
     * @throws IOException if the file cannot be read
     */
    private void readNTestPopKD(Map c, Map kc)
        throws IOException {
        BufferedReader buf = getBR(DATA_DIR+NORMALIZATION_TEST);
        try {
            String line;
            boolean skip = false;

            while((line = buf.readLine()) != null) {
                if(line.length() == 0)
                    continue;

                char first = line.charAt(0);
                if(first == '#')
                    continue;

                if(first == '@') {
                    //section header
                    if(line.indexOf("Part2") > -1)
                        break;
                    skip = line.indexOf("Part0") > -1;
                    continue;
                }

                if(skip)
                    continue;

                //strip the trailing comment, if there is one
                int hash = line.indexOf('#');
                if(hash > -1)
                    line = line.substring(0, hash);

                String[] parts = StringUtils.split(line.trim(), ";");
                if(parts.length < 5)
                    continue;

                String kd = parts[4].trim();
                udata u = (udata)c.get(parts[0].trim());
                if(u != null)
                    u.deKomp = kd;
                //create a KC mapping to be used to
                //build final data...
                kc.put(kd, parts[3].trim());
            }
        }
        finally {
            buf.close();
        }
    }

    /**
     * Opens the named file for buffered reading using the platform
     * default charset.  The caller is responsible for closing the reader.
     *
     * @param filename path of the file to open
     * @return a reader positioned at the start of the file
     * @throws IOException if the file cannot be opened
     */
    private BufferedReader getBR(String filename)
        throws IOException {

        FileInputStream fi =
            new FileInputStream(new File(filename));
        return new BufferedReader(new InputStreamReader(fi));

    }

    //just a datawrapper to be used during the building of the files;
    //static because it never touches the enclosing instance
    private static class udata {
        /**
         * category of the codepoint (field 2 of the unicode data line)
         */
        public String cat;
        /**
         * combining class of the codepoint (field 3 of the unicode
         * data line)
         */
        public String CC;
        /**
         * space separated hex codes this codepoint is replaced with,
         * empty if there is no replacement
         */
        public String deKomp = "";
    }

}