package com.limegroup.gnutella.util;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectOutputStream;
import java.io.OutputStreamWriter;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
/**
* this class is used to create the data files required by
* I18NConvert and I18NData classes...
* more details can be found at www.unicode.org on codepoints, categories, etc.
* esp. UAX#15, UCD (Unicode Character Database Documentation)
*
* necessary files are : CaseFolding.txt, - from unicode.org
* MnKeep.txt, - created
* UnicodeData.txt - from unicode.org
* NormalizationTest-3.2.0.txt - from unicode.org
*/
public class UDataFileCreator {
private static final String DATA_DIR = "data/";
private static final String BUILD_DIR = DATA_DIR + "built/";
//created file names
private static final String NUDATA = "nudata.txt";
private static final String EXCLUDED_DAT = "excluded.dat";
private static final String CASE_MAP = "caseMap.dat";
private static final String REPLACE_SPACE = "replaceSpace.dat";
//data file names
//case mapping
private static final String CASE_FOLDING = "CaseFolding.txt";
//characters we don't want to exclude/replace even if they are punctuation
private static final String MN_KEEP = "MnKeep.txt";
//get all codepoints and categories
private static final String UNICODE_DATA = "UnicodeData.txt";
//get full deKomposition (nfkd)
private static final String NORMALIZATION_TEST = "NormalizationTest-3.2.0.txt";
/**
 * Command-line entry point: kicks off the generation of all data
 * files used by I18NConvert / I18NData.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    new UDataFileCreator().createFile();
}
/**
 * Drives the whole build: reads the unicode.org data files, derives
 * the exclusion, case-folding and decomposition tables and writes the
 * resulting files out, printing the number of excluded code points
 * when done.
 */
public void createFile() {
    java.util.BitSet keep = new java.util.BitSet();
    java.util.BitSet excludedChars = new java.util.BitSet();
    java.util.BitSet replaceWithSpace = new java.util.BitSet();
    Map codepoints = new TreeMap(new StringComparator());
    HashMap caseMap = new HashMap();
    HashMap tempNFKC = new HashMap();
    try {
        readNonExclusion(keep);
        dealWithUnicodeData(codepoints, keep,
                            excludedChars, replaceWithSpace);
        readNTestPopKD(codepoints, tempNFKC);
        readCaseFolding(caseMap);
        replaceCase(codepoints, caseMap, excludedChars);
        //all we need now is caseMap and excludedChars
        writeOutObjects(codepoints, caseMap, excludedChars,
                        replaceWithSpace, tempNFKC);
    }
    catch(IOException e) {
        e.printStackTrace();
    }
    System.out.println("number of excluded code points : " + numEx);
}
/**
 * Writes out the built data: the nudata text file (codepoint;
 * decomposition; composition per line) plus the serialized exclusion
 * bit set, case map and replace-with-space bit set.
 * <p>
 * Fixes a resource bug in the original: the three ObjectOutputStreams
 * were never flushed or closed (and the text writer only on the happy
 * path), which could leave the output files truncated.
 *
 * @param codepoint map of hex codepoint String -> udata
 * @param caseMap serialized to CASE_MAP
 * @param excludedChars serialized to EXCLUDED_DAT
 * @param replaceWithSpace serialized to REPLACE_SPACE
 * @param nfkc map of NFKD string -> composed string, used to fill the
 *        composition column of the nudata file
 * @throws IOException if any file cannot be written
 */
private void writeOutObjects(Map codepoint,
                             Map caseMap,
                             java.util.BitSet excludedChars,
                             java.util.BitSet replaceWithSpace,
                             HashMap nfkc)
    throws IOException {
    FileOutputStream fo =
        new FileOutputStream(new File(BUILD_DIR + NUDATA));
    BufferedWriter bufo =
        new BufferedWriter(new OutputStreamWriter(fo));
    try {
        Iterator iter = codepoint.keySet().iterator();
        while(iter.hasNext()) {
            String s = (String)iter.next();
            udata u = (udata)codepoint.get(s);
            if(!u.deKomp.equals("")) {
                bufo.write(s + ";");
                //only record a composition that actually differs from
                //the decomposition; empty string otherwise
                String composition = (String)nfkc.get(u.deKomp);
                composition =
                    composition == null || composition.equals(u.deKomp)
                        ? "" : composition;
                bufo.write(u.deKomp + ";" + composition + ";\n");
            }
        }
        bufo.flush();
    }
    finally {
        bufo.close();
    }
    serializeTo(BUILD_DIR + EXCLUDED_DAT, excludedChars);
    serializeTo(BUILD_DIR + CASE_MAP, caseMap);
    serializeTo(BUILD_DIR + REPLACE_SPACE, replaceWithSpace);
}

/**
 * Serializes a single object to the named file, always flushing and
 * closing the stream.
 */
private void serializeTo(String filename, Object o)
    throws IOException {
    ObjectOutputStream oo =
        new ObjectOutputStream(
            new FileOutputStream(new File(filename)));
    try {
        oo.writeObject(o);
        oo.flush();
    }
    finally {
        oo.close();
    }
}
/**
 * Reads MnKeep.txt and marks in <tt>ex</tt> every code point that is
 * to be kept even though it belongs to a category that would normally
 * be excluded (ie. dakuon and other voiced marks).  Each line names
 * either a single hex code point or an inclusive range written as
 * XXXX..YYYY, terminated by a ';'.
 * <p>
 * The reader is now closed in a finally block so a malformed line can
 * no longer leak the file handle, and the backwards-counting range
 * loop was replaced with an equivalent forward loop.
 *
 * @param ex bit set to populate with the kept code points
 * @throws IOException if the data file cannot be read
 */
private void readNonExclusion(java.util.BitSet ex)
    throws IOException {
    //most Mn will be excluded but a few will be kept, like voiced marks
    //this list may change
    BufferedReader buf = getBR(DATA_DIR + MN_KEEP);
    try {
        String line;
        while((line = buf.readLine()) != null) {
            int dI = line.indexOf(';');
            String codepoint = line.substring(0, dI).trim();
            //if the listed codepoint represents a range (ie. 3099..309A)
            int dots = codepoint.indexOf("..");
            if(dots > -1) {
                int start = Integer.parseInt(codepoint.substring(0, dots), 16);
                int end = Integer.parseInt(codepoint.substring(dots + 2), 16);
                //range is inclusive on both ends
                for(int i = start; i <= end; i++)
                    ex.set(i);
            }
            else {
                ex.set(Integer.parseInt(codepoint, 16));
            }
        }
    }
    finally {
        buf.close();
    }
}
/**
 * Reads UnicodeData.txt to get the list of all codepoints and their
 * categories, delegating each line to processLine() which sorts them
 * into the kept map, the excluded set or the replace-with-space set.
 * <p>
 * The reader is now closed in a finally block, and the loop condition
 * checks the stop flag before reading so no extra line is consumed
 * after processLine() signals the end.
 *
 * @param cp map populated with hex codepoint String -> udata
 * @param dontExclude code points that must never be excluded
 * @param excluded bit set of excluded code points
 * @param replaceWithSpace bit set of code points replaced by a space
 * @throws IOException if the data file cannot be read
 */
private void dealWithUnicodeData(Map cp,
                                 java.util.BitSet dontExclude,
                                 java.util.BitSet excluded,
                                 java.util.BitSet replaceWithSpace)
    throws IOException {
    BufferedReader buf = getBR(DATA_DIR + UNICODE_DATA);
    try {
        //file has codepoints below FFFD
        String line;
        boolean go = true;
        while(go && (line = buf.readLine()) != null)
            go = processLine(cp, dontExclude, line,
                             excluded, replaceWithSpace);
    }
    finally {
        buf.close();
    }
}
/**
 * Number of code points excluded so far; reported to stdout at the
 * end of the build.
 */
int numEx = 0;
/**
 * Processes one line of UnicodeData.txt, routing its code point into
 * the kept map, the excluded bit set or the replace-with-space set.
 *
 * @param cp map of kept codepoints (hex string -> udata)
 * @param dontExclude code points that must never be excluded
 * @param line one raw UnicodeData.txt line (';'-separated fields)
 * @param excluded bit set of excluded code points
 * @param replaceWithSpace bit set of code points to replace with space
 * @return false once the stop marker is reached, true to keep reading
 */
private boolean processLine(Map cp,
java.util.BitSet dontExclude,
String line,
java.util.BitSet excluded,
java.util.BitSet replaceWithSpace) {
//parts[0] = codepoint, parts[2] = general category, parts[3] = combining class
String[] parts = StringUtils.splitNoCoalesce(line, ";");
//NOTE(review): the caller's comment says the file is read up to FFFD,
//but the sentinel here is FFEE -- confirm this is intentional
if(parts[0].equals("FFEE")) return false;
//punctuation categories (P*) and spaces (Zs)
if(parts[2].charAt(0) == 'P' || parts[2].equals("Zs")) {
//puctuations should be mostly be replaces with \u0020 (space).
//except for apostraphes
if(excludedPClass(parts[0])) {
//apostrophe (0027) is excluded outright rather than replaced
numEx++;
excluded.set(Integer.parseInt(parts[0],16));
}
else if(!isExcluded(parts, dontExclude)) {
//not expluded
//put this codepoint into the cp map
udata u = new udata();
//populate the category for the data wrapper
u.cat = parts[2];
u.CC = parts[3];
cp.put(parts[0], u);
}
else {
//NOTE(review): isExcluded() returns false for every P* and Zs
//category, so this branch looks unreachable and replaceWithSpace
//is apparently never populated here -- verify intent
replaceWithSpace.set(Integer.parseInt(parts[0], 16));
udata u = new udata();
//populate the category for the data wrapper
u.cat = parts[2];
u.CC = parts[3];
cp.put(parts[0], u);
}
}
else if(isExcluded(parts, dontExclude)) {
//put the codepoint into the excluded list
numEx++;
excluded.set(Integer.parseInt(parts[0],16));
}
else { //not expluded
//put this codepoint into the cp map
udata u = new udata();
//populate the category for the data wrapper
u.cat = parts[2];
u.CC = parts[3];
cp.put(parts[0], u);
}
return true;
}
/**
 * Decides whether this punctuation code point should be excluded
 * outright instead of being replaced with a space.  Currently only
 * the apostrophe (U+0027) is treated this way.
 *
 * @param codepoint hex representation of the code point
 * @return true if the code point must be excluded
 */
private boolean excludedPClass(String codepoint) {
    //TODO: may need to add to this list of one
    //for all the different languages
    return "0027".equals(codepoint);
}
/**
 * Checks whether the code point described by the split-up
 * UnicodeData.txt fields <tt>p</tt> should be excluded: anything on
 * the explicit keep list, in one of the always-kept categories, in a
 * punctuation category, or an Mn mark with combining class 0 is kept;
 * everything else is excluded.
 *
 * @param p split UnicodeData line (p[0]=code, p[2]=category, p[3]=cc)
 * @param ex bit set of code points that must never be excluded
 * @return true if the code point is to be excluded
 */
private boolean isExcluded(String[] p, java.util.BitSet ex) {
    //explicitly kept code points are never excluded
    if(ex.get(Integer.parseInt(p[0].trim(), 16)))
        return false;
    String cat = p[2];
    //all punctuation classes (Pc, Pd, Ps, Pe, Pi, Pf, Po) are kept
    if(cat.charAt(0) == 'P')
        return false;
    //categories that are always kept
    String[] keptCats = {
        "Lu", "Ll", "Lt", "Lo", "Lm", "Nd",
        "Mc", "Cs", "Co", "Zs", "So"
    };
    for(int i = 0; i < keptCats.length; i++) {
        if(cat.equals(keptCats[i]))
            return false;
    }
    //don't exclude Mn category which has a combining class of 0
    if(cat.equals("Mn") && p[3].equals("0"))
        return false;
    return true;
}
/**
 * Reads CaseFolding.txt and populates <tt>c</tt> with mappings from
 * the original character(s) to their case-folded form.  Only the
 * common ("C") and full ("F") folding statuses are used.
 * <p>
 * Hardened against a data line with no trailing '#' comment (the
 * original called substring(0, -1) and threw
 * StringIndexOutOfBoundsException) and closes the reader in a
 * finally block.
 *
 * @param c map populated with folded-from String -> folded-to String
 * @throws IOException if the data file cannot be read
 */
private void readCaseFolding(Map c)
    throws IOException {
    BufferedReader buf = getBR(DATA_DIR + CASE_FOLDING);
    try {
        String line;
        while((line = buf.readLine()) != null) {
            //skip blank lines and whole-line comments
            if(line.length() == 0 || line.charAt(0) == '#')
                continue;
            //strip the trailing comment if one is present
            int index = line.indexOf('#');
            if(index > -1)
                line = line.substring(0, index);
            line = line.trim();
            String[] splitUp = StringUtils.split(line, ";");
            String status = splitUp[1].trim();
            //C - common case folding, F - full case folding
            if(status.equals("C") || status.equals("F")) {
                c.put(code2char(splitUp[0].trim()),
                      code2char(splitUp[2].trim()));
            }
        }
    }
    finally {
        buf.close();
    }
}
/**
 * Converts the hex representation of a String to the String itself,
 * ie. 0020 -> " " and 0061 0062 -> "ab".
 *
 * @param s hex representation to convert (code points separated by
 *        single spaces)
 * @return the converted String
 */
private String code2char(String s) {
    StringBuffer chars = new StringBuffer();
    if(s.indexOf(" ") == -1) {
        //single code point
        chars.append((char)Integer.parseInt(s, 16));
    }
    else {
        //several space-separated code points
        String[] pieces = StringUtils.split(s, " ");
        for(int i = 0; i < pieces.length; i++)
            chars.append((char)Integer.parseInt(pieces[i], 16));
    }
    return chars.toString();
}
/**
 * Reverse of code2char: converts a String to its space-separated hex
 * representation, ie. "ab" -> 0061 0062, each character written as
 * exactly four hex digits.
 * <p>
 * Fixes a padding bug in the original, which blindly prefixed "00" to
 * any value shorter than four digits: a 1-digit value came out as 3
 * digits and a 3-digit value as 5 (e.g. U+03A9 became "003a9" instead
 * of "03a9").
 *
 * @param s String to convert; may be null
 * @return hex representation of s, or null if s was null
 */
private String char2code(String s) {
    if(s == null) return s;
    StringBuffer b = new StringBuffer();
    for(int i = 0, n = s.length(); i < n; i++) {
        //NOTE(review): Integer.toString yields lowercase hex while the
        //unicode.org data files use uppercase; verify consumers of
        //these strings are case-insensitive or normalize the case.
        String temp = Integer.toString(s.charAt(i), 16);
        //left-pad with zeros to exactly four digits
        for(int pad = 4 - temp.length(); pad > 0; pad--)
            b.append('0');
        b.append(temp);
        b.append(" ");
    }
    return b.toString().trim();
}
/**
 * Runs through all kept codepoints and rewrites each entry's deKomp:
 * punctuation and space categories map to "0020" (ascii space); other
 * entries get the case folding applied -- to the code point itself
 * when it has no decomposition, otherwise to every code point of the
 * decomposition, dropping any that were excluded.
 *
 * @param codepoint map of hex codepoint String -> udata
 * @param casF case folding map (String -> String) from readCaseFolding
 * @param ex bit set of excluded code points
 */
private void replaceCase(Map codepoint, Map casF, java.util.BitSet ex) {
//run thru and check the codepoint or deKomp for uppercase
//this could probably done in the write out process?
Iterator iter = codepoint.keySet().iterator();
String code;
String up;
String[] splitUp;
//final int CJKLow = Integer.parseInt("3400", 16);
//final int CJKHigh = Integer.parseInt("9FA5", 16);
while(iter.hasNext()) {
code = (String)iter.next();
udata u = (udata)codepoint.get(code);
//only the punctuation categories (Pc, Pd, Ps, Pe, Pi, Pf, Po)
//contain a 'P', so indexOf is a category-class test here
if(u.cat.indexOf("P") > -1 || u.cat.equals("Zs")) {
//replace all punctuation with (ascii space)
//and space (cat: Zs) with 0020 (ascii space)
u.deKomp = "0020";
}
else {
if(u.deKomp.equals("")) {
//no decomposition: fold the code point itself, if a folding exists
up = (String)casF.get(code2char(code));
if(up != null)
u.deKomp = char2code(up);
}
else {
//decomposition present: rebuild it piece by piece
StringBuffer dek = new StringBuffer();
splitUp = StringUtils.split(u.deKomp, " ");
//boolean removed = false;
for(int i = 0; i < splitUp.length; i++) {
//drop pieces that ended up on the excluded list
int codeInt = Integer.parseInt(splitUp[i], 16);
if(!ex.get(codeInt)) {
up = char2code((String)casF.get(code2char(splitUp[i])));
if(up != null)
dek.append(up + " ");
else {
//no folding for this piece: punctuation inside a
//decomposition still becomes a space, anything else
//is kept as-is
udata ud = (udata)codepoint.get(splitUp[i]);
String cat =
(ud == null)?"":ud.cat;
if(cat.indexOf("P") > -1)
up = "0020";
else
up = splitUp[i];
//dek.append(splitUp[i] + " ");
dek.append(up + " ");
}
}
}
u.deKomp = dek.toString().trim();
}
}
}
}
/**
 * Reads through the NormalizationTest file provided by unicode.org to
 * get the full (NFKD) decomposition of codepoints, which
 * UnicodeData.txt does not provide.  Column 5 of each data line is
 * the NFKD form and column 4 the NFKC form; a mapping NFKD -> NFKC is
 * also collected into <tt>kc</tt> for building the final data.
 * Reading stops at "Part2" and the "Part0" section is skipped.
 * <p>
 * Hardened against empty lines and lines without a trailing '#'
 * comment (the original indexed charAt(0)/substring unconditionally
 * and could throw), and the reader is closed in a finally block.
 *
 * @param c map of kept codepoints (hex string -> udata) whose deKomp
 *        fields get populated
 * @param kc map populated with NFKD string -> NFKC string
 * @throws IOException if the data file cannot be read
 */
private void readNTestPopKD(Map c, Map kc)
    throws IOException {
    //c - codepoints that weren't excluded...
    BufferedReader buf = getBR(DATA_DIR + NORMALIZATION_TEST);
    try {
        String line;
        boolean skip = false;
        while((line = buf.readLine()) != null) {
            //skip blank lines and whole-line comments
            if(line.length() == 0 || line.charAt(0) == '#')
                continue;
            if(line.charAt(0) == '@') {
                //section marker: stop before Part2, skip Part0
                if(line.indexOf("Part2") > -1)
                    break;
                skip = line.indexOf("Part0") > -1;
            }
            else if(!skip) {
                //strip the trailing comment if one is present
                int comment = line.indexOf('#');
                if(comment > -1)
                    line = line.substring(0, comment);
                line = line.trim();
                String[] parts = StringUtils.split(line, ";");
                udata u = (udata)c.get(parts[0].trim());
                if(u != null)
                    u.deKomp = parts[4].trim();
                //create a KC mapping to be used to
                //build final data...
                kc.put(parts[4].trim(), parts[3].trim());
            }
        }
    }
    finally {
        buf.close();
    }
}
/**
 * Opens the named file for reading.
 * NOTE(review): the reader uses the platform default charset; the
 * unicode.org files are ASCII so this is presumably safe -- confirm.
 *
 * @param filename path of the file to open
 * @return a BufferedReader over the file's contents
 * @throws IOException if the file cannot be opened
 */
private BufferedReader getBR(String filename)
    throws IOException {
    return new BufferedReader(
               new InputStreamReader(
                   new FileInputStream(new File(filename))));
}
//just a datawrapper to be used during the building of the files
private class udata {
/**
 * General category of the code point (ie. "Lu", "Mn", "Zs"),
 * taken from field 2 of UnicodeData.txt.
 */
public String cat;
/**
 * Canonical combining class, taken from field 3 of UnicodeData.txt.
 */
public String CC;
/**
 * Full (NFKD) decomposition as space-separated hex code points,
 * filled in from the NormalizationTest file and later rewritten by
 * replaceCase(); empty string when none applies.
 */
public String deKomp = "";
}
}