package client.net.sf.saxon.ce.functions.codenorm; import client.net.sf.saxon.ce.Configuration; import client.net.sf.saxon.ce.om.Axis; import client.net.sf.saxon.ce.om.DocumentInfo; import client.net.sf.saxon.ce.om.NodeInfo; import client.net.sf.saxon.ce.pattern.NodeKindTest; import client.net.sf.saxon.ce.trans.XPathException; import client.net.sf.saxon.ce.tree.iter.AxisIterator; import client.net.sf.saxon.ce.tree.util.StringTokenizer; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; /** * This class reads the data compiled into class UnicodeData, and builds hash tables * that can be used by the Unicode normalization routines. This operation is performed * once only, the first time normalization is attempted after Saxon is loaded. */ class UnicodeDataParserFromXML { // This class is never instantiated private UnicodeDataParserFromXML(){} /** * Called exactly once by NormalizerData to build the static data */ static NormalizerData build(Configuration config) throws XPathException { DocumentInfo doc = config.buildDocument("normalizationData.xml"); BitSet isExcluded = new BitSet(128000); BitSet isCompatibility = new BitSet(128000); NodeInfo canonicalClassKeys = null; NodeInfo canonicalClassValues = null; NodeInfo decompositionKeys = null; NodeInfo decompositionValues = null; AxisIterator iter = doc.iterateAxis(Axis.DESCENDANT, NodeKindTest.ELEMENT); while (true) { NodeInfo item = (NodeInfo)iter.next(); if (item == null) { break; } if (item.getLocalPart().equals("CanonicalClassKeys")) { canonicalClassKeys = item; } else if (item.getLocalPart().equals("CanonicalClassValues")) { canonicalClassValues = item; } else if (item.getLocalPart().equals("DecompositionKeys")) { decompositionKeys = item; } else if (item.getLocalPart().equals("DecompositionValues")) { decompositionValues = item; } else if (item.getLocalPart().equals("ExclusionList")) { readExclusionList(item.getStringValue(), isExcluded); } else if (item.getLocalPart().equals("CompatibilityList")) { readCompatibilityList(item.getStringValue(), isCompatibility); } } Map<Integer, Integer> canonicalClass = new HashMap<Integer, Integer>(400); readCanonicalClassTable(canonicalClassKeys.getStringValue(), canonicalClassValues.getStringValue(), canonicalClass); Map<Integer, String> decompose = new HashMap<Integer, String>(18000); Map<Integer, Integer> compose = new HashMap<Integer, Integer>(15000); readDecompositionTable(decompositionKeys.getStringValue(), decompositionValues.getStringValue(), decompose, compose, isExcluded, isCompatibility); return new NormalizerData(canonicalClass, decompose, compose, isCompatibility, isExcluded); } /** * Reads exclusion list and stores the data */ private static void readExclusionList(String s, BitSet isExcluded) { StringTokenizer st = new StringTokenizer(s); while (st.hasMoreTokens()) { String tok = st.nextToken(); int value = Integer.parseInt(tok, 32); isExcluded.set(value); } } /** * Reads compatibility list and stores the data */ private static void readCompatibilityList(String s, BitSet isCompatible) { StringTokenizer st = new StringTokenizer(s); while (st.hasMoreTokens()) { String tok = st.nextToken(); int value = Integer.parseInt(tok, 32); isCompatible.set(value); } } /** * Read canonical class table (mapping from character codes to their canonical class) */ private static void readCanonicalClassTable(String keyString, String valueString, Map<Integer, Integer> canonicalClasses) { ArrayList keys = new ArrayList(5000); StringTokenizer st = new StringTokenizer(keyString); while (st.hasMoreTokens()) { String tok = st.nextToken(); int value = Integer.parseInt(tok, 32); keys.add(Integer.valueOf(value)); } int k = 0; st = new StringTokenizer(valueString); while (st.hasMoreTokens()) { String tok = st.nextToken(); int clss; int repeat = 1; int star = tok.indexOf('*'); if (star < 0) { clss = Integer.parseInt(tok, 32); } else { repeat = Integer.parseInt(tok.substring(0, star)); clss = Integer.parseInt(tok.substring(star+1), 32); } for (int i=0; i<repeat; i++) { canonicalClasses.put(((Integer)keys.get(k++)).intValue(), clss); } } } /** * Read canonical class table (mapping from character codes to their canonical class) */ private static void readDecompositionTable( String decompositionKeyString, String decompositionValuesString, Map<Integer, String> decompose, Map<Integer, Integer> compose, BitSet isExcluded, BitSet isCompatibility) { int k = 0; List<String> values = new ArrayList<String>(1000); StringTokenizer st = new StringTokenizer(decompositionValuesString); while (st.hasMoreTokens()) { String tok = st.nextToken(); String value = ""; for (int c=0; c<tok.length();) { char h0 = tok.charAt(c++); char h1 = tok.charAt(c++); char h2 = tok.charAt(c++); char h3 = tok.charAt(c++); int code = ("0123456789abcdef".indexOf(h0)<<12) + ("0123456789abcdef".indexOf(h1)<<8) + ("0123456789abcdef".indexOf(h2)<<4) + ("0123456789abcdef".indexOf(h3)); // was <<12 value += (char)code; } values.add(value); } st = new StringTokenizer(decompositionKeyString); while (st.hasMoreTokens()) { String tok = st.nextToken(); int key = Integer.parseInt(tok, 32); String value = values.get(k++); decompose.put(key, value); // only compositions are canonical pairs // skip if script exclusion if (!isCompatibility.get(key) && !isExcluded.get(key)) { char first = '\u0000'; char second = value.charAt(0); if (value.length() > 1) { first = second; second = value.charAt(1); } // store composition pair in single integer int pair = (first << 16) | second; compose.put(pair, key); } } // Add algorithmic Hangul decompositions // This fragment code is copied from the normalization code published by Unicode consortium. // See module net.sf.saxon.serialize.codenorm.Normalizer for applicable copyright information. for (int SIndex = 0; SIndex < SCount; ++SIndex) { int TIndex = SIndex % TCount; char first, second; if (TIndex != 0) { // triple first = (char)(SBase + SIndex - TIndex); second = (char)(TBase + TIndex); } else { first = (char)(LBase + SIndex / NCount); second = (char)(VBase + (SIndex % NCount) / TCount); } int pair = (first << 16) | second; int key = SIndex + SBase; decompose.put(key, String.valueOf(first) + second); compose.put(pair, key); } } /** * Hangul composition constants */ private static final int SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7, LCount = 19, VCount = 21, TCount = 28, NCount = VCount * TCount, // 588 SCount = LCount * NCount; // 11172 } // This class has its origins in the normalization software published // by the Unicode Consortium. // Modified by Michael Kay (Saxonca) to change the way in which the data files are stored. // * Copyright (c) 1991-2005 Unicode, Inc. // * For terms of use, see http://www.unicode.org/terms_of_use.html // * For documentation, see UAX#15.<br> // * The Unicode Consortium makes no expressed or implied warranty of any // * kind, and assumes no liability for errors or omissions. // * No liability is assumed for incidental and consequential damages // * in connection with or arising out of the use of the information here.