/************************************************************************** * Copyright (c) 2001 by Punch Telematix. All rights reserved. * * * * Redistribution and use in source and binary forms, with or without * * modification, are permitted provided that the following conditions * * are met: * * 1. Redistributions of source code must retain the above copyright * * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * * notice, this list of conditions and the following disclaimer in the * * documentation and/or other materials provided with the distribution. * * 3. Neither the name of Punch Telematix nor the names of * * other contributors may be used to endorse or promote products * * derived from this software without specific prior written permission.* * * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED * * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * * IN NO EVENT SHALL PUNCH TELEMATIX OR OTHER CONTRIBUTORS BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * **************************************************************************/ /* ** $Id: RuleBasedCollator.java,v 1.1.1.1 2004/07/12 14:07:47 cvs Exp $ */ package java.text; import wonka.vm.IntArrayList; import wonka.vm.IntHashtable; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedList; public class RuleBasedCollator extends Collator { private static final long serialVersionUID = 2822366911447564107L; static { /*Runtime rt = Runtime.getRuntime(); System.out.println("freeMem "+rt.freeMemory()); long time = System.currentTimeMillis();*/ createHashtables(); /* time = System.currentTimeMillis() -time; System.out.println("time = "+time+", freeMem "+rt.freeMemory()); */ } private IntHashtable orders = new IntHashtable(); private String rules; private boolean accentInverseSorting; //defaults to false ... //the Rules form a linkedList static class Rule{ int[] order; IntHashtable composed; Rule next;//to Support reset } static { createHashtables(); } /** ** TODO: implement inverse accent ordering ('@') ** TODO: ignore chars if sequence start with ',' or ';' */ public RuleBasedCollator(String rules) throws ParseException{ this.rules = rules; rules = rules.trim(); int primary = 0; int secondary = 0; int tertiary = 0; int len = rules.length(); StringBuffer buf = new StringBuffer(); Rule prev = new Rule(); HashMap resetPointtable = new HashMap(); Rule resetPoint = null; for(int i = 0 ; i < len ; i++){ char ch = rules.charAt(i); //System.out.println("parsing char '"+ch+"'"); if(!Character.isWhitespace(ch)){ switch(ch){ case '<': if(resetPoint != null){ Rule rule = resetPoint; do { for(int k = 0 ; k < rule.order.length ; k++){ rule.order[k] += 0x00010000; } rule = rule.next; } while (rule != null); //secondary orders of next should be lowered rule = resetPoint; int order = (prev.order[0] & 0xffff0000)+0x00010000; while(order == (rule.order[0] & 0xffff0000)){ //System.out.println("lowering secondary rule"); rule.order[0] -= 0x00000100; for(int k = 1 ; k < rule.order.length ; k++){ if((rule.order[k] & 0xffff0000)== order){ rule.order[k] -= 0x00000100; } } rule = rule.next; if(rule == null){ break; } } } primary++; secondary = 0; tertiary = 0; break; case ';': if(resetPoint != null){ Rule rule = resetPoint; int order = prev.order[0] & 0xffff0000; while(order == (rule.order[0] & 0xffff0000)){ rule.order[0] += 0x00000100; for(int k = 1 ; k < rule.order.length ; k++){ if((rule.order[k] & 0xffff0000)== order){ rule.order[k] += 0x00000100; } } rule = rule.next; if(rule == null){ break; } } //lowering tertiary rule if needed rule = resetPoint; order = 0x00000100 + (prev.order[0] & 0xffffff00); while(order == (rule.order[0] & 0xffffff00)){ rule.order[0] += 0x00000001; for(int k = 1 ; k < rule.order.length ; k++){ if((rule.order[k] & 0xffffff00)== order){ rule.order[k] += 0x00000001; } } rule = rule.next; if(rule == null){ break; } } } secondary++; tertiary = 0; break; case ',': if(resetPoint != null){ Rule rule = resetPoint; int order = prev.order[0] & 0xffffff00; while(order == (rule.order[0] & 0xffffff00)){ rule.order[0] += 0x00000001; for(int k = 1 ; k < rule.order.length ; k++){ if((rule.order[k] & 0xffffff00)== order){ rule.order[k] += 0x00000001; } } rule = rule.next; if(rule == null){ break; } } } tertiary ++; case '=': break; case '&': //step 1 clean up previous resets prev.next = resetPoint; //that was easy ... //step 2 do the reset // there are 2 reset cases i = findEndOfArgument(rules, i, buf, len); decomposeChars(buf); String pattern = buf.toString(); Object o = resetPointtable.get(pattern); if(o != null){ //1 plain reset to existing 'single rule' Rule rule = (Rule)o; resetPoint = rule.next; int order = rule.order[0]; primary = (order & 0xffff0000)>>>16; secondary = (order & 0x0000ff00)>>8; tertiary = (order & 0x000000ff); //System.out.println("reset to '"+buf+" p = "+primary+" s = "+secondary+" t = "+tertiary); prev = rule; } else { //2 reset to a combined rule // first find a combination of existing chars LinkedList ll = findCombination(resetPointtable,pattern); if(ll == null) { throw new ParseException("invalid reset Point",i); } if(resetPoint != null){ //this is not the best way to locate the last rule!!! while(resetPoint.next != null){ resetPoint = resetPoint.next; } prev = resetPoint; int order = prev.order[0]; primary = (order & 0xffff0000)>>>16; secondary = (order & 0x0000ff00)>>8; tertiary = (order & 0x000000ff); resetPoint = null; } IntArrayList newOrder = new IntArrayList(7); Iterator it = ll.iterator(); while(it.hasNext()){ newOrder.add((int[])it.next()); } int[] array = newOrder.toArray(); while((++i) < len && Character.isWhitespace(rules.charAt(i))); if(i == len){ continue; } ch = rules.charAt(i++); switch(ch){ case '<': array[array.length-1] += 0x00010000; break; case ';': array[array.length-1] += 0x00000100; break; case ',': array[array.length-1] += 0x00000001; break; default: throw new ParseException("invalid reset Point",i); } i = findEndOfArgument(rules, i, buf, len); decomposeChars(buf); //System.out.println("character is '"+buf+"' size = "+buf.length()); Rule next = addToRuleTree(buf); next.order = array; resetPointtable.put(pattern, next); } continue; case '@': accentInverseSorting = true; continue; default: throw new ParseException("parsing of rules failed",i); } //create a new rule ... i = findEndOfArgument(rules, i, buf, len); decomposeChars(buf); //System.out.println("character is '"+buf+"' size = "+buf.length()); Rule next = addToRuleTree(buf); next.order = new int[1]; next.order[0] = (primary<<16) + (secondary<<8) + tertiary; prev.next = next; prev = next; resetPointtable.put(buf.toString(), next); //System.out.println("setting order to "+Integer.toHexString(next.order[0])); } } } private Rule addToRuleTree(StringBuffer buf) throws ParseException { int size = buf.length()-1; IntHashtable hash = orders; Rule next; for(int k = 0 ; k < size ; k++){ char ch = buf.charAt(k); Object o = hash.get(ch); if(o == null){ next = new Rule(); hash.put(ch, next); //System.out.println("putting '"+ch+"' in "+hash+", with rule "+next); hash = new IntHashtable(5); next.composed = hash; } else{ next = (Rule)o; if(next.composed == null){ next.composed = new IntHashtable(5); } hash = next.composed; } } char ch = buf.charAt(size); Object o = hash.get(ch); if(o == null){ next = new Rule(); hash.put(ch, next); //System.out.println("putting '"+ch+"' in "+hash+", with rule "+next); } else{ next = (Rule)o; if(next.order != null){ throw new ParseException("duplicate character",0); } } return next; } private int findEndOfArgument(String rules, int pos, StringBuffer buf, int len){ buf.setLength(0); while((++pos) < len && Character.isWhitespace(rules.charAt(pos))); for(; pos < len ; pos++){ char ch = rules.charAt(pos); if('\'' == ch && pos < len-2){//skip if(rules.charAt(pos+2) == '\''){ ch = rules.charAt(pos+1); pos += 2; } buf.append(ch); } else if((ch >= 0x0009 && ch <= 0x000d) || (ch >= 0x0020 && ch <= 0x002f) ||(ch >= 0x003a && ch <= 0x0040) || (ch >= 0x005b && ch <= 0x0060) ||(ch >= 0x007e && ch <= 0x007e) || ch == '@' || ch == '<' || ch == ';' || ch == ',' || ch == '&'){ return pos-1; } else { buf.append(ch); } } return pos; } public Object clone(){ return super.clone(); } public int compare(String one, String two){ CollationElementIterator cei1 = new CollationElementIterator(new StringCharacterIterator(one),this); CollationElementIterator cei2 = new CollationElementIterator(new StringCharacterIterator(two),this); int order1 = cei1.next(); while(order1 != CollationElementIterator.NULLORDER){ int order2 = cei2.next(); //System.out.println("order1: "+Integer.toHexString(order1)+", order2:"+Integer.toHexString(order2)); if(order2 == CollationElementIterator.NULLORDER){ return 1; } int mask; if(strength == PRIMARY){ mask = 0xffff0000; } else if(strength == SECONDARY){ mask = 0xffffff00; } else { mask = 0xffffffff; } order1 = mask & order1; order2 = mask & order2; if(order2 > order1){ return -1; } if(order1 > order2){ return 1; } order1 = cei1.next(); } return (CollationElementIterator.NULLORDER == cei2.next() ? 0 : -1); } public boolean equals(Object o){ if(!(o instanceof RuleBasedCollator)){ return false; } RuleBasedCollator rbc = (RuleBasedCollator) o; return this.decomposition == rbc.decomposition && this.strength == rbc.strength && this.rules.equals(rbc.rules); } public CollationElementIterator getCollationElementIterator(String src){ return new CollationElementIterator(new StringCharacterIterator(src),this); } public CollationElementIterator getCollationElementIterator(CharacterIterator src){ return new CollationElementIterator(src,this); } public CollationKey getCollationKey(String src){ StringBuffer key = new StringBuffer(src); decomposeChars(key); return new CollationKey(src,key.toString()); } public String getRules(){ return rules; } public int hashCode(){ return decomposition ^ strength ^ rules.hashCode(); } //CollationElementIterator interface ... int[] getOrders(CharacterIterator src){ //sort characters + check next based on Unicode properties StringBuffer buf = getOrderedCharacter(src); if(buf == null){ return null; } IntArrayList temp = new IntArrayList(5); /** ** for every character in the buffer we need to get it's order. (and add it to the IntArrayList) ** This could mean combining some characters (and possibly get some more chars ** to make special ordering combination dictated by the Collator rule) */ int len = buf.length(); for(int pos = 0 ; pos < len ;){ char ch = buf.charAt(pos++); //System.out.println("checking char '"+ch+"'"); Rule r = (Rule) orders.get(ch); if(r != null){ //combine chars if possible //System.out.println("char '"+ch+"' has a rule "+r); if(r.composed != null){ //System.out.println("char '"+ch+"' can be combined"); Rule resetRule = r; int resetPos = pos; StringBuffer resetBuf = buf; int resetIndex = src.getIndex(); while(r.composed != null){ if(pos >= len){ buf = getOrderedCharacter(src); if(buf == null){ break; } pos = 0; len = buf.length(); } r = (Rule)r.composed.get(buf.charAt(pos++)); if(r == null){ break; } if(r.order != null){ resetRule = r; resetPos = pos; resetBuf = buf; resetIndex = src.getIndex(); } } buf = resetBuf; pos = resetPos; r = resetRule; src.setIndex(resetIndex); len = buf.length(); } if (r.order != null){ //System.out.println("char '"+ch+"' adding ordering values "+Integer.toHexString(r.order[0])+" from "+r); temp.add(r.order); continue; } } //System.out.println("char '"+ch+"' adding default values"); temp.add(0x7fffffff); temp.add(ch<<16); } return temp.toArray(); } /** ** should decompose the char ch and check if the next character can be grouped with ** the already decomposed character(s). the correct ordering should be applied. ** This is all based on Unicode character properties ... */ private StringBuffer getOrderedCharacter(CharacterIterator src){ char ch = src.current(); if(ch == CharacterIterator.DONE){ return null; } StringBuffer buf = new StringBuffer(); buf.append(ch); decomposeChars(buf); ch = src.next(); while (ch != CharacterIterator.DONE && getCombiningClass(ch) != 0){ buf.append(ch); ch = src.next(); } //sort the orders ... int len = buf.length(); for(int i = 1 ; i < len ; i++){ ch = buf.charAt(i); int order = getCombiningClass(ch); //System.out.println("ORDERING char '"+Integer.toHexString(ch)+"' order = "+order+" at pos "+i); for(int j = i-1 ; j >= 0 ; j--){ char chr = buf.charAt(j); if(order >= getCombiningClass(chr)){ break; } buf.setCharAt(j, ch); buf.setCharAt(j+1, chr); } } return buf; } private LinkedList findCombination(HashMap table, String pattern){ //System.out.println("looking for '"+pattern+"' in "+table); int len = pattern.length(); if(len == 1){ //System.out.println("length == 1"); Rule rule = (Rule)table.get(pattern); //System.out.println("length == 1 '"+pattern+"' got "+rule); if(rule == null){ return null; } LinkedList ll = new LinkedList(); ll.addFirst(rule.order); return ll; } for(int i = 1 ; i <= len ; i++){ String sub = pattern.substring(0,i); Rule rule = (Rule)table.get(sub); //System.out.println("looking for '"+sub+"' got "+rule); if(rule != null){ LinkedList ll = findCombination(table, pattern.substring(i)); if(ll != null){ ll.addFirst(rule.order); return ll; } } } return null; } /** ** should take decomposition into account... */ private void decomposeChars(StringBuffer buf){ if(decomposition != NO_DECOMPOSITION){ for (int i = buf.length() - 1 ; i >= 0 ; i--){ String chars = decomposeChar(buf.charAt(i)); if(chars != null){ buf.deleteCharAt(i); buf.insert(i,chars); } } } } /** ** should take decomposition into account... */ private String decomposeChar(char ch){ if(strength != IDENTICAL){ /* TODO ... Object o = controls.get(ch); if(o != null){ return EMPTYCHAR; } */ //System.out.println("decomposeChar '"+ch+"'"); if(strength == FULL_DECOMPOSITION || getCompatibility(ch) == null){ return getDecomposition(ch); } } return null; } /* ** Set up the internal tables used by getCombiningClass, getDecomposition, ** and getCompatibility. */ private static native void createHashtables(); /* ** Get the canonical combining class for this character */ private static native int getCombiningClass(char c); /* ** Get the decomposition of this character. */ private static native String getDecomposition(char c); /* ** Get the compatibilty attribute of the decomposition. If the decomposition ** is canonical then this method returns null; else it returns a string such ** as "isolated" or "noBreak". */ private static native String getCompatibility(char c); }