package net.contrapunctus.rngzip.util; import com.sun.msv.grammar.Grammar; import com.sun.msv.util.StringPair; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.OutputStream; import java.io.PrintStream; import java.net.MalformedURLException; import java.net.URL; import java.util.Arrays; import java.util.Comparator; import java.util.HashMap; import java.util.zip.Adler32; import java.util.zip.CheckedOutputStream; import java.util.zip.Checksum; import org.kohsuke.bali.Driver; import org.kohsuke.bali.automaton.*; import org.kohsuke.bali.automaton.builder.TreeAutomatonBuilder; import org.kohsuke.bali.optimizer.*; import org.kohsuke.bali.writer.AutomatonWriter; /** * This class encapsulates the <code>TreeAutomaton</code> from the * Bali library by Kohsuke Kawaguchi. It hides some of the details of * <code>State</code> and <code>Transition</code> classes, making them * easier to use for our application. * * <p>One of the most important aspects of this class is that it * imposes a particular ordering on the transitions based on their * alphabets. This is critical because the compressor and * decompressor must agree on the ordering of transitions from each * state in the automaton. The Bali implementation uses maps and * sets, which do not maintain reliable transition orderings between * different runs. States are numbered beginning with zero, and * transitions leaving each state are numbered similarly. They * include, however, the ‘epsilon’ transitions. * * <p>For efficiency, Bali maps qualified names to integers. This * class also contains methods <code>encodeName</code> and * <code>decodeName</code> to provide convenient access to that * mapping. * * <p class='license'>This is free software; you may modify and/or * redistribute it under the terms of the GNU General Public License, * but it comes with <b>absolutely no warranty.</b> * * @author Christopher League * @see TreeAutomaton * @see State * @see Transition */ public final class BaliAutomaton { private final URL url; private final TreeAutomaton au; private Transition[][] trans; private State[] states; private HashMap<Integer,String> names = new HashMap<Integer,String>(); private TransitionSorter ts = new TransitionSorter(); public static BaliAutomaton fromRNG (URL url) throws SchemaFormatException { Grammar gr = Driver.loadRELAXNGGrammar(url); if( gr == null ) throw new SchemaFormatException(url.toString()); gr = Unifier.unify(gr); gr = ZeroOrMoreAttributeExpander.optimize(gr); gr = InterleaveStrengthReducer.optimize(gr); gr = AttributeReorder.optimize(gr); TreeAutomaton ta = TreeAutomatonBuilder.build(gr, false, true, true); return new BaliAutomaton(url, ta); } /** * Build an automaton by reading the named Relax NG schema file, * and encapsulate it. * @param filename path of a RelaxNG schema file (<code>.rng</code> * XML format) * @throws FileNotFoundException if the specified filename does not * exist * @throws SchemaFormatException if there is a problem reading a * Relax NG schema from the file */ public static BaliAutomaton fromRNG (File file) throws FileNotFoundException, SchemaFormatException { if(!file.exists()) throw new FileNotFoundException(file.toString()); URL url = null; try { url = file.toURI().toURL(); } catch(MalformedURLException x) { assert false : x; } return fromRNG(url); } public static BaliAutomaton fromRNG (String spec) throws FileNotFoundException, SchemaFormatException { try { return fromRNG(new URL(spec)); } catch(MalformedURLException x) { return fromRNG(new File(spec)); } } /** * Encapsulate the given tree automaton. This assigns a unique * identifier to each state and transition. The content of the * transitions is used to sort them, so that the order remains * consistent across multiple runs. */ public BaliAutomaton(URL url, TreeAutomaton au) { this.url = url; this.au = au; states = au.getStates(); /* Build the names map: it is mostly used for debugging. */ StringBuilder s = new StringBuilder(); for(StringPair p : au.listNameCodes()) { if(p.namespaceURI.length() > 0) { s.append(p.namespaceURI); s.append(':'); } s.append(p.localName); names.put(au.getNameCode(p), s.toString()); s.setLength(0); } /* Transitions are stored in a 2-D array. */ trans = new Transition[states.length][]; for(int i = 0; i < states.length; i++) { assert states[i].id == i : states[i]; trans[i] = states[i].getTransitions(); /* Now we sort them: the order itself doesn’t really matter, but MUST be consistent between different runs. */ Arrays.sort(trans[i], ts); } } /** * Return the ID of the start state of the automaton. This usually * returns the integer zero, but clients should not depend on that. */ public int initialState() { return au.getInitialState().id; } /** * Return the number of states in this automaton. */ public int countStates() { return au.countStates(); } /** * Return the number of transitions from state ‘i’. */ public int countTransitions(int i) { return trans[i].length; } /** * Return the number of <em>choices</em> from state ‘i’. This is * almost the same as countTransitions, but if ‘i’ is a final * state, one more is added. In a final state with three * transitions, you really have four choices: leave by each * transition, or stay and terminate. For non-final states, this * is equivalent to countTransitions. */ public int countChoices(int i) { return trans[i].length + (states[i].isFinal? 1 : 0); } /** * Determine whether state ‘i’ is null. A null state is a dead * end: it’s not a final state, but there are no transitions * exiting it either. */ public boolean isNull(int i) { return (!states[i].isFinal && (trans[i].length == 0)); } /** * Determine whether ‘i’ is an <i>epsilon</i> state. This means * that it’s the end of the road (no transitions), but it <i>is</i> * a final state. */ public boolean isEpsilon(int i) { return states[i].isFinal && (trans[i].length == 0); } /** * Determine whether ‘i’ is a final state. It may or may not have * any transitions. */ public boolean isFinal(int i) { return states[i].isFinal; } /** * This permits clients to explore the alphabet of transition ‘tj’ * from state ‘si’. Each transition is described by an * <i>alphabet</i> that determines what must be true about the tree * in order to take that transition. There are different kinds of * alphabets, related to the element name, presence or absence of * particular attributes, character data, constant values, data * types, etc. The way to determine what alphabet is present is to * use the <code>AlphabetVisitor</code> interface from the Bali * library. */ public Object visitAlphabet(int si, int tj, AlphabetVisitor av) { return trans[si][tj].alphabet.accept(av); } /** * Returns the child state of the transition ‘tj’ from state ‘si’, * or –1 if that transition has no child. The child state is also * referred to as the ‘left’ state, in the interpretation of * arbitrary trees as binary trees. */ public int childOf(int si, int tj) { Transition tr = trans[si][tj]; if(tr.left == null || isNull(tr.left.id)) return -1; else return tr.left.id; } /** * Returns the sibling state, or destination, of the transition * ‘tj’ from state ‘si’. Every transition has a sibling state. It * is also referred to as the ‘right’ child, in the interpretation * of arbitrary trees as binary trees. */ public int siblingOf(int si, int tj) { return trans[si][tj].right.id; } /** * Returns the integer representing the qualified name * ‘ns’:‘lname’. For efficiency, Bali maps qualified names to * integers. This method provides access to the mapping. */ public int encodeName(String ns, String lname) { return au.getNameCode(ns, lname); } /** * Returns a string representation of the encoded name represented * by the integer ‘i’. For efficiency, Bali maps qualified names * to integers. This method provides the reverse mapping, which is * helpful for debugging. */ public String decodeName(int i) { return names.get(i); } /** * Dumps a text-based representation of the automaton onto the * given output stream. The representation may or may not be * sufficient to reconstruct the actualy automaton, but at least it * should be enough to <em>distinguish</em> it from other automata. * This representation is the basis of the checksum. */ public void print(PrintStream out) { int n = countStates(); out.println(n); out.println(initialState()); for(int i = 0; i < n; i++) { int m = countTransitions(i); out.println(m); int bits = 0; if(isEpsilon(i)) bits |= 1; if(isFinal(i)) bits |= 2; if(isNull(i)) bits |= 4; out.println(bits); for(int j = 0; j < m; j++) { out.println(siblingOf(i, j)); out.println(visitAlphabet(i, j, ts)); } } } /** * Compute a checksum of this automaton, using the provided * <code>Checksum</code> object. This works by creating a * <code>CheckedOutputStream</code> and calling <code>print</code> * to determine the checksum of the printed representation. The * checksum ought to be sufficient to determine—with reasonable * probability—that two schemas are the same. * @see CheckedOutputStream */ public long checksum(Checksum sum) { print(new PrintStream (new CheckedOutputStream (new NoopOutputStream(), sum))); return sum.getValue(); } /** * Compute the Adler-32 checksum of this automaton. * @see #checksum(Checksum) * @see Adler32 */ public long checksum() { return checksum(new Adler32()); } public URL getURL() { return url; } /** * This shortcut passes the tree automaton to the provided writer * from the Bali library. It can be used to validate an XML stream * against the schema, as in the <code>GenericTest</code> program. */ public void writeTo(AutomatonWriter w) throws IOException { w.write(au); } private static final boolean DEBUG = System.getProperty("DEBUG_Automaton") != null; /** * This program outputs the Adler-32 checksums of all the Relax NG * schema files named on the command line. These checksums can * then be embedded in a test suite, to ensure that they do not * change over time. To see the contents of the text stream before * the checksum is computed, set <code>-DDEBUG_Automaton</code> on * the <code>java</code> command line. * @see #checksum() */ public static void main(String[] args) throws FileNotFoundException, SchemaFormatException { Adler32 sum = new Adler32(); OutputStream out = DEBUG? System.out : new NoopOutputStream(); PrintStream pout = new PrintStream(new CheckedOutputStream(out, sum)); for(String a : args) { sum.reset(); BaliAutomaton.fromRNG(new File(a)).print(pout); System.out.printf("%08x %s%n", sum.getValue(), a); } } private class TransitionSorter implements AlphabetVisitor, Comparator<Transition> { public int compare(Transition t1, Transition t2) { /* first use ID of left branch */ if(t1.left == null && t2.left != null) return -1; if(t1.left != null && t2.left == null) return 1; if(t1.left != null) { assert t2.left != null; if(t1.left.id < t2.left.id) return -1; if(t1.left.id > t2.left.id) return 1; } /* left IDs are equal; break ties with right branch */ if(t1.right.id < t2.right.id) return -1; if(t1.right.id > t2.right.id) return 1; /* right IDs are the same; now break ties with the alphabet */ String s1 = t1.alphabet.accept(this).toString(); String s2 = t2.alphabet.accept(this).toString(); return s1.compareTo(s2); } public Object attribute( AttributeAlphabet a ) { return a; } public Object nonExistentAttribute( NonExistentAttributeAlphabet a ) { return a; } public Object element( ElementAlphabet a ) { return a; } /* the interleave and list alphabets are not supported yet. */ public Object interleave( InterleaveAlphabet a ) { assert false; return null; } public Object list( ListAlphabet a ) { assert false; return null; } public Object data( DataAlphabet a ) { return "{{DATA}}"; } public Object value( ValueAlphabet a ) { return a; } } }