package edu.hawaii.jmotif.sequitur; /* This class was modified from a Java port of Craig Nevill-Manning's Sequitur algorithm. Copyright (C) 1997 Eibe Frank */ import java.util.Hashtable; /** * Template for Sequitur data structures. Adaption of Eibe Frank code for JMotif API, see * {@link sequitur.info} for original version. * * @author Manfred Lerner, seninp * */ public abstract class SAXSymbol { /** * Apparently, this limits the possible number of terminals, ids of non-terminals start after this * num. */ protected static final int numTerminals = 100000; /** Seed the size of hash table? */ private static final int prime = 2265539; /** Hashtable to keep track of all digrams. This is static - single instance for all. */ protected static Hashtable<SAXSymbol, SAXSymbol> theDigrams = new Hashtable<SAXSymbol, SAXSymbol>( SAXSymbol.prime); /** The symbol value. */ protected String value; /** The symbol original position. */ protected int originalPosition; /** Sort of pointers for previous and the next symbols. */ protected SAXSymbol n, p; /** * Links two symbols together, removing any old digram from the hash table. * * @param left the left symbol. * @param right the right symbol. */ public static synchronized void join(SAXSymbol left, SAXSymbol right) { // check for an OLD digram existence - i.e. left must have a next symbol // if .n exists then we are joining TERMINAL symbols within the string, and must clean-up the // old digram if (left.n != null) { left.deleteDigram(); } // re-link left and right left.n = right; right.p = left; } /** * Cleans up template. */ public abstract void cleanUp(); /** * Inserts a symbol after this one. * * @param toInsert the new symbol to be inserted. */ public synchronized void insertAfter(SAXSymbol toInsert) { // call join on this symbol' NEXT - placing it AFTER the new one join(toInsert, n); // call join on THIS symbol placing the NEW AFTER join(this, toInsert); } /** * Removes the digram from the hash table. Overwritten in sub class guard. */ public synchronized void deleteDigram() { // if N is a Guard - then it is a RULE sits here, don't care about digram if (n.isGuard()) { return; } // delete digram if its exactly this one if (this == theDigrams.get(this)) { theDigrams.remove(this); } } /** * Returns true if this is the guard symbol. Overwritten in subclass guard. */ public boolean isGuard() { return false; } /** * Returns true if this is a non-terminal. Overwritten in subclass nonTerminal. */ public boolean isNonTerminal() { return false; } /** * "Checks in" a new digram and enforce the digram uniqueness constraint. If it appears elsewhere, * deals with it by calling match(), otherwise inserts it into the hash table. Overwritten in * subclass guard. * * @return true if it is not unique. */ public synchronized boolean check() { // System.out.println("[sequitur debug] *calling check() on* " + this.value + ", n isGuard: " // + n.isGuard()); // ... Each time a link is made between two symbols if the new digram is repeated elsewhere // and the repetitions do not overlap, if the other occurrence is a complete rule, // replace the new digram with the non-terminal symbol that heads the rule, // otherwise,form a new rule and replace both digrams with the new non-terminal symbol // otherwise, insert the digram into the index... if (n.isGuard()) { // i am the rule return false; } if (!theDigrams.containsKey(this)) { // System.out.println("[sequitur debug] *check...* digrams contain this (" + this.value + "~" // + this.n.value + ")? NO. Checking in."); // found = theDigrams.put(this, this); theDigrams.put(this, this); // System.out.println("[sequitur debug] *digrams* " + hash2String()); return false; } // System.out.println("[sequitur debug] *check...* digrams contain this (" + this.value // + this.n.value + ")? Yes. Oh-Oh..."); // well the same hash is in the store, lemme see... SAXSymbol found = theDigrams.get(this); // if it's not me, then lets call match magic? if (found.n != this) { // System.out.println("[sequitur debug] *double check...* IT IS NOT ME!"); match(this, found); } return true; } /** * Replace a digram with a non-terminal. */ public synchronized void substitute(SAXRule r) { // System.out.println("[sequitur debug] *substitute* " + this.value + " with rule " // + r.asDebugLine()); // clean up this place and the next r.addIndex(this.originalPosition); this.cleanUp(); this.n.cleanUp(); // link the rule instead of digram SAXNonTerminal nt = new SAXNonTerminal(r); nt.originalPosition = this.originalPosition; this.p.insertAfter(nt); // do p check // // TODO: not getting this if (!p.check()) { p.n.check(); } } /** * Deals with a matching digram. * * @param */ public synchronized void match(SAXSymbol newDigram, SAXSymbol matchingDigram) { SAXRule rule; SAXSymbol first, second; // System.out.println("[sequitur debug] *match* newDigram [" + newDigram.value + "," // + newDigram.n.value + "], old matching one [" + matchingDigram.value + "," // + matchingDigram.n.value + "]"); // if previous of matching digram is a guard if (matchingDigram.p.isGuard() && matchingDigram.n.n.isGuard()) { // reuse an existing rule rule = ((SAXGuard) matchingDigram.p).r; newDigram.substitute(rule); } else { // string built of the normal terminal symbols here? // create a new rule rule = new SAXRule(); try { // tie the digram's links together within the new rule // this uses copies of objects, so they do not get cut out of S first = (SAXSymbol) newDigram.clone(); second = (SAXSymbol) newDigram.n.clone(); rule.theGuard.n = first; first.p = rule.theGuard; first.n = second; second.p = first; second.n = rule.theGuard; rule.theGuard.p = second; // System.out.println("[sequitur debug] *newRule...* \n" + rule.getRules()); // put this digram into the hash // this effectively erases the OLD MATCHING digram with the new DIGRAM (symbol is wrapped // into Guard) theDigrams.put(first, first); // substitute the matching (old) digram with this rule in S // System.out.println("[sequitur debug] *newRule...* substitute OLD digram first."); matchingDigram.substitute(rule); // substitute the new digram with this rule in S // System.out.println("[sequitur debug] *newRule...* substitute NEW digram last."); newDigram.substitute(rule); } catch (CloneNotSupportedException c) { c.printStackTrace(); } } // Check for an underused rule. if (rule.first().isNonTerminal() && (((SAXNonTerminal) rule.first()).r.count == 1)) ((SAXNonTerminal) rule.first()).expand(); } /** * Custom hashcode implementation. Produces the hashcode for a digram using this and the next * symbol. * * @return the digram's hash code. */ public int hashCode() { int hash1 = 31; int hash2 = 13; int num0 = 0; for (int i = 0; i < value.length(); i++) { num0 = num0 + Character.getNumericValue(value.charAt(i)); } int num1 = 0; for (int i = 0; i < n.value.length(); i++) { num1 = num1 + Character.getNumericValue(n.value.charAt(i)); } hash2 = num0 * hash1 + hash2 * num1; return hash2; } /** * Test if two digrams are equal. WARNING: don't use to compare two symbols. */ public boolean equals(Object obj) { if (this == obj) return true; if (obj == null) return false; if (!(obj instanceof SAXSymbol)) return false; // return ((value == ((SAXSymbol)obj).value) && // (n.value == ((SAXSymbol)obj).n.value)); return ((value.equals(((SAXSymbol) obj).value)) && (n.value.equals(((SAXSymbol) obj).n.value))); } @Override public String toString() { return "SAXSymbol [value=" + value + ", p=" + p + ", n=" + n + "]"; } }