package edu.hawaii.jmotif.sequitur; /* This class is part of a Java port of Craig Nevill-Manning's Sequitur algorithm. Copyright (C) 1997 Eibe Frank This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ import java.util.ArrayList; import java.util.Arrays; import java.util.Hashtable; import java.util.Set; import java.util.TreeSet; import java.util.Vector; import java.util.concurrent.atomic.AtomicInteger; /** * The Rule. Adaption of Eibe Frank code for JMotif API, see {@link sequitur.info} for original * version. * * @author Manfred Lerner, seninp * */ public class SAXRule { // // The rule utility constraint demands that a rule be deleted if it is referred to only once. Each // rule has an associated reference count, which is incremented when a non-terminal symbol that // references the rule is created, and decremented when the non-terminal symbol is deleted. When // the reference count falls to one, the rule is deleted. /** This is static - the global rule enumerator counter. */ protected static AtomicInteger numRules; /** Guard symbol to mark beginning and end of rule. */ protected SAXGuard theGuard; /** Counter keeps track of how many times the rule is used in the grammar. */ protected int count; /** The rule's number. Used for identification of non-terminals. */ protected int ruleIndex; /** Index used for printing. */ protected int index; /** * This keeps rule indexes - once rule created or used, its placement position is extracted from * the TerminalSymbol position and stored here. */ protected static Set<Integer> indexes = new TreeSet<Integer>(); /** * Yet another, global static structure allowing fast rule access, ADD-ON by seninp to the * original code. */ protected static Hashtable<Integer, SAXRule> theRules = new Hashtable<Integer, SAXRule>(); /** Keeper for rules references, ADD-ON by Manfred to the original code. */ // protected static ArrayList<String> arrayRuleStrings = new ArrayList<String>(); protected static ArrayList<SAXRuleRecord> arrSAXRuleRecords = new ArrayList<SAXRuleRecord>(); /** * Constructor. */ public SAXRule() { // assign a next number to this rule and increment the global counter this.ruleIndex = numRules.intValue(); numRules.incrementAndGet(); // create a Guard handler for the rule synchronized (this) { this.theGuard = new SAXGuard(this); } if (null == this.theGuard) { System.out.println("gotcha 1"); } // init other vars this.count = 0; this.index = 0; // save the instance theRules.put(this.ruleIndex, this); } /** * Report the FIRST symbol of the rule. * * @return the FIRST rule's symbol. */ public SAXSymbol first() { if (null == this.theGuard) { System.out.println(theRules.values()); System.out.println("gotcha 2"); } return this.theGuard.n; } /** * Report the LAST symbol of the rule. * * @return the LAST rule's symbol. */ public SAXSymbol last() { return this.theGuard.p; } /** * Original getRules() method. Prints out rules. Funny, that in the original code it is only * possible to call it on the head - i.e. Grammar. * * @return the formatted rules string. */ public synchronized String getRules() { Vector<SAXRule> rules = new Vector<SAXRule>(numRules.intValue()); SAXRule currentRule; SAXRule referedTo; SAXSymbol sym; int index; int processedRules = 0; StringBuffer text = new StringBuffer(); text.append("Usage\tRule\n"); rules.addElement(this); // add-on - keeping the rule string, will be used in order to expand rules StringBuilder currentRuleString = new StringBuilder(); while (processedRules < rules.size()) { currentRule = rules.elementAt(processedRules); text.append(" "); text.append(currentRule.count); text.append("\tR"); text.append(processedRules); text.append(" -> "); for (sym = currentRule.first(); (!sym.isGuard()); sym = sym.n) { if (sym.isNonTerminal()) { referedTo = ((SAXNonTerminal) sym).r; if ((rules.size() > referedTo.index) && (rules.elementAt(referedTo.index) == referedTo)) { index = referedTo.index; } else { index = rules.size(); referedTo.index = index; rules.addElement(referedTo); } text.append('R'); text.append(index); currentRuleString.append('R'); currentRuleString.append(index); } else { if (sym.value.equals(" ")) { text.append('_'); currentRuleString.append('_'); } else { if (sym.value.equals("\n")) { text.append("\\n"); currentRuleString.append("\\n"); } else { text.append(sym.value); currentRuleString.append(sym.value); } } } text.append(' '); currentRuleString.append(' '); } // seninp: adding to original output rule occurrence indexes // text.append("\tidx:"); text.append(Arrays.toString(currentRule.getIndexes())); // // and rules map fill-in // arrayRuleStrings.add(currentRuleString.toString()); currentRuleString = new StringBuilder(); text.append('\n'); processedRules++; } return text.toString(); } /** * Add-on to the original code by manfred and seninp. This one similar to the original getRules() * but populates and returns the array list of SAXRuleRecords. * * @return list of SAXRuleRecords. */ public synchronized ArrayList<SAXRuleRecord> getSAXRules() { Vector<SAXRule> rules = new Vector<SAXRule>(numRules.intValue()); SAXRule currentRule; SAXRule referedTo; SAXSymbol sym; int index; int processedRules = 0; StringBuffer text = new StringBuffer(); text.append("Usage\tRule\n"); rules.addElement(this); StringBuilder sbCurrentRule = new StringBuilder(); while (processedRules < rules.size()) { currentRule = rules.elementAt(processedRules); text.append(" "); text.append(currentRule.count); text.append("\tR"); text.append(processedRules); text.append(" -> "); for (sym = currentRule.first(); (!sym.isGuard()); sym = sym.n) { if (sym.isNonTerminal()) { referedTo = ((SAXNonTerminal) sym).r; if ((rules.size() > referedTo.index) && (rules.elementAt(referedTo.index) == referedTo)) { index = referedTo.index; } else { index = rules.size(); referedTo.index = index; rules.addElement(referedTo); } text.append('R'); text.append(index); sbCurrentRule.append('R'); sbCurrentRule.append(index); } else { if (sym.value.equals(" ")) { text.append('_'); sbCurrentRule.append('_'); } else { if (sym.value.equals("\n")) { text.append("\\n"); sbCurrentRule.append("\\n"); } else { text.append(sym.value); sbCurrentRule.append(sym.value); } } } text.append(' '); sbCurrentRule.append(' '); } // TODO: seninp: adding occurrence indexes // text.append("\tidx:"); text.append(Arrays.toString(currentRule.getIndexes())); text.append('\n'); // System.out.println(text.toString()); SAXRuleRecord saxContainer = new SAXRuleRecord(); saxContainer.setRuleIndex(processedRules); saxContainer.setRuleFrequency(currentRule.count); saxContainer.setRuleName("R" + processedRules); saxContainer.setRuleString(sbCurrentRule.toString()); saxContainer.setIndexes(currentRule.getIndexes()); arrSAXRuleRecords.add(saxContainer); sbCurrentRule = new StringBuilder(); processedRules++; } expandRules(); // computeRuleOffsets(); return arrSAXRuleRecords; } /** * Manfred's cool trick to get out all expanded rules. Expands the rule of each SAX container into * SAX words string. * * @return */ public synchronized void expandRules() { // vars int currentPositionIndex = 0; int workIndex = 0; String resultString = null; SAXRuleRecord saxContainer = null; // iterate over all SAX containers for (currentPositionIndex = 0; currentPositionIndex < arrSAXRuleRecords.size(); currentPositionIndex++) { saxContainer = arrSAXRuleRecords.get(currentPositionIndex); resultString = saxContainer.getRuleString(); // here it goes over the rule string iteratively expanding the rules. trick is that rules // start with "R" // workIndex = 0; while (resultString.contains("R")) { resultString = resultString.replaceAll("R" + workIndex + " ", arrSAXRuleRecords.get(workIndex).getRuleString()); if (workIndex == arrSAXRuleRecords.size() - 1) workIndex = 0; ++workIndex; } // need to trim space at the very end saxContainer.setExpandedRuleString(resultString.trim()); } } public synchronized ArrayList<SAXRuleRecord> getSAXContainerList() { return arrSAXRuleRecords; } public void addIndex(int position) { indexes.add(position); } private int[] getIndexes() { int[] res = new int[indexes.size()]; int i = 0; for (Integer idx : indexes) { res[i] = idx; i++; } return res; } public String getGrammarDisplayString() { Vector<SAXRule> rules = new Vector<SAXRule>(numRules.intValue()); SAXRule referedTo; SAXSymbol sym; int index; StringBuffer text = new StringBuffer(); text.append("Usage\tRule\n"); rules.addElement(this); StringBuilder sbCurrentRule = new StringBuilder(); SAXRule currentRule = rules.get(0); for (sym = currentRule.first(); (!sym.isGuard()); sym = sym.n) { if (sym.isNonTerminal()) { referedTo = ((SAXNonTerminal) sym).r; if ((rules.size() > referedTo.index) && (rules.elementAt(referedTo.index) == referedTo)) { index = referedTo.index; } else { index = rules.size(); referedTo.index = index; rules.addElement(referedTo); } text.append('R'); text.append(index); sbCurrentRule.append('R'); sbCurrentRule.append(index); } else { if (sym.value.equals(" ")) { text.append('_'); sbCurrentRule.append('_'); } else { if (sym.value.equals("\n")) { text.append("\\n"); sbCurrentRule.append("\\n"); } else { text.append(sym.value); sbCurrentRule.append(sym.value); } } } text.append(' '); sbCurrentRule.append(' '); } return sbCurrentRule.toString(); } }