/* This file is part of the Joshua Machine Translation System. * * Joshua is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 * of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, * MA 02111-1307 USA */ package joshua.decoder.segment_file.sax_parser; import joshua.decoder.segment_file.TypeCheckingException; import joshua.decoder.segment_file.SegmentFileParser; import joshua.decoder.segment_file.Segment; import joshua.decoder.segment_file.ConstraintSpan; import joshua.decoder.segment_file.ConstraintRule; import joshua.util.Regex; import joshua.util.CoIterator; import org.xml.sax.Attributes; import org.xml.sax.Locator; import org.xml.sax.SAXParseException; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import java.io.InputStream; import java.io.FileInputStream; import java.io.File; import java.io.IOException; import java.util.LinkedList; import java.util.List; import java.util.EmptyStackException; import java.util.Stack; import java.util.logging.Level; import java.util.logging.Logger; // MAJOR TODO: We need to figure out how to make a *decent* Locator // object for our InputStream. This will allow us to (meaningfully) // construct SAXParseExceptions and call the handler methods on our // ErrorHandler interface (which should be updated to print location // info). As it stands, we use the interface for logging, but we // have no location info. // // TODO: we should also define helper methods akin to the ErrorHandler // interface, but which just take a message string and do all the // boilerplate for us. See, in particular, the bug notes about // mutable/volatile Locators. /** * This is a Xerces SAX parser for parsing files in the format * specified by SegmentFile.dtd. All extraneous tags and text are * ignored (they raise warnings, but do not halt the parser). * * @author wren ng thornton <wren@users.sourceforge.net> * @version $LastChangedDate: 2009-03-26 15:06:57 -0400 (Thu, 26 Mar 2009) $ */ public class SAXSegmentParser extends DefaultHandler implements SegmentFileParser { /** Co-iterator for consuming output. */ private CoIterator<Segment> coit; /** * A Locater (or equivalent information) is needed to create * SAXParseExceptions. If this object is mutable or volatile * then it should not be shared by multiple SAXParseExceptions, * since that would allow locations to move out from under * the exception. See bug notes below. */ private Locator locator; // For maintaining context private boolean seenRootTag = false; private Stack<StringBuffer> tempText; private SAXSegment tempSeg; private SAXConstraintSpan tempSpan; private SAXConstraintRule tempRule; private static final Logger logger = Logger.getLogger(SAXSegmentParser.class.getName()); public SAXSegmentParser() { this.tempText = new Stack<StringBuffer>(); } //=============================================================== // SegmentFileParser Methods //=============================================================== public void parseSegmentFile(InputStream in, CoIterator<Segment> coit) throws IOException { if (null == coit) { // Maybe just return immediately instead? // Maybe use NullPointerException instead? throw new IllegalArgumentException("null co-iterator"); } this.coit = coit; // TODO: maybe ensure that this.tempText is the empty Stack? SAXParserFactory spf = SAXParserFactory.newInstance(); try { try { SAXParser sp = spf.newSAXParser(); // FIXME: this Locator is trivial, returning the "unavailable" value // for all methods. This works because we never actually call these // methods ourselves, but it's circumventing the very idea of Locator // just so that we can use the ErrorHandler interface. // // BUG: also, if the returned values ever change, it's buggy to // share this among different errors. We should always generate a // new one at exception-throwing time based on the InputStream state. this.locator = new Locator() { public String getPublicId() { return null; } public String getSystemId() { return null; } public int getLineNumber() { return -1; } public int getColumnNumber() { return -1; } }; sp.parse(in, this); // FIXME: see the bug notes above and below. // final LocatorInputStream lin = new LocatorInputStream(in); // this.locator = new Locator() { // public String getPublicId() { return null; } // FIXME // public String getSystemId() { return null; } // FIXME // public int getLineNumber() { return lin.getLineNumber(); } // public int getColumnNumber() { return lin.getColumnNumber(); } // }; // sp.parse(lin, this); // // BUG: the SAXParser (or FileInputStream?) reads in a huge buffer // at a time, so the line/column numbers from LocatorInputStream // will be very wrong, even moreso than specified by the Locator // interface. I'm not sure how we can actually determine where in // the buffer the SAXParser is when it sends us the sax events that // cause errors... // } catch (SAXParseException e) { // // TODO: something better // IOException ioe = new IOException( // "SAXParseException before line " // + e.getLineNumber() // + ", column " + e.getColumnNumber() // + ": " + e.getMessage()); // ioe.initCause(e); // throw ioe; } catch (SAXException e) { // other than SAXParseException // TODO: something better IOException ioe = new IOException( "SAXException: " + e.getMessage()); ioe.initCause(e); throw ioe; } catch (ParserConfigurationException e) { // TODO: something better IOException ioe = new IOException( "ParserConfigurationException: " + e.getMessage()); ioe.initCause(e); throw ioe; } // BUG: Do we need to close() the InputStream, or does parse() handle that? Or should clients handle that? } finally { coit.finish(); } } //=============================================================== // org.xml.sax.ContentHandler non-default event handlers // (overriding DefaultHandler) //=============================================================== // BUG: we need to deal with uri+localName vs qName, in // case anyone messes around with XML namespaces. (Assuming // they're always absent, like we do, is a bug.) But that'll // require putting SegmentFile.dtd in a public known location // (with version numbers!) so people can refer to it; too // much bother for now. public void startElement( String uri, String localName, String qName, Attributes attributes) throws SAXException { this.tempText.push(new StringBuffer()); if (! this.seenRootTag) { // Flag for so we don't warn on seeing the root tag // This is only because it's not specified in the DTD this.seenRootTag = true; } else if ("seg".equalsIgnoreCase(qName)) { String id = attributes.getValue("id"); if (null == id) { this.error(new SAXParseException( "Missing 'id' attribute for tag <seg>", this.locator)); } else { this.tempSeg = new SAXSegment(id); } } else if ("span".equalsIgnoreCase(qName)) { // TODO: helper method to combine these two blocks int start; { String startString = attributes.getValue("start"); if (null == startString) { this.error(new SAXParseException( "Missing 'start' attribute for tag <span>", this.locator)); } try { start = Integer.parseInt(startString); } catch (NumberFormatException e) { this.error(new SAXParseException( "Malformed 'start' attribute for tag <span>. Must be an integer, found: " + startString, this.locator, e)); // Unreachable, but javac fails if this isn't here start = -1; } } int end; { String endString = attributes.getValue("end"); if (null == endString) { this.error(new SAXParseException( "Missing 'end' attribute for tag <span>", this.locator)); } try { end = Integer.parseInt(endString); } catch (NumberFormatException e) { this.error(new SAXParseException( "Malformed 'end' attribute for tag <span>. Must be an integer, found: " + endString, this.locator, e)); // Unreachable, but javac fails if this isn't here end = -1; } } boolean hard; { String hardString = attributes.getValue("hard"); // Boolean.parseBoolean is too permissive if (null == hardString) { hard = false; } else if ("true".equalsIgnoreCase(hardString)) { hard = true; } else if ("false".equalsIgnoreCase(hardString)) { hard = false; } else { this.error(new SAXParseException( "Malformed 'hard' attribute for tag <span>. Must be \"true\" or \"false\", found: " + hardString, this.locator)); // Unreachable, but javac fails if this isn't here hard = false; } } try { this.tempSpan = new SAXConstraintSpan(start, end, hard); } catch (TypeCheckingException e) { this.error(new SAXParseException(null, this.locator, e)); } } else if ("constraint".equalsIgnoreCase(qName)) { this.tempRule = new SAXConstraintRule(); } else if ("lhs".equalsIgnoreCase(qName)) { // TODO: anything? } else if ("rhs".equalsIgnoreCase(qName)) { this.tempRule.setFeatures( attributes.getValue("features")); // #IMPLIED, no check for null } else { logger.warning("Skipping unknown tag: " + qName); } } public void characters(char[] ch, int start, int length) throws SAXException { try { this.tempText.peek().append(new String(ch, start, length)); } catch (EmptyStackException e) { // TODO: maybe we should throw an Error instead? this.fatalError(new SAXParseException( "The impossible happened", this.locator, e)); } } // TODO: this *is* ignorable afterall, should we just ignore it? public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { this.characters(ch, start, length); } public void endElement(String uri, String localName, String qName) throws SAXException { try { String text = this.tempText.peek().toString(); // BUG: debug for pushing nulls due to malformed files if ("seg".equalsIgnoreCase(qName)) { if (null == this.tempSeg) { this.fatalError(new SAXParseException( "Found </seg> but segment was null (missing root tag?)", this.locator)); } else { try { Segment seg = this.tempSeg.typeCheck(text); this.tempSeg = null; this.coit.coNext(seg); } catch (TypeCheckingException e) { this.error(new SAXParseException( null, this.locator, e)); } } } else if ("span".equalsIgnoreCase(qName)) { ignoringTextWarning(qName, text); this.tempSeg.addSpan(this.tempSpan); this.tempSpan = null; } else if ("constraint".equalsIgnoreCase(qName)) { ignoringTextWarning(qName, text); this.tempSpan.addRule(this.tempRule); this.tempRule = null; } else if ("lhs".equalsIgnoreCase(qName)) { this.tempRule.setLhs(text); } else if ("rhs".equalsIgnoreCase(qName)) { this.tempRule.setRhs(text); } else { ignoringTextWarning(qName, text); } this.tempText.pop(); } catch (EmptyStackException e) { // TODO: maybe we should throw an Error instead? this.fatalError(new SAXParseException( "The impossible happened", this.locator, e)); } } private static final Regex WHITESPACE_ONLY = new Regex("^\\s*$"); private void ignoringTextWarning(String qName, String text) { if (logger.isLoggable(Level.WARNING) && ! WHITESPACE_ONLY.matches(text)) { String cleanText = Regex.spaces.replaceAll(text, " ").trim(); logger.warning( "Ignoring extraneous text in <" + qName + ">: " + cleanText); } } //=============================================================== // org.xml.sax.ErrorHandler event handlers (overriding DefaultHandler) //=============================================================== // TODO: print the location info in the SAXParseException as well /** * Respond to recoverable warnings. */ public void warning(SAXParseException e) throws SAXException { if (logger.isLoggable(Level.WARNING)) logger.warning(e.getMessage()); } /** * Respond to recoverable errors like validity violations. */ public void error(SAXParseException e) throws SAXException { // FIXME: is that the right logging level? if (logger.isLoggable(Level.WARNING)) logger.warning(e.getMessage()); throw e; } /** * Respond to non-recoverable errors like well-formedness * violations. */ public void fatalError(SAXParseException e) throws SAXException { if (logger.isLoggable(Level.SEVERE)) logger.severe(e.getMessage()); throw e; } //=============================================================== // Main method (for debugging, should be moved to ./test/ somewhere) //=============================================================== public static void main(String[] args) throws IOException { if (1 != args.length) { System.out.println("Usage: java SAXSegmentParser segmentFile.xml"); System.exit(1); } final List<Segment> segments = new LinkedList<Segment>(); new SAXSegmentParser().parseSegmentFile( new FileInputStream(new File(args[0])), new CoIterator<Segment>() { public void coNext(Segment segment) { segments.add(segment); } public void finish() {} }); int segs = 0; for (Segment s : segments) { ++segs; System.out.println(s.sentence()); int spans = 0; for (ConstraintSpan span : s.constraints()) { ++spans; System.out.println( "<span start=\"" + span.start() + "\" end=\"" + span.end() + "\" hard=\"" + span.isHard() + "\">"); int rs = 0; for (ConstraintRule rule : span.rules()) { ++rs; System.out.println( "<lhs>" + rule.lhs() // FIXME: array printing is buggy in Java + "</lhs><rhs features=\"" + rule.features() + "\">" + rule.foreignRhs() + " ||| " + rule.nativeRhs() + "</rhs>"); } System.out.println("# Parsed " + rs + " rules for this span"); System.out.println("</span>"); } System.out.println("# Parsed " + spans + " spans for this segment"); System.out.println(""); } System.out.println("# Parsed " + segs + " segments"); } }