/******************************************************************************* * Copyright (c) 2004, 2008 John Krasnay and others. * All rights reserved. This program and the accompanying materials * are made available under the terms of the Eclipse Public License v1.0 * which accompanies this distribution, and is available at * http://www.eclipse.org/legal/epl-v10.html * * Contributors: * John Krasnay - initial API and implementation *******************************************************************************/ package net.sf.vex.dom; import java.util.LinkedList; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.Locator; import org.xml.sax.SAXException; import org.xml.sax.SAXParseException; import org.xml.sax.ext.LexicalHandler; /** * A SAX handler that builds a Vex document. This builder collapses whitespace * as it goes, according to the following rules. * * <ul> * <li>Elements with style white-space: pre are left alone.</li> * <li>Runs of whitespace are replaced with a single space.</li> * <li>Space just inside the start and end of elements is removed.</li> * <li>Space just outside the start and end of block-formatted elements * is removed.</li> * </ul> */ public class DocumentBuilder implements ContentHandler, LexicalHandler { /** * Class constructor. * @param policyFactory Used to determine the WhitespacePolicy to use * for a given document type. */ public DocumentBuilder(IWhitespacePolicyFactory policyFactory) { this.policyFactory = policyFactory; } /** * Returns the newly built <code>Document</code> object. */ public Document getDocument() { return this.doc; } //============================================= ContentHandler methods public void characters(char[] ch, int start, int length) throws SAXException { // Convert nuls to spaces, since we use nulls for element delimiters char[] chars = new char[length]; System.arraycopy(ch, start, chars, 0, length); for (int i = 0; i < chars.length; i++) { if (Character.isISOControl(chars[i]) && chars[i] != '\n' && chars[i] != '\r') { chars[i] = ' '; } } this.pendingChars.append(chars); } public void endDocument() { this.doc = new Document(this.content, this.rootElement); this.doc.setPublicID(this.dtdPublicID); this.doc.setSystemID(this.dtdSystemID); this.rootElement.setDocument(this.doc); } public void endElement(String namespaceURI, String localName, String qName) { this.appendChars(true); StackEntry entry = (StackEntry) this.stack.removeLast(); // we must insert the trailing sentinel first, else the insertion // pushes the end position of the element to after the sentinel this.content.insertString(content.getLength(), "\0"); entry.element.setContent(this.content, entry.offset, content.getLength() - 1); if (this.isBlock(entry.element)) { this.trimLeading = true; } } public void endPrefixMapping(java.lang.String prefix) { } public void ignorableWhitespace(char[] ch, int start, int length) { } public void processingInstruction(String target, String data) { } public void setDocumentLocator(Locator locator) { this.locator = locator; } public void skippedEntity(java.lang.String name) { } public void startDocument() { } public void startElement(String namespaceURI, String localName, String qName, Attributes attrs) throws SAXException { try { Element element; if (stack.size() == 0) { this.rootElement = new RootElement(qName); element = this.rootElement; if (this.policyFactory != null) { this.policy = this.policyFactory.getPolicy(this.dtdPublicID); } } else { element = new Element(qName); Element parent = ((StackEntry) stack.getLast()).element; parent.addChild(element); } int n = attrs.getLength(); for (int i = 0; i < n; i++) { element.setAttribute(attrs.getQName(i), attrs.getValue(i)); } this.appendChars(this.isBlock(element)); stack.add(new StackEntry(element, content.getLength(), this.isPre(element))); content.insertString(content.getLength(), "\0"); this.trimLeading = true; } catch (DocumentValidationException ex) { throw new SAXParseException("DocumentValidationException", this.locator, ex); } } public void startPrefixMapping(String prefix, String uri) { } //============================================== LexicalHandler methods public void comment(char[] ch, int start, int length) { } public void endCDATA() { } public void endDTD() { } public void endEntity(String name) { } public void startCDATA() { } public void startDTD(String name, String publicId, String systemId) { this.dtdPublicID = publicId; this.dtdSystemID = systemId; } public void startEntity(java.lang.String name) { } //======================================================== PRIVATE private IWhitespacePolicyFactory policyFactory; private IWhitespacePolicy policy; // Holds pending characters until we see another element boundary. // This is (a) so we can collapse spaces in multiple adjacent character // blocks, and (b) so we can trim trailing whitespace, if necessary. private StringBuffer pendingChars = new StringBuffer(); // If true, trim the leading whitespace from the next received block of // text. private boolean trimLeading = false; // Content object to hold document content private Content content = new GapContent(100); // Stack of StackElement objects private LinkedList stack = new LinkedList(); private RootElement rootElement; private String dtdPublicID; private String dtdSystemID; private Document doc; private Locator locator; // Append any pending characters to the content private void appendChars(boolean trimTrailing) { StringBuffer sb; StackEntry entry = this.stack.size() > 0 ? (StackEntry) this.stack.getLast() : null; if (entry != null && entry.pre) { sb = this.pendingChars; } else{ // collapse the space in the pending characters sb = new StringBuffer(this.pendingChars.length()); boolean ws = false; // true if we're in a run of whitespace for (int i = 0; i < this.pendingChars.length(); i++) { char c = this.pendingChars.charAt(i); if (Character.isWhitespace(c)) { ws = true; } else { if (ws) { sb.append(' '); ws = false; } sb.append(c); } } if (ws) { sb.append(' '); } // trim leading and trailing space, if necessary if (this.trimLeading && sb.length() > 0 && sb.charAt(0) == ' ') { sb.deleteCharAt(0); } if (trimTrailing && sb.length() > 0 && sb.charAt(sb.length() - 1) == ' ') { sb.setLength(sb.length() - 1); } } this.normalizeNewlines(sb); this.content.insertString(this.content.getLength(), sb.toString()); this.pendingChars.setLength(0); this.trimLeading = false; } private boolean isBlock(Element element) { return this.policy != null && this.policy.isBlock(element); } private boolean isPre(Element element) { return this.policy != null && this.policy.isPre(element); } /** * Convert lines that end in CR and CRLFs to plain newlines. * * @param sb StringBuffer to be normalized. */ private void normalizeNewlines(StringBuffer sb) { // State machine states final int START = 0; final int SEEN_CR = 1; int state = START; int i = 0; while (i < sb.length()) { // No simple 'for' here, since we may delete chars char c = sb.charAt(i); switch (state) { case START: if (c == '\r') { state = SEEN_CR; } i++; break; case SEEN_CR: if (c == '\n') { // CR-LF, just delete the previous CR sb.deleteCharAt(i - 1); state = START; // no need to advance i, since it's done implicitly } else if (c == '\r') { // CR line ending followed by another // Replace the first with a newline... sb.setCharAt(i - 1, '\n'); i++; // ...and stay in the SEEN_CR state } else { // CR line ending, replace it with a newline sb.setCharAt(i - 1, '\n'); i++; state = START; } } } if (state == SEEN_CR) { // CR line ending, replace it with a newline } } private static class StackEntry { public Element element; public int offset; public boolean pre; public StackEntry(Element element, int offset, boolean pre) { this.element = element; this.offset = offset; this.pre = pre; } } }