/* * Copyright (C) 2010 Google Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.google.clearsilver.jsilver.template; import java.io.IOException; /** * HTML whitespace stripper to be used by JSilver. It removes leading and * trailing whitespace, it reduces contiguous whitespace characters with just * the first character, and removes lines of nothing but whitespace. * * It does not strip whitespace inside the following elements: * <ul> * <li> PRE * <li> VERBATIM * <li> TEXTAREA * <li> SCRIPT * </ul> * It also strips out empty lines and leading whitespace inside HTML tags (i.e. * between '<' and '>') and inside SCRIPT elements. It leaves trailing * whitespace since that is more costly to remove and tends to not be common * based on how templates are created (they don't have trailing whitespace). * <p> * Loadtests indicate that this class can strip whitespace almost as quickly * as just reading every character from a string (20% slower). * <p> * While not strictly compatible with the JNI Clearsilver whitestripping * function, we are not aware of any differences that yield functionally * different HTML output. However, we encourage users to verify for themselves * and report any differences. */ public class HtmlWhiteSpaceStripper implements Appendable { // Object to output stripped content to. private final Appendable out; // Level of whitespace stripping to perform. (Currently not used). // TODO: Determine what the exact differences are in levels in // JNI Clearsilver and see if it is worth porting it. private final int level; // Has any non-whitespace character been seen since the start of the line. private boolean nonWsSeen = false; // Was there previously one or more whitespace chars? If so, we should output // the first whitespace char in the sequence before any other non-whitespace // character. 0 signifies no pending whitespace. private char pendingWs = 0; // We just saw the start of an HTML tag '<'. private boolean startHtmlTag = false; // Are we currently in an opening HTML tag (not "</"). private boolean inOpenTag = false; // Are we currently in a closing HTML tag. private boolean inCloseTag = false; // Are we currently in an HTML tag name. private boolean inTagName = false; // Are we between <textarea> tags private int textAreaScope = 0; // Are we between <pre> tags private int preScope = 0; // Are we between verbatim flags private int verbatimScope = 0; // Are we between <script> tags private int scriptScope = 0; // Used to hold HTML tag element name. private StringBuilder tagName = new StringBuilder(16); /** * Intermediate Appendable object that strips whitespace as it passes through characters to * another Appendable object. * * @param out The Appendable object to dump the stripped output to. */ public HtmlWhiteSpaceStripper(Appendable out) { this(out, 1); } /** * Intermediate Appendable object that strips whitespace as it passes through characters to * another Appendable object. * * @param out The Appendable object to dump the stripped output to. * @param level Ignored for now. */ public HtmlWhiteSpaceStripper(Appendable out, int level) { this.out = out; this.level = level; } @Override public String toString() { return out.toString(); } @Override public Appendable append(CharSequence csq) throws IOException { return append(csq, 0, csq.length()); } @Override public Appendable append(CharSequence csq, int start, int end) throws IOException { for (int i = start; i < end; i++) { append(csq.charAt(i)); } return this; } @Override public Appendable append(char c) throws IOException { if (inOpenTag || inCloseTag) { // In an HTML tag. if (startHtmlTag) { // This is the first character in an HTML tag. if (c == '/') { // We are in a close tag. inOpenTag = false; inCloseTag = true; } else { // This is the first non-'/' character in an HTML tag. startHtmlTag = false; if (isTagNameStartChar(c)) { // we have a valid tag name first char. inTagName = true; tagName.append(c); } } } else if (inTagName) { // We were last parsing the name of an HTML attribute. if (isTagNameChar(c)) { tagName.append(c); } else { processTagName(); inTagName = false; } } if (c == '>') { // We are at the end of the tag. inOpenTag = inCloseTag = false; nonWsSeen = true; } stripLeadingWsAndEmptyLines(c); } else { // Outside of HTML tag. if (c == '<') { // Starting a new HTML tag. inOpenTag = true; startHtmlTag = true; } if (preScope > 0 || verbatimScope > 0 || textAreaScope > 0) { // In an HTML element that we want to preserve whitespace in. out.append(c); } else if (scriptScope > 0) { // Want to remove newlines only. stripLeadingWsAndEmptyLines(c); } else { stripAll(c); } } return this; } private void stripLeadingWsAndEmptyLines(char c) throws IOException { // Detect and delete empty lines. switch (c) { case '\n': if (nonWsSeen) { out.append(c); } nonWsSeen = false; break; case ' ': case '\t': case '\r': if (nonWsSeen) { out.append(c); } break; default: if (!nonWsSeen) { nonWsSeen = true; } out.append(c); } } private void stripAll(char c) throws IOException { // All that remains is content that is safe to remove whitespace from. switch (c) { case '\n': if (nonWsSeen) { // We don't want blank lines so we don't output linefeed unless we // saw non-whitespace. out.append(c); } // We don't want trailing whitespace. pendingWs = 0; nonWsSeen = false; break; case ' ': case '\t': case '\r': if (nonWsSeen) { pendingWs = c; } else { // Omit leading whitespace } break; default: if (pendingWs != 0) { out.append(pendingWs); pendingWs = 0; } nonWsSeen = true; out.append(c); } } private int updateScope(int current, int inc) { current += inc; return current < 0 ? 0 : current; } /** * This code assumes well-formed HTML as input with HTML elements opening and closing properly in * the right order. */ private void processTagName() { inTagName = false; String name = tagName.toString(); tagName.delete(0, tagName.length()); int inc = inOpenTag ? 1 : -1; if ("textarea".equalsIgnoreCase(name)) { textAreaScope = updateScope(textAreaScope, inc); } else if ("pre".equalsIgnoreCase(name)) { preScope = updateScope(preScope, inc); } else if ("verbatim".equalsIgnoreCase(name)) { verbatimScope = updateScope(verbatimScope, inc); } else if ("script".equalsIgnoreCase(name)) { scriptScope = updateScope(scriptScope, inc); } } private boolean isTagNameStartChar(char c) { return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'); } // From W3C HTML spec. private boolean isTagNameChar(char c) { return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || ('0' <= c && c <= '9') || (c == '_') || (c == '-') || (c == ':') || (c == '.'); } /** * Note, we treat '\n' as a separate special character as it has special rules since it determines * what a 'line' of content is for doing leading and trailing whitespace removal and empty line * removal. */ private boolean isWs(char c) { return c == ' ' || c == '\t' || c == '\r'; } }