/** * Copyright 2009 Google Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package org.waveprotocol.wave.model.richtext; import org.waveprotocol.wave.model.document.util.ElementStyleView; import org.waveprotocol.wave.model.richtext.RichTextTokenizer.Type.TypeGroup; import org.waveprotocol.wave.model.util.CollectionUtils; import org.waveprotocol.wave.model.util.StringMap; import java.util.ArrayList; import java.util.List; import java.util.NoSuchElementException; /** * W3C implementation of the RichTextTokenizer. * * TODO(user): Optimize by changing this to a SAX-like implementation * instead of DOM. Ino ther words, iteratively visit every node instead of * pre-processing into a token list. * * TODO(user): Turn the static "isThis, isThat" helper functions * into a more data-driven mapping system. * * TODO(user): Add support for lists, headers, images and custom * style tags on block elements. * */ public class RichTextTokenizerImpl<N, E extends N, T extends N> implements RichTextTokenizer { /** * Internal token class, exposed via get methods on the parent class. */ static class Token { private final Type type; private final String data; Token(Type type) { this(type, null); } Token(Type type, String data) { this.type = type; this.data = data; } Type getType() { return type; } String getData() { return data; } @Override public String toString() { return "(" + type + "," + data + ")"; } } /** * Contains data required to extract a particular type of style token from an * element. */ private static class StyleTokenExtractor { final Type tokenStartType; final Type tokenEndType; final StringMap<String> tagToValue; final String stylePropertyName; private StyleTokenExtractor(Type tokenStartType, Type tokenEndType, StringMap<String> tagToValue, String stylePropertyName) { this.tokenStartType = tokenStartType; this.tokenEndType = tokenEndType; this.tagToValue = tagToValue; this.stylePropertyName = stylePropertyName; } } /** * Mapping from tag names to corresponding token value. */ private static final StringMap<String> fontWeightMap; private static final StringMap<String> fontStyleMap; private static final StringMap<String> textDecorationMap; static { fontWeightMap = CollectionUtils.createStringMap(); fontWeightMap.put("b", "bold"); fontWeightMap.put("strong", "bold"); fontStyleMap = CollectionUtils.createStringMap(); fontStyleMap.put("i", "italic"); fontStyleMap.put("em", "italic"); textDecorationMap = CollectionUtils.createStringMap(); textDecorationMap.put("u", "underline"); } private static final StyleTokenExtractor FONT_WEIGHT_HANDLER = new StyleTokenExtractor(Type.STYLE_FONT_WEIGHT_START, Type.STYLE_FONT_WEIGHT_END, fontWeightMap, "fontWeight"); private static final StyleTokenExtractor FONT_STYLE_HANDLER = new StyleTokenExtractor(Type.STYLE_FONT_STYLE_START, Type.STYLE_FONT_STYLE_END, fontStyleMap, "fontStyle"); private static final StyleTokenExtractor TEXT_DECORATION_HANDLER = new StyleTokenExtractor(Type.STYLE_TEXT_DECORATION_START, Type.STYLE_TEXT_DECORATION_END, textDecorationMap, "textDecoration"); private final List<Token> tokenList; private final int[] activeTokenCounts; protected final ElementStyleView<N, E, T> document; private int tokenIndex = -1; private boolean endBlockPending = false; private E root = null; private boolean mergeNextNewLine = false; /** * Creates a tokenizer and parses the inner contents of an Element. * * @param doc The readable document that will be parsed. */ public RichTextTokenizerImpl(ElementStyleView<N, E, T> doc) { document = doc; tokenList = new ArrayList<Token>(); activeTokenCounts = new int[Type.values().length]; for (int i = 0; i < activeTokenCounts.length; ++i) { activeTokenCounts[i] = 0; } process(doc.getDocumentElement()); } private RichTextTokenizerImpl(RichTextTokenizerImpl<N, E, T> o) { activeTokenCounts = new int[o.activeTokenCounts.length]; for (int i = 0; i < o.activeTokenCounts.length; ++i) { activeTokenCounts[i] = o.activeTokenCounts[i]; } tokenList = new ArrayList<Token>(o.tokenList); document = o.document; tokenIndex = o.tokenIndex; endBlockPending = o.endBlockPending; root = o.root; mergeNextNewLine = o.mergeNextNewLine; } @Override public RichTextTokenizer copy() { return new RichTextTokenizerImpl<N, E, T>(this); } @Override public boolean hasNext() { return tokenIndex < tokenList.size() - 1; } @Override public Type next() { if (!hasNext()) { throw new NoSuchElementException(); } ++tokenIndex; return getCurrentToken().getType(); } @Override public Type getCurrentType() { return getCurrentToken().getType(); } @Override public String getData() { return getCurrentToken().getData(); } private Token getCurrentToken() { if (tokenIndex >= tokenList.size()) { throw new IllegalStateException("No token available."); } return tokenList.get(tokenIndex); } final protected void process(E container) { if (container == null) { throw new IllegalArgumentException(); } root = container; tokenList.clear(); tokenIndex = -1; N prevNode = null; for (N child = document.getFirstChild(root); child != null; child = document.getNextSibling(child)) { processNode(child, prevNode); prevNode = child; } // Sanity check, all counters should be zero. for (int j = 0 ; j < activeTokenCounts.length; ++j) { assert activeTokenCounts[j] == 0; } } private void processNode(N node, N leftSibling) { T textNode = document.asText(node); if (textNode != null) { processTextNode(textNode, leftSibling); return; } E element = asElement(node); if (element != null) { processElement(element, leftSibling); } } final void processTextNode(T textNode, N leftSibling) { if (endBlockPending) { maybeInsertNewline(); endBlockPending = false; } processTextNodeInner(textNode, leftSibling); } protected void processTextNodeInner(T textNode, N leftSibling) { String text = document.getData(textNode); if (!text.isEmpty()) { addTextToken(text); } } final void addTextToken(String text) { StringBuilder builder = new StringBuilder(text.length()); for (int i = 0; i < text.length(); ++i) { char ch = text.charAt(i); if (ch == '\u00a0') { builder.append(' '); } else if (ch == '\t') { builder.append(" "); } else if (ch == '\n') { if (builder.length() != 0) { addToken(new Token(Type.TEXT, builder.toString())); } addToken(new Token(Type.NEW_LINE)); builder = new StringBuilder(text.length() - i); } else { builder.append(ch); } } if (builder.length() != 0) { addToken(new Token(Type.TEXT, builder.toString())); } } private void processElement(E element, N leftSibling) { String tagName = document.getTagName(element).toLowerCase(); List<Type> closingTypeStack = new ArrayList<Type>(); // must only contain annotation closures boolean maybeEndParagraph = false; boolean setEndBlockPending = false; if (isBlockElement(tagName)) { if (isListItem(tagName)) { addToken(new Token(Type.LIST_ITEM)); } else { // TODO(user): This will always ensure that there cannot be // nested paragraphs. However, because <p> and <div> are treated the // same, nested divs will result in extra paragraphs. The solution // is to flatten the divs when no renderable content exists. if (!ignorableBlock(element, tagName)) { maybeInsertNewline(); maybeEndParagraph = true; setEndBlockPending = true; } } } else if (isNewline(tagName)) { // Special case - if this is the last line break in a paragraph, just // ignore it. if (!isLastLinebreak(element)) { addToken(new Token(Type.NEW_LINE)); mergeNextNewLine = false; } } else if (isHeading(tagName)) { addToken(new Token(Type.NEW_LINE, tagName)); maybeEndParagraph = true; setEndBlockPending = true; } else if (isList(tagName)) { putIfNotNull(closingTypeStack, handleListElement(element, tagName)); maybeEndParagraph = false; setEndBlockPending = true; } else if (isTableRelated(tagName)) { // TODO(patcoleman): temporary table rendering, replace with real tables once supported. if (isTable(tagName)) { addToken(new Token(Type.NEW_LINE)); } else if (isTableRow(tagName)) { maybeEndParagraph = true; setEndBlockPending = true; } else if (isTableCell(tagName)) { addToken(new Token(Type.TEXT, " ")); } } else { putIfNotNull(closingTypeStack, handleLinkElement(element, tagName)); } handleStyleElements(element, tagName, closingTypeStack); // Recursively iterate children. N prevNode = null; for (N child = document.getFirstChild(element); child != null; child = document.getNextSibling(child)) { processNode(child, prevNode); prevNode = child; } while (!closingTypeStack.isEmpty()) { Type closingType = closingTypeStack.remove(closingTypeStack.size() - 1); addToken(new Token(closingType)); decrementTypeCounter(closingType); } endBlockPending |= setEndBlockPending; if (maybeEndParagraph && endBlockPending) { maybeInsertNewline(); endBlockPending = false; } } private <E> void putIfNotNull(List<E> list, E item) { if (item != null) { list.add(item); } } private boolean maybeInsertNewline() { // Only add the newline if we didn't just end a block with a close tag. if (!mergeNextNewLine) { addToken(new Token(Type.NEW_LINE)); return true; } return false; } /** * Checks the element for various style properties in its tag name or css styles. * All styles found generate a starting token, plus add an end token to the closing stack. */ private void handleStyleElements(E el, String tagName, List<Type> closeStack) { Type startType = null; Type endType = null; String data = null; // Styles supported here: bold, italic, font colour, background colour, font family. maybeExtractStyleToken(el, tagName, closeStack, FONT_WEIGHT_HANDLER); maybeExtractStyleToken(el, tagName, closeStack, FONT_STYLE_HANDLER); maybeExtractStyleToken(el, tagName, closeStack, TEXT_DECORATION_HANDLER); if (isColor(el)) { String value = document.getStylePropertyValue(el, "color"); addTokenOrIncrement(true, new Token(Type.STYLE_COLOR_START, value), Type.STYLE_COLOR_END); closeStack.add(Type.STYLE_COLOR_END); } if (isBackgroundColor(el)) { String value = document.getStylePropertyValue(el, "backgroundColor"); addTokenOrIncrement(true, new Token(Type.STYLE_BG_COLOR_START, value), Type.STYLE_BG_COLOR_END); closeStack.add(Type.STYLE_BG_COLOR_END); } if (isFontFamily(el)) { String value = document.getStylePropertyValue(el, "fontFamily"); addTokenOrIncrement(true, new Token(Type.STYLE_FONT_FAMILY_START, value), Type.STYLE_FONT_FAMILY_END); closeStack.add(Type.STYLE_FONT_FAMILY_END); } } private Type handleLinkElement(E el, String tagName) { if (isLink(tagName)) { String attr = document.getAttribute(el, "href"); if (attr != null) { addTokenOrIncrement(new Token(Type.LINK_START, attr), Type.LINK_END); return Type.LINK_END; } } return null; } private Type handleListElement(E element, String tagName) { if (isOrderedList(tagName)) { addTokenOrIncrement(true, new Token(Type.ORDERED_LIST_START), Type.ORDERED_LIST_END); return Type.ORDERED_LIST_END; } else if (isUnorderedList(tagName)) { addTokenOrIncrement(true, new Token(Type.UNORDERED_LIST_START), Type.UNORDERED_LIST_END); return Type.UNORDERED_LIST_END; } else { return null; } } private void addToken(Token token) { if (token.getType().isStructural()) { mergeNextNewLine = token.getType().group() == TypeGroup.BLOCK; } tokenList.add(token); } private void addTokenOrIncrement(Token token, Type endType) { addTokenOrIncrement(false, token, endType); } private void addTokenOrIncrement(boolean replace, Token token, Type endType) { if (replace || !isTypeInUse(endType)) { addToken(token); } incrementTypeCounter(endType); } private E asElement(N node) { if (node != null) { return document.asElement(node); } return null; } /** * Checks if this is the last line break before a new paragraph. */ private boolean isLastLinebreak(E element) { N sibling = document.getNextSibling(element); // If we are not the last child and do not border a block element, // then we cannot ignore this linebreak. if (sibling != null) { E el = asElement(sibling); if (el == null || !isBlockElement(document.getTagName(el))) { return false; } } // Make sure we're not deeply nested. return getDepthFromBlock(element) == 1; } private boolean ignorableBlock(E element, String tagName) { // Ignore empty divs. if ("div".equalsIgnoreCase(tagName)) { if (document.getFirstChild(element) == null) { return true; } /* int offsetHeight = element.getOffsetHeight(); if (element.getOffsetHeight() == 0) { return true; } */ } return false; } private int getDepthFromBlock(N node) { int depth = 1; E e = document.getParentElement(node); while (e != null) { if (e == root || isBlockElement(document.getTagName(e))) { break; } depth++; e = document.getParentElement(e); } return depth; } private boolean isTypeInUse(Type type) { return activeTokenCounts[type.ordinal()] > 0; } private void incrementTypeCounter(Type type) { ++activeTokenCounts[type.ordinal()]; } private boolean decrementTypeCounter(Type type) { int newValue = --activeTokenCounts[type.ordinal()]; assert newValue >= 0; return newValue == 0; } private static boolean isBlockElement(String tagName) { return "p".equalsIgnoreCase(tagName) || "div".equalsIgnoreCase(tagName) || isListItem(tagName); } private static boolean isNewline(String tagName) { return "br".equalsIgnoreCase(tagName); } private void maybeExtractStyleToken(E el, String tagName, List<Type> closeStack, StyleTokenExtractor tokenHandler) { String styleProperty = getStyleProperty(el, tagName, tokenHandler); if (styleProperty != null) { addTokenOrIncrement(true, new Token(tokenHandler.tokenStartType, styleProperty), tokenHandler.tokenEndType); closeStack.add(tokenHandler.tokenEndType); } } private String getStyleProperty(E el, String tagName, StyleTokenExtractor tokenHandler) { String value = document.getStylePropertyValue(el, tokenHandler.stylePropertyName); if (value != null && !value.isEmpty()) { return value; } String lowerCaseTag = tagName.toLowerCase(); return tokenHandler.tagToValue.containsKey(lowerCaseTag) ? tokenHandler.tagToValue .get(lowerCaseTag) : null; } private boolean isColor(E el) { return isStylePropertySet(el, "color"); } private boolean isBackgroundColor(E el) { return isStylePropertySet(el, "backgroundColor"); } private boolean isFontFamily(E el) { return isStylePropertySet(el, "fontFamily"); } private static boolean isHeading(String tagName) { if (tagName.length() == 2) { if (tagName.charAt(0) == 'h' || tagName.charAt(0) == 'H') { int size = tagName.charAt(1) - '0'; if (size >= 1 && size <= 4) { return true; } } } return false; } private static boolean isLink(String tagName) { return "a".equalsIgnoreCase(tagName); } private static boolean isListItem(String tagName) { return "li".equalsIgnoreCase(tagName); } private static boolean isList(String tagName) { return isOrderedList(tagName) || isUnorderedList(tagName); } private static boolean isOrderedList(String tagName) { return "ol".equalsIgnoreCase(tagName); } private static boolean isUnorderedList(String tagName) { return "ul".equalsIgnoreCase(tagName); } private static boolean isTable(String tagName) { return "table".equalsIgnoreCase(tagName); } private static boolean isTableRow(String tagName) { return "tr".equalsIgnoreCase(tagName); } private static boolean isTableCell(String tagName) { return "th".equalsIgnoreCase(tagName) || "td".equalsIgnoreCase(tagName); } private static boolean isTableRelated(String tagName) { // TODO(patcoleman): fix up table implementation once tables supported in the editor. // When this happens, also extract out strings into symbolic constants. return isTable(tagName) || isTableRow(tagName) || isTableCell(tagName) || "thead".equalsIgnoreCase(tagName) || "tbody".equalsIgnoreCase(tagName); } private boolean isStylePropertySet(E el, String property) { String value = document.getStylePropertyValue(el, property); return value != null && !value.isEmpty(); } @Override public String toString() { return tokenList.toString(); } }