// RichTextTokenizerImpl.java example
//
// Explorer
// WaveInCloud-master
/**
 * Copyright 2009 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package org.waveprotocol.wave.model.richtext;

import org.waveprotocol.wave.model.document.util.ElementStyleView;
import org.waveprotocol.wave.model.richtext.RichTextTokenizer.Type.TypeGroup;
import org.waveprotocol.wave.model.util.CollectionUtils;
import org.waveprotocol.wave.model.util.StringMap;

import java.util.ArrayList;
import java.util.List;
import java.util.NoSuchElementException;

/**
 * W3C implementation of the RichTextTokenizer.
 *
 * TODO(user): Optimize by changing this to a SAX-like implementation
 * instead of DOM. In other words, iteratively visit every node instead of
 * pre-processing into a token list.
 *
 * TODO(user): Turn the static "isThis, isThat" helper functions
 * into a more data-driven mapping system.
 *
 * TODO(user): Add support for lists, headers, images and custom
 * style tags on block elements.
 *
 */
public class RichTextTokenizerImpl<N, E extends N, T extends N> implements RichTextTokenizer {
  /**
   * Immutable pairing of a token type with an optional payload string. The
   * enclosing tokenizer surfaces the contents of the current token through its
   * own accessor methods.
   */
  static class Token {
    /** Kind of token. */
    private final Type type;

    /** Payload of the token; {@code null} when none was supplied. */
    private final String data;

    /** Creates a token of the given type that carries no payload. */
    Token(Type type) {
      this.type = type;
      this.data = null;
    }

    /** Creates a token of the given type carrying {@code data} as payload. */
    Token(Type type, String data) {
      this.type = type;
      this.data = data;
    }

    /** Returns the kind of this token. */
    Type getType() {
      return this.type;
    }

    /** Returns the payload, which may be {@code null}. */
    String getData() {
      return this.data;
    }

    /** Renders the token as {@code (type,data)}. */
    @Override
    public String toString() {
      StringBuilder text = new StringBuilder();
      text.append('(').append(type).append(',').append(data).append(')');
      return text.toString();
    }
  }

  /**
   * Immutable bundle of everything needed to recognise one kind of style on an
   * element: the start/end token types to emit, the tag names that imply the
   * style, and the css property that may carry it explicitly.
   */
  private static class StyleTokenExtractor {
    /** Token type emitted where the style begins. */
    final Type tokenStartType;

    /** Token type emitted where the style ends. */
    final Type tokenEndType;

    /** Maps a tag name to the style value that tag implies. */
    final StringMap<String> tagToValue;

    /** Name of the css style property consulted for an explicit value. */
    final String stylePropertyName;

    private StyleTokenExtractor(Type tokenStartType, Type tokenEndType,
        StringMap<String> tagToValue, String stylePropertyName) {
      this.stylePropertyName = stylePropertyName;
      this.tagToValue = tagToValue;
      this.tokenEndType = tokenEndType;
      this.tokenStartType = tokenStartType;
    }
  }

  /**
   * Mapping from tag names to corresponding token value.
   */
  private static final StringMap<String> fontWeightMap;
  private static final StringMap<String> fontStyleMap;
  private static final StringMap<String> textDecorationMap;

  static {
    fontWeightMap = CollectionUtils.createStringMap();
    fontWeightMap.put("b", "bold");
    fontWeightMap.put("strong", "bold");

    fontStyleMap = CollectionUtils.createStringMap();
    fontStyleMap.put("i", "italic");
    fontStyleMap.put("em", "italic");

    textDecorationMap = CollectionUtils.createStringMap();
    textDecorationMap.put("u", "underline");
  }

  private static final StyleTokenExtractor FONT_WEIGHT_HANDLER =
      new StyleTokenExtractor(Type.STYLE_FONT_WEIGHT_START, Type.STYLE_FONT_WEIGHT_END,
          fontWeightMap, "fontWeight");

  private static final StyleTokenExtractor FONT_STYLE_HANDLER =
      new StyleTokenExtractor(Type.STYLE_FONT_STYLE_START, Type.STYLE_FONT_STYLE_END, fontStyleMap,
          "fontStyle");

  private static final StyleTokenExtractor TEXT_DECORATION_HANDLER =
      new StyleTokenExtractor(Type.STYLE_TEXT_DECORATION_START, Type.STYLE_TEXT_DECORATION_END,
          textDecorationMap, "textDecoration");

  /** Tokens produced by {@code process}, in the order they were emitted. */
  private final List<Token> tokenList;

  /**
   * One counter per {@code Type} constant, indexed by {@code Type.ordinal()}.
   * The counter of an END type is incremented each time a matching start token
   * is requested and decremented when the end token is emitted.
   */
  private final int[] activeTokenCounts;

  /** Document view used to walk nodes and read tag names, attributes and styles. */
  protected final ElementStyleView<N, E, T> document;

  /** Index into {@code tokenList} of the current token; -1 until {@code next()} is called. */
  private int tokenIndex = -1;

  /**
   * True when a processed element requested a block break that has not been
   * emitted yet; the next text node or paragraph-closing element attempts to
   * emit it through {@code maybeInsertNewline()}.
   */
  private boolean endBlockPending = false;

  /**
   * Container most recently passed to {@code process}; also bounds the
   * ancestor walk in {@code getDepthFromBlock}.
   */
  private E root = null;

  /**
   * When true, {@code maybeInsertNewline()} emits nothing. Updated by
   * {@code addToken} for every structural token (true iff the token's group is
   * {@code TypeGroup.BLOCK}) and cleared after an explicit line-break element.
   */
  private boolean mergeNextNewLine = false;

  /**
   * Creates a tokenizer and immediately tokenizes the children of the document
   * element of {@code doc}.
   *
   * @param doc The readable document that will be parsed.
   */
  public RichTextTokenizerImpl(ElementStyleView<N, E, T> doc) {
    document = doc;
    tokenList = new ArrayList<Token>();
    // A freshly allocated int[] is already zero-filled, so the counters need
    // no explicit reset.
    activeTokenCounts = new int[Type.values().length];
    process(doc.getDocumentElement());
  }

  /**
   * Copy constructor: snapshots the counters, token list and cursor state of
   * {@code o}. The token list and counter array are copied; the tokens
   * themselves and the document view are shared.
   *
   * @param o the tokenizer to copy
   */
  private RichTextTokenizerImpl(RichTextTokenizerImpl<N, E, T> o) {
    activeTokenCounts = new int[o.activeTokenCounts.length];
    System.arraycopy(o.activeTokenCounts, 0, activeTokenCounts, 0, activeTokenCounts.length);
    tokenList = new ArrayList<Token>(o.tokenList);
    document = o.document;
    tokenIndex = o.tokenIndex;
    endBlockPending = o.endBlockPending;
    root = o.root;
    mergeNextNewLine = o.mergeNextNewLine;
  }

  /**
   * Returns a new tokenizer holding a snapshot of this one's tokens and
   * current position.
   */
  @Override
  public RichTextTokenizer copy() {
    RichTextTokenizerImpl<N, E, T> duplicate = new RichTextTokenizerImpl<N, E, T>(this);
    return duplicate;
  }

  /** Returns whether a token exists after the current position. */
  @Override
  public boolean hasNext() {
    final int following = tokenIndex + 1;
    return following < tokenList.size();
  }

  /**
   * Advances to the next token and returns its type.
   *
   * @throws NoSuchElementException if no further token exists
   */
  @Override
  public Type next() {
    if (hasNext()) {
      tokenIndex++;
      return getCurrentToken().getType();
    }
    throw new NoSuchElementException();
  }

  /** Returns the type of the token at the current position. */
  @Override
  public Type getCurrentType() {
    final Token current = getCurrentToken();
    return current.getType();
  }

  /** Returns the payload of the token at the current position; may be {@code null}. */
  @Override
  public String getData() {
    final Token current = getCurrentToken();
    return current.getData();
  }

  /**
   * Returns the token at the current position.
   *
   * @throws IllegalStateException if the cursor is not on a token, which
   *         includes the initial state before the first call to {@code next()}
   */
  private Token getCurrentToken() {
    // tokenIndex starts at -1, so the lower bound has to be checked as well;
    // otherwise the list lookup fails with an IndexOutOfBoundsException.
    if (tokenIndex < 0 || tokenIndex >= tokenList.size()) {
      throw new IllegalStateException("No token available.");
    }
    return tokenList.get(tokenIndex);
  }

  /**
   * Discards any existing tokens, rewinds the cursor and tokenizes the
   * children of {@code container}, which becomes the new root.
   *
   * @param container element whose children are tokenized
   * @throws IllegalArgumentException if {@code container} is {@code null}
   */
  protected final void process(E container) {
    if (container == null) {
      throw new IllegalArgumentException();
    }
    tokenIndex = -1;
    tokenList.clear();
    root = container;

    N previous = null;
    N current = document.getFirstChild(root);
    while (current != null) {
      processNode(current, previous);
      previous = current;
      current = document.getNextSibling(current);
    }

    // Every start token must have been matched by its end token by now.
    for (int openCount : activeTokenCounts) {
      assert openCount == 0;
    }
  }

  /**
   * Dispatches a node to text or element processing. Nodes that are neither
   * text nor element produce no tokens.
   *
   * @param node the node to tokenize
   * @param leftSibling the node processed just before {@code node} under the
   *        same parent, or {@code null} for the first child
   */
  private void processNode(N node, N leftSibling) {
    T text = document.asText(node);
    if (text == null) {
      E element = asElement(node);
      if (element != null) {
        processElement(element, leftSibling);
      }
    } else {
      processTextNode(text, leftSibling);
    }
  }

  /**
   * Flushes a pending block break, if any, and then tokenizes the text node
   * through {@code processTextNodeInner}.
   */
  final void processTextNode(T textNode, N leftSibling) {
    if (endBlockPending) {
      endBlockPending = false;
      maybeInsertNewline();
    }
    processTextNodeInner(textNode, leftSibling);
  }

  /**
   * Converts the character data of a text node into tokens. This
   * implementation forwards non-empty data to {@code addTextToken} and does not
   * read {@code leftSibling}.
   *
   * @param textNode the text node to tokenize
   * @param leftSibling the preceding sibling node, or {@code null}
   */
  protected void processTextNodeInner(T textNode, N leftSibling) {
    final String contents = document.getData(textNode);
    if (contents.isEmpty()) {
      return;
    }
    addTextToken(contents);
  }

  /**
   * Splits {@code text} on line feeds and emits alternating TEXT and NEW_LINE
   * tokens. Within a run of text, a non-breaking space becomes a plain space
   * and a tab becomes four spaces. Empty runs produce no TEXT token.
   */
  final void addTextToken(String text) {
    final int length = text.length();
    StringBuilder pending = new StringBuilder(length);
    for (int pos = 0; pos < length; pos++) {
      final char c = text.charAt(pos);
      switch (c) {
        case '\u00a0':
          pending.append(' ');
          break;
        case '\t':
          pending.append("    ");
          break;
        case '\n':
          // Flush what has been collected so far, then emit the line break.
          if (pending.length() > 0) {
            addToken(new Token(Type.TEXT, pending.toString()));
          }
          addToken(new Token(Type.NEW_LINE));
          pending = new StringBuilder(length - pos);
          break;
        default:
          pending.append(c);
          break;
      }
    }
    if (pending.length() > 0) {
      addToken(new Token(Type.TEXT, pending.toString()));
    }
  }

  /**
   * Emits the tokens for one element and, recursively, for its children.
   *
   * <p>The work happens in four steps: (1) tokens implied by the tag itself
   * (list item, newline, heading, list, table or link), (2) style start tokens
   * derived from the tag name or css properties, (3) the children, and (4) the
   * end tokens for everything opened in steps 1 and 2, in reverse order,
   * followed by an optional paragraph-ending newline.
   *
   * @param element the element to tokenize
   * @param leftSibling the node preceding {@code element} under the same
   *        parent, or {@code null}; not read by this method
   */
  private void processElement(E element, N leftSibling) {
    // NOTE(review): toLowerCase() without a Locale uses the default locale;
    // confirm that is acceptable for tag names.
    String tagName = document.getTagName(element).toLowerCase();

    // End types to emit, last-in first-out, once the children have been processed.
    List<Type> closingTypeStack = new ArrayList<Type>(); // must only contain annotation closures
    // Whether a pending block break should be flushed right after this element.
    boolean maybeEndParagraph = false;

    // Whether this element leaves a block break pending for whatever follows it.
    boolean setEndBlockPending = false;

    if (isBlockElement(tagName)) {
      if (isListItem(tagName)) {
        addToken(new Token(Type.LIST_ITEM));
      } else {
        // TODO(user): This will always ensure that there cannot be
        // nested paragraphs. However, because <p> and <div> are treated the
        // same, nested divs will result in extra paragraphs. The solution
        // is to flatten the divs when no renderable content exists.
        if (!ignorableBlock(element, tagName)) {
          maybeInsertNewline();
          maybeEndParagraph = true;
          setEndBlockPending = true;
        }
      }
    } else if (isNewline(tagName)) {
      // Special case - if this is the last line break in a paragraph, just
      // ignore it.
      if (!isLastLinebreak(element)) {
        addToken(new Token(Type.NEW_LINE));
        // An explicit line break must not suppress the next automatic newline.
        mergeNextNewLine = false;
      }
    } else if (isHeading(tagName)) {
      // Headings are emitted as a newline token carrying the tag name as data.
      addToken(new Token(Type.NEW_LINE, tagName));
      maybeEndParagraph = true;
      setEndBlockPending = true;

    } else if (isList(tagName)) {
      putIfNotNull(closingTypeStack, handleListElement(element, tagName));
      maybeEndParagraph = false;
      setEndBlockPending = true;

    } else if (isTableRelated(tagName)) {
      // TODO(patcoleman): temporary table rendering, replace with real tables once supported.
      // A table starts a new line, a row ends a paragraph, and each cell
      // contributes a single space; thead/tbody emit nothing themselves.
      if (isTable(tagName)) {
        addToken(new Token(Type.NEW_LINE));
      } else if (isTableRow(tagName)) {
        maybeEndParagraph = true;
        setEndBlockPending = true;
      } else if (isTableCell(tagName)) {
        addToken(new Token(Type.TEXT, " "));
      }

    } else {
      // Any other tag is only checked for being a link.
      putIfNotNull(closingTypeStack, handleLinkElement(element, tagName));
    }

    // Style tokens are considered for every element, whatever branch ran above.
    handleStyleElements(element, tagName, closingTypeStack);

    // Recursively iterate children.
    N prevNode = null;
    for (N child = document.getFirstChild(element); child != null;
        child = document.getNextSibling(child)) {
      processNode(child, prevNode);
      prevNode = child;
    }

    // Close everything this element opened, innermost first.
    while (!closingTypeStack.isEmpty()) {
      Type closingType = closingTypeStack.remove(closingTypeStack.size() - 1);
      addToken(new Token(closingType));
      decrementTypeCounter(closingType);
    }

    // Keep a break that a descendant left pending, or add this element's own.
    endBlockPending |= setEndBlockPending;

    if (maybeEndParagraph && endBlockPending) {
      maybeInsertNewline();
      endBlockPending = false;
    }
  }

  /**
   * Appends {@code item} to {@code list} unless it is {@code null}.
   *
   * <p>The type parameter is deliberately not called {@code E}, so that it
   * does not shadow the element type parameter of the enclosing class.
   *
   * @param list the list to append to
   * @param item the candidate item; ignored when {@code null}
   */
  private static <V> void putIfNotNull(List<V> list, V item) {
    if (item != null) {
      list.add(item);
    }
  }

  /**
   * Emits a NEW_LINE token unless newline merging is currently active.
   *
   * @return {@code true} if a token was added, {@code false} if it was merged away
   */
  private boolean maybeInsertNewline() {
    // While merging is active the newline is considered already accounted for.
    if (mergeNextNewLine) {
      return false;
    }
    addToken(new Token(Type.NEW_LINE));
    return true;
  }

  /**
   * Checks the element for various style properties in its tag name or css styles.
   * All styles found generate a starting token, plus add an end token to the closing stack.
   *
   * @param el the element to inspect
   * @param tagName tag name of {@code el}
   * @param closeStack receives the end type of every style token that was started
   */
  private void handleStyleElements(E el, String tagName, List<Type> closeStack) {
    // Styles that may come from either the tag name or a css property:
    // font weight (bold), font style (italic) and text decoration (underline).
    maybeExtractStyleToken(el, tagName, closeStack, FONT_WEIGHT_HANDLER);
    maybeExtractStyleToken(el, tagName, closeStack, FONT_STYLE_HANDLER);
    maybeExtractStyleToken(el, tagName, closeStack, TEXT_DECORATION_HANDLER);

    // Styles that only come from css properties: font colour, background
    // colour and font family.
    if (isColor(el)) {
      String value = document.getStylePropertyValue(el, "color");
      addTokenOrIncrement(true, new Token(Type.STYLE_COLOR_START, value), Type.STYLE_COLOR_END);
      closeStack.add(Type.STYLE_COLOR_END);
    }
    if (isBackgroundColor(el)) {
      String value = document.getStylePropertyValue(el, "backgroundColor");
      addTokenOrIncrement(true,
          new Token(Type.STYLE_BG_COLOR_START, value), Type.STYLE_BG_COLOR_END);
      closeStack.add(Type.STYLE_BG_COLOR_END);
    }
    if (isFontFamily(el)) {
      String value = document.getStylePropertyValue(el, "fontFamily");
      addTokenOrIncrement(true, new Token(Type.STYLE_FONT_FAMILY_START, value),
          Type.STYLE_FONT_FAMILY_END);
      closeStack.add(Type.STYLE_FONT_FAMILY_END);
    }
  }

  /**
   * Starts a link when {@code el} is an anchor with an {@code href} attribute.
   *
   * @return {@code Type.LINK_END} if a link was started, otherwise {@code null}
   */
  private Type handleLinkElement(E el, String tagName) {
    if (!isLink(tagName)) {
      return null;
    }
    String href = document.getAttribute(el, "href");
    if (href == null) {
      return null;
    }
    addTokenOrIncrement(new Token(Type.LINK_START, href), Type.LINK_END);
    return Type.LINK_END;
  }

  /**
   * Emits the start token for an ordered or unordered list.
   *
   * @param element the list element; not read by this method
   * @param tagName tag name of {@code element}
   * @return the matching list end type, or {@code null} if the tag is not a list
   */
  private Type handleListElement(E element, String tagName) {
    final Type startType;
    final Type endType;
    if (isOrderedList(tagName)) {
      startType = Type.ORDERED_LIST_START;
      endType = Type.ORDERED_LIST_END;
    } else if (isUnorderedList(tagName)) {
      startType = Type.UNORDERED_LIST_START;
      endType = Type.UNORDERED_LIST_END;
    } else {
      return null;
    }
    addTokenOrIncrement(true, new Token(startType), endType);
    return endType;
  }

  /**
   * Appends a token to the list. A structural token also updates the
   * newline-merging flag: it is set when the token's group is BLOCK and
   * cleared otherwise. Non-structural tokens leave the flag untouched.
   */
  private void addToken(Token token) {
    final Type kind = token.getType();
    if (kind.isStructural()) {
      mergeNextNewLine = (TypeGroup.BLOCK == kind.group());
    }
    tokenList.add(token);
  }

  /**
   * Same as {@link #addTokenOrIncrement(boolean, Token, Type)} with
   * {@code replace} set to {@code false}.
   */
  private void addTokenOrIncrement(Token token, Type endType) {
    final boolean replace = false;
    addTokenOrIncrement(replace, token, endType);
  }

  /**
   * Registers one more open annotation for {@code endType} and emits its start
   * token. The token is skipped only when {@code replace} is {@code false} and
   * an annotation of this kind is already open; the counter is incremented in
   * every case.
   */
  private void addTokenOrIncrement(boolean replace, Token token, Type endType) {
    final boolean suppressed = !replace && isTypeInUse(endType);
    if (!suppressed) {
      addToken(token);
    }
    incrementTypeCounter(endType);
  }

  /** Null-tolerant wrapper around {@code document.asElement}. */
  private E asElement(N node) {
    return node == null ? null : document.asElement(node);
  }

  /**
   * Checks if this is the last line break before a new paragraph: the element
   * must either have no next sibling or be followed directly by a block
   * element, and it must sit at depth 1 below its nearest block (or the root).
   */
  private boolean isLastLinebreak(E element) {
    N next = document.getNextSibling(element);
    if (next != null) {
      E nextElement = asElement(next);
      boolean bordersBlock =
          nextElement != null && isBlockElement(document.getTagName(nextElement));
      if (!bordersBlock) {
        // Followed by inline content, so the break is significant.
        return false;
      }
    }

    // Make sure we're not deeply nested.
    int depth = getDepthFromBlock(element);
    return depth == 1;
  }

  /**
   * Returns whether a block element can be skipped entirely. Only a
   * {@code div} without any child node qualifies.
   */
  private boolean ignorableBlock(E element, String tagName) {
    // NOTE: a zero offset-height test was sketched here at some point but is
    // not performed.
    boolean isDiv = "div".equalsIgnoreCase(tagName);
    return isDiv && document.getFirstChild(element) == null;
  }

  /**
   * Counts how many levels {@code node} lies below its nearest enclosing block
   * element or the root, whichever comes first. A direct child has depth 1.
   */
  private int getDepthFromBlock(N node) {
    int depth = 1;
    for (E ancestor = document.getParentElement(node);
        ancestor != null && ancestor != root
            && !isBlockElement(document.getTagName(ancestor));
        ancestor = document.getParentElement(ancestor)) {
      depth++;
    }
    return depth;
  }

  /** Returns whether the counter for {@code type} is above zero. */
  private boolean isTypeInUse(Type type) {
    final int open = activeTokenCounts[type.ordinal()];
    return open > 0;
  }

  /** Adds one to the counter for {@code type}. */
  private void incrementTypeCounter(Type type) {
    activeTokenCounts[type.ordinal()] += 1;
  }

  /**
   * Subtracts one from the counter for {@code type}.
   *
   * @return {@code true} if the counter reached zero
   */
  private boolean decrementTypeCounter(Type type) {
    final int slot = type.ordinal();
    activeTokenCounts[slot] -= 1;
    final int remaining = activeTokenCounts[slot];
    assert remaining >= 0;
    return remaining == 0;
  }

  /** Returns whether the tag is {@code p}, {@code div} or a list item, ignoring case. */
  private static boolean isBlockElement(String tagName) {
    if (isListItem(tagName)) {
      return true;
    }
    return "div".equalsIgnoreCase(tagName) || "p".equalsIgnoreCase(tagName);
  }

  /** Returns whether the tag is the line-break tag {@code br}, ignoring case. */
  private static boolean isNewline(String tagName) {
    final String lineBreakTag = "br";
    return lineBreakTag.equalsIgnoreCase(tagName);
  }

  /**
   * Starts a style annotation when the element carries the style described by
   * {@code tokenHandler}, and records the matching end type on
   * {@code closeStack}. Does nothing when no value is found.
   */
  private void maybeExtractStyleToken(E el, String tagName, List<Type> closeStack,
      StyleTokenExtractor tokenHandler) {
    final String value = getStyleProperty(el, tagName, tokenHandler);
    if (value == null) {
      return;
    }
    final Token start = new Token(tokenHandler.tokenStartType, value);
    addTokenOrIncrement(true, start, tokenHandler.tokenEndType);
    closeStack.add(tokenHandler.tokenEndType);
  }

  /**
   * Resolves the style value for an element. A non-empty css property value
   * wins; otherwise the lower-cased tag name is looked up in the handler's
   * tag map.
   *
   * @return the style value, or {@code null} when neither source supplies one
   */
  private String getStyleProperty(E el, String tagName, StyleTokenExtractor tokenHandler) {
    final String cssValue = document.getStylePropertyValue(el, tokenHandler.stylePropertyName);
    final boolean hasCssValue = cssValue != null && !cssValue.isEmpty();
    if (hasCssValue) {
      return cssValue;
    }

    final String key = tagName.toLowerCase();
    if (tokenHandler.tagToValue.containsKey(key)) {
      return tokenHandler.tagToValue.get(key);
    }
    return null;
  }

  /** Returns whether the element has a non-empty "color" style property. */
  private boolean isColor(E el) {
    final String property = "color";
    return isStylePropertySet(el, property);
  }

  /** Returns whether the element has a non-empty "backgroundColor" style property. */
  private boolean isBackgroundColor(E el) {
    final String property = "backgroundColor";
    return isStylePropertySet(el, property);
  }

  /** Returns whether the element has a non-empty "fontFamily" style property. */
  private boolean isFontFamily(E el) {
    final String property = "fontFamily";
    return isStylePropertySet(el, property);
  }

  /**
   * Returns whether the tag is a heading of level 1 to 4, i.e. {@code h1}
   * through {@code h4} in either case.
   */
  private static boolean isHeading(String tagName) {
    if (tagName.length() != 2) {
      return false;
    }
    final char prefix = tagName.charAt(0);
    if (prefix != 'h' && prefix != 'H') {
      return false;
    }
    final char level = tagName.charAt(1);
    return level >= '1' && level <= '4';
  }

  /** Returns whether the tag is the anchor tag {@code a}, ignoring case. */
  private static boolean isLink(String tagName) {
    final String anchorTag = "a";
    return anchorTag.equalsIgnoreCase(tagName);
  }

  /** Returns whether the tag is the list-item tag {@code li}, ignoring case. */
  private static boolean isListItem(String tagName) {
    final String listItemTag = "li";
    return listItemTag.equalsIgnoreCase(tagName);
  }

  /** Returns whether the tag opens an ordered or an unordered list. */
  private static boolean isList(String tagName) {
    if (isOrderedList(tagName)) {
      return true;
    }
    return isUnorderedList(tagName);
  }

  /** Returns whether the tag is the ordered-list tag {@code ol}, ignoring case. */
  private static boolean isOrderedList(String tagName) {
    final String orderedListTag = "ol";
    return orderedListTag.equalsIgnoreCase(tagName);
  }

  /** Returns whether the tag is the unordered-list tag {@code ul}, ignoring case. */
  private static boolean isUnorderedList(String tagName) {
    final String unorderedListTag = "ul";
    return unorderedListTag.equalsIgnoreCase(tagName);
  }

  /** Returns whether the tag is {@code table}, ignoring case. */
  private static boolean isTable(String tagName) {
    final String tableTag = "table";
    return tableTag.equalsIgnoreCase(tagName);
  }

  /** Returns whether the tag is the table-row tag {@code tr}, ignoring case. */
  private static boolean isTableRow(String tagName) {
    final String rowTag = "tr";
    return rowTag.equalsIgnoreCase(tagName);
  }

  /** Returns whether the tag is a header cell {@code th} or data cell {@code td}, ignoring case. */
  private static boolean isTableCell(String tagName) {
    if ("th".equalsIgnoreCase(tagName)) {
      return true;
    }
    return "td".equalsIgnoreCase(tagName);
  }

  /**
   * Returns whether the tag belongs to table markup: {@code table},
   * {@code tr}, {@code th}, {@code td}, {@code thead} or {@code tbody}.
   */
  private static boolean isTableRelated(String tagName) {
    // TODO(patcoleman): fix up table implementation once tables supported in the editor.
    // When this happens, also extract out strings into symbolic constants.
    if (isTable(tagName) || isTableRow(tagName) || isTableCell(tagName)) {
      return true;
    }
    return "thead".equalsIgnoreCase(tagName) || "tbody".equalsIgnoreCase(tagName);
  }

  /** Returns whether the element has a non-null, non-empty value for {@code property}. */
  private boolean isStylePropertySet(E el, String property) {
    final String value = document.getStylePropertyValue(el, property);
    if (value == null) {
      return false;
    }
    return !value.isEmpty();
  }

  /** Returns the string form of the full token list, for debugging. */
  @Override
  public String toString() {
    return String.valueOf(tokenList);
  }
}