TitleExtractor.java example

Explorer
WaveInCloud-master
/**
 * Copyright 2008 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package org.waveprotocol.wave.model.util;

/**
 * Heuristically extracts a plain-text title (the first phrase) from an XML
 * string. Works with regular expressions and a simple tag stripper; it does
 * not parse the XML.
 */
@Deprecated
public class TitleExtractor {

  /** Matches a {@code <br...>} tag on a single line. */
  private static final String BR_TAG = "<br.*?>";

  /** Matches a closing paragraph tag. */
  private static final String CLOSING_P = "</p>";

  /**
   * Matches a run of sentence-ending characters followed by one whitespace
   * character. A character class is used instead of a repeated alternation
   * group because the latter is matched recursively by java.util.regex and
   * overflows the stack on long runs.
   */
  private static final String SENTENCE_STOP = "([.?!]+\\s)";

  /**
   * Matches leading or trailing whitespace, where whitespace includes the
   * no-break space (U+00A0) on both ends.
   */
  private static final String OUTER_WHITE = "^[\\s\u00a0]+|[\\s\u00a0]+$";

  /**
   * Extracts a title from an XML string.
   * The string may be either an entire document (that contains a title
   * element) or the inner XML of a title element.
   *
   * @param rich  an XML string; must not be null
   * @return a title extracted from {@code rich}.  The string is element-free,
   *         escaped XML.
   */
  public static String extractTitle(String rich) {
    return firstPhrase(rich);
  }

  /**
   * Tests if an XML string contains only a title.
   * Essentially, this just tests if the text view of the XML string contains
   * only a single sentence.
   *
   * @param rich  an XML string; must not be null
   * @return true if {@code rich} contains just a single sentence.
   */
  public static boolean isOnlyTitle(String rich) {
    // Run the (regex-heavy) preprocessing once and derive both views from it.
    String processed = processForFirstPhrase(rich);
    String titleIsh = stripWhite(processed);
    String title = firstLine(processed);
    return titleIsh.equals(title);
  }

  /**
   * Returns the first phrase of {@code rich}: its tag-free text up to the
   * first phrase break, with surrounding whitespace removed.
   */
  private static String firstPhrase(String rich) {
    return firstLine(processForFirstPhrase(rich));
  }

  /**
   * Returns the text before the first newline of {@code processed} (or all of
   * it when there is no newline), with surrounding whitespace removed.
   */
  private static String firstLine(String processed) {
    int stop = processed.indexOf('\n');
    return stripWhite(stop != -1 ? processed.substring(0, stop) : processed);
  }

  /**
   * Marks phrase breaks with a newline and then removes all tags.
   * A newline is placed at the first {@code <br>}, at the first closing
   * {@code </p>}, and after the first run of stop characters
   * ({@code . ? !}) that is followed by whitespace.
   */
  private static String processForFirstPhrase(String rich) {
    String text = rich
        // Place a \n at first <br>
        .replaceFirst(BR_TAG, "\n")
        // Place a \n at first closing p
        .replaceFirst(CLOSING_P, "\n")
        // Place a \n at first stop char, followed by whitespace
        .replaceFirst(SENTENCE_STOP, "$1\n");

    return stripTags(text);
  }

  /**
   * Removes opening and closing tags, i.e. everything from a {@code '<'} up
   * to and including the next {@code '>'}. If a {@code '<'} has no matching
   * {@code '>'}, the rest of the string from that {@code '<'} is dropped.
   */
  private static String stripTags(String rich) {
    // This was done via a rich.replaceAll("<(.|\\n)+?>", "");
    // which would cause a StackOverflowError in the indexer when a root blip contained a large
    // tag e.g. a gadget.

    int open = rich.indexOf('<');
    if (open < 0) {
      return rich;
    }

    StringBuilder b = new StringBuilder(rich.length());
    int start = 0;
    while (open >= 0) {
      // Keep the text between the previous tag (or the beginning) and this tag.
      b.append(rich, start, open);
      int close = rich.indexOf('>', open);
      if (close < 0) {
        // Unterminated tag: nothing after it is kept.
        return b.toString();
      }
      // Resume just past the '>'.
      start = close + 1;
      open = rich.indexOf('<', start);
    }
    // Keep whatever follows the last tag.
    b.append(rich, start, rich.length());
    return b.toString();
  }

  /**
   * Strips leading and trailing whitespace, including no-break spaces.
   */
  private static String stripWhite(String text) {
    return text.replaceAll(OUTER_WHITE, "");
  }
}