SimpleTokenize.java example

Explorer
Stanford-NLP-master
- CoreNLP-master

package edu.stanford.nlp.ie.machinereading.common; 
import edu.stanford.nlp.util.logging.Redwood;

import java.util.ArrayList;
import java.util.StringTokenizer;

/**
 * Simple string tokenization
 */
public class SimpleTokenize  {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(SimpleTokenize.class);

  /**
   * Basic string tokenization, splitting on the default {@link StringTokenizer}
   * delimiters (whitespace).
   *
   * @param line the text to split; must not be null
   * @return the tokens in order of appearance; empty if the line holds only delimiters
   */
  public static ArrayList<String> tokenize(String line) {
    ArrayList<String> tokens = new ArrayList<>();
    StringTokenizer tokenizer = new StringTokenizer(line);
    while (tokenizer.hasMoreTokens()) {
      tokens.add(tokenizer.nextToken());
    }
    return tokens;
  }

  /**
   * Basic string tokenization, splitting on any of the given separator characters.
   *
   * @param line the text to split; must not be null
   * @param separators the delimiter characters, each one acting as a separator
   * @return the tokens in order of appearance; the separators themselves are not returned
   */
  public static ArrayList<String> tokenize(String line, String separators) {
    ArrayList<String> tokens = new ArrayList<>();
    StringTokenizer tokenizer = new StringTokenizer(line, separators);
    while (tokenizer.hasMoreTokens()) {
      tokens.add(tokenizer.nextToken());
    }
    return tokens;
  }

  /**
   * Finds the first non-whitespace character starting at start
   *
   * @return the index of that character, or -1 if only whitespace remains
   */
  private static int findNonWhitespace(String s, int start) {
    for (int i = start; i < s.length(); i++) {
      if (!Character.isWhitespace(s.charAt(i))) {
        return i;
      }
    }
    return -1;
  }

  /**
   * Finds the first whitespace character starting at start
   *
   * @return the index of that character, or -1 if no whitespace remains
   */
  private static int findWhitespace(String s, int start) {
    for (int i = start; i < s.length(); i++) {
      if (Character.isWhitespace(s.charAt(i))) {
        return i;
      }
    }
    return -1;
  }

  /**
   * Finds the first quote at or after from that is not preceded by \
   * Callers must pass from >= 1 so that the preceding character always exists.
   *
   * @return the index of the closing quote, or -1 if the line ends first
   */
  private static int findClosingQuote(String line, int from) {
    int candidate = line.indexOf('\"', from);
    // skip over escaped quotes: they belong to the token's content
    while (candidate != -1 && line.charAt(candidate - 1) == '\\') {
      candidate = line.indexOf('\"', candidate + 1);
    }
    return candidate;
  }

  /**
   * Replaces all occurrences of \" with "
   */
  private static String normalizeQuotes(String str) {
    StringBuilder buffer = new StringBuilder(str.length());
    for (int i = 0; i < str.length(); i++) {
      char c = str.charAt(i);
      // do not include \ if followed by "
      boolean escapesQuote = c == '\\' && i < str.length() - 1 && str.charAt(i + 1) == '\"';
      if (!escapesQuote) {
        buffer.append(c);
      }
    }
    return buffer.toString();
  }

  /**
   * String tokenization, considering everything within quotes as 1 token
   * Regular quotes inside tokens MUST be preceded by \
   * <p>
   * The surrounding quotes are stripped and every \" inside a quoted token is
   * turned back into ". A quoted token whose closing quote is missing extends
   * to the end of the line and keeps all of its characters.
   *
   * @param line the text to split; must not be null
   * @return the tokens in order of appearance; an empty quoted token ("") yields an empty string
   */
  public static ArrayList<String> tokenizeWithQuotes(String line) {
    ArrayList<String> tokens = new ArrayList<>();
    int position = 0;

    while ((position = findNonWhitespace(line, position)) != -1) {
      int end; // index at which scanning resumes after this token

      // found quoted token (not preceded by \)
      if (line.charAt(position) == '\"' && (position == 0 || line.charAt(position - 1) != '\\')) {
        int closing = findClosingQuote(line, position + 1);
        int contentEnd;
        if (closing == -1) {
          // unterminated quote: there is no closing quote to strip, so the
          // content runs through the last character of the line
          contentEnd = line.length();
          end = line.length();
        } else {
          contentEnd = closing;
          end = closing + 1;
        }

        // do not include the quotes in the token
        tokens.add(normalizeQuotes(line.substring(position + 1, contentEnd)));
      }

      // regular token
      else {
        end = findWhitespace(line, position + 1);
        if (end == -1) {
          end = line.length();
        }

        tokens.add(line.substring(position, end));
      }

      position = end;
    }

    return tokens;
  }

  /**
   * Constructs a valid quote-surrounded token All inside quotes are preceded by
   * \
   *
   * @param str the raw token text; must not be null
   * @return str wrapped in quotes, with a \ inserted before every " it contains
   */
  public static String quotify(String str) {
    StringBuilder buffer = new StringBuilder(str.length() + 2);
    buffer.append('\"');
    for (int i = 0; i < str.length(); i++) {
      char c = str.charAt(i);
      if (c == '\"') {
        buffer.append('\\');
      }
      buffer.append(c);
    }
    buffer.append('\"');
    return buffer.toString();
  }

  /** Implements a simple test */
  public static void main(String[] argv) {
    String in = "T \"Athens \\\"the beautiful\\\"\" \"Athens\" \"\" \"Greece\"";
    log.info("Input: " + in);
    log.info(tokenizeWithQuotes(in));
  }
}