StringWordTokenizer.java example

Explorer
ekit-master
- src
  - com
package com.swabunga.spell.event;

import java.util.*;
import java.text.*;

/** This class tokenizes an input string into words.
 *  <p>
 *  A word is a maximal run of characters for which
 *  {@link Character#isLetterOrDigit(char)} is true; every other character is
 *  treated as a separator.</p>
 *  <p>
 *  It also allows for the string to be mutated while it is being tokenized
 *  (see {@link #replaceWord(String)}). The result after the spell checking is
 *  completed is available through {@link #getFinalText()}.</p>
 *  <p>
 *  Typical usage:</p>
 *  <pre>
 *    StringWordTokenizer t = new StringWordTokenizer(text);
 *    while (t.hasMoreWords()) {
 *      String word = t.nextWord();
 *      boolean startsSentence = t.isNewSentance();
 *      if (misspelt(word)) t.replaceWord(correction);
 *    }
 *    String result = t.getFinalText();
 *  </pre>
 *
 * @author Jason Height (jheight@chariot.net.au)
 */
public class StringWordTokenizer implements WordTokenizer {
  /** Holds the start character position of the current word*/
  private int currentWordPos = 0;
  /** Holds the end character position (exclusive) of the current word*/
  private int currentWordEnd = 0;
  /** Holds the start character position of the next word, or -1 if there is none*/
  private int nextWordPos = -1;
  /** The actual text that is being tokenized*/
  private final StringBuffer text;
  /** The cumulative count of words that have been returned by nextWord*/
  private int wordCount = 0;
  /** Flag indicating if there are any more tokens (words) left*/
  private boolean moreTokens = true;
  /** Is this a special case where currentWordPos, currentWordEnd and
   *  nextWordPos have already been calculated (by the constructor or by
   *  replaceWord), so nextWord must not advance again. (see nextWord)
   */
  private boolean first = true;

  /** Sentence break iterator over the current content of the text*/
  private final BreakIterator sentanceIterator;
  /** True if the word last returned by nextWord starts a sentence*/
  private boolean startsSentance = true;
  /** Sentence-start flag of the pre-positioned word that the next call to
   *  nextWord will return while <code>first</code> is set. It starts out true
   *  for the first word of the text and is recomputed by replaceWord.
   */
  private boolean pendingStartsSentance = true;


  /** Creates a tokenizer positioned before the first word of the text.
   *
   * @param text the text to split into words
   */
  public StringWordTokenizer(String text) {
    sentanceIterator = BreakIterator.getSentenceInstance();
    sentanceIterator.setText(text);
    sentanceIterator.first();
    //Wrap a string buffer to hopefully make things a bit easier and efficient to
    //replace words etc.
    this.text = new StringBuffer(text);
    currentWordPos = getNextWordStart(this.text, 0);
    //If the current word pos is -1 then the string contained no word characters
    if (currentWordPos != -1) {
      currentWordEnd = getNextWordEnd(this.text, currentWordPos);
      nextWordPos = getNextWordStart(this.text, currentWordEnd);
    } else {
      moreTokens = false;
    }
  }

  /** This helper method will return the start character position of the next
   * word in the buffer, searching from the start position.
   *
   * @param text the buffer to search
   * @param startPos the index to start searching from (inclusive)
   * @return the index of the first letter or digit at or after startPos, or
   *         -1 if there is none
   */
  private static int getNextWordStart(StringBuffer text, int startPos) {
    int size = text.length();
    for (int i = startPos; i < size; i++) {
      if (Character.isLetterOrDigit(text.charAt(i))) {
        return i;
      }
    }
    return -1;
  }

  /** This helper method will return the end of the next word in the buffer.
   *
   * @param text the buffer to search
   * @param startPos the index of the first character of the word
   * @return the index of the first character at or after startPos that is not
   *         a letter or digit, or the buffer length if the word runs to the end
   */
  private static int getNextWordEnd(StringBuffer text, int startPos) {
    int size = text.length();
    for (int i = startPos; i < size; i++) {
      if (!Character.isLetterOrDigit(text.charAt(i))) {
        return i;
      }
    }
    return size;
  }


  /** Returns true if there are more words that can be processed in the string
   *
   */
  public boolean hasMoreWords() {
    return moreTokens;
  }

  /** Returns the start character position of the current word in the text.
   *  Note that after a call to replaceWord this is already the position of
   *  the word that the next call to nextWord will return (or -1 if no word
   *  is left).
   */
  public int getCurrentWordPosition() {
    return currentWordPos;
  }

  /** Returns the end character position (exclusive) of the current word in
   *  the text
   */
  public int getCurrentWordEnd() {
    return currentWordEnd;
  }

  /** Returns the next word in the text and makes it the current word.
   *  <p>
   *  Callers should check {@link #hasMoreWords()} first; calling this method
   *  when no word is left fails with an index out of bounds exception.</p>
   *
   * @return the next word
   */
  public String nextWord() {
    if (first) {
      //The positions were already calculated by the constructor or by
      //replaceWord, so only the matching sentence flag has to be adopted.
      startsSentance = pendingStartsSentance;
    } else {
      currentWordPos = nextWordPos;
      currentWordEnd = getNextWordEnd(text, currentWordPos);
      //The character at currentWordEnd is never part of a word, so searching
      //from there is equivalent to searching from currentWordEnd+1.
      nextWordPos = getNextWordStart(text, currentWordEnd);
      int current = sentanceIterator.current();
      if (current == currentWordPos) {
        startsSentance = true;
      } else {
        startsSentance = false;
        //The word has moved past the boundary the iterator is parked on, so
        //advance the iterator to the next sentence boundary.
        if (currentWordEnd > current) {
          sentanceIterator.next();
        }
      }
    }
    //The nextWordPos has already been populated
    String word = text.substring(currentWordPos, currentWordEnd);
    wordCount++;
    first = false;
    if (nextWordPos == -1) {
      moreTokens = false;
    }
    return word;
  }

  /** Returns the current number of words that have been processed
   *
   */
  public int getCurrentWordCount() {
    return wordCount;
  }

  /** Replaces the current word token with the given text and positions the
   *  tokenizer on the first word after the replacement, so that the
   *  replacement itself is not tokenized again. Does nothing if there is no
   *  current word.
   *
   * @param newWord the replacement text (may contain several words)
   */
  public void replaceWord(String newWord) {
    if (currentWordPos != -1) {
      text.replace(currentWordPos, currentWordEnd, newWord);
      //Position after the newly replaced word(s)
      first = true;
      currentWordPos = getNextWordStart(text, currentWordPos + newWord.length());
      if (currentWordPos != -1) {
        currentWordEnd = getNextWordEnd(text, currentWordPos);
        nextWordPos = getNextWordStart(text, currentWordEnd);
        //The offsets have shifted, so the sentence boundaries must be
        //recalculated on the modified text.
        sentanceIterator.setText(text.toString());
        //Work out now whether the pre-positioned word starts a sentence;
        //nextWord skips its own boundary check while 'first' is set, which
        //would otherwise leave the flag of the replaced word in place.
        //isBoundary also leaves the iterator where nextWord expects it: on
        //the word start if that is a boundary, else on the boundary after it.
        pendingStartsSentance = sentanceIterator.isBoundary(currentWordPos);
      } else {
        moreTokens = false;
      }
    }
  }

  /** Returns true if and only if the word last returned by nextWord is at
   *  the start of a sentence
   */
  public boolean isNewSentance() {
    return startsSentance;
  }

  /** Returns the current text that is being tokenized (includes any changes
   *  that have been made)
   */
  public String getContext() {
    return text.toString();
  }

  /** This method can be used to return the final text after the checking is complete.*/
  public String getFinalText() {
    return getContext();
  }


//  public static void main(String args[]) {
//    StringWordTokenizer t = new StringWordTokenizer("  This is a  test   problem");
//    while(t.hasMoreWords()) {
//      String word = t.nextWord();
//      System.out.println("Word is '"+word+"'");
//      if ("test".equals(word)) t.replaceWord("mightly big");
//    }
//    System.out.println("End text is: '"+t.getFinalText()+"'");
//
//    t = new StringWordTokenizer("    README   ");
//    while(t.hasMoreWords()) {
//      String word = t.nextWord();
//      System.out.println("Word is '"+word+"'");
//    }
//    System.out.println("End text is: '"+t.getFinalText()+"'");
//
//    t = new StringWordTokenizer("This is a acronym (A.C.M.E). This is the second sentance.");
//    while(t.hasMoreWords()) {
//      String word = t.nextWord();
//      System.out.println("Word is '"+word+"'. Starts Sentance?="+t.isNewSentance());
//      if (word.equals("acronym"))
//        t.replaceWord("test");
//    }
//  }
}