package com.swabunga.spell.event;

import java.util.*;
import java.text.*;

/**
 * Tokenizes an input string into words.
 * <p>
 * A word is a maximal run of characters for which
 * {@link Character#isLetterOrDigit(char)} is true; every other character is
 * treated as a separator.
 * </p>
 * <p>
 * The tokenizer also allows the text to be mutated while it is being walked
 * (see {@link #replaceWord(String)}). The text after spell checking is
 * completed is available from {@link #getFinalText()}.
 * </p>
 *
 * @author Jason Height (jheight@chariot.net.au)
 */
public class StringWordTokenizer implements WordTokenizer {

  /** Holds the start character position of the current word, or -1 if there is none. */
  private int currentWordPos = 0;

  /** Holds the end character position (exclusive) of the current word. */
  private int currentWordEnd = 0;

  /** Holds the start character position of the next word, or -1 if there is none. */
  private int nextWordPos = -1;

  /** The actual text that is being tokenized. */
  private StringBuffer text;

  /** The cumulative count of words that have been returned by nextWord. */
  private int wordCount = 0;

  /** Flag indicating if there are any more tokens (words) left. */
  private boolean moreTokens = true;

  /**
   * True when currentWordPos, currentWordEnd and nextWordPos already describe
   * the word that the next call to nextWord must return (this is the case
   * right after construction and right after replaceWord).
   */
  private boolean first = true;

  /** Sentence iterator over the current contents of {@link #text}. */
  private BreakIterator sentanceIterator;

  /** Whether the word most recently returned by nextWord starts a sentence. */
  private boolean startsSentance = true;

  /**
   * Creates a tokenizer over the given text and positions it on the first
   * word, if there is one.
   *
   * @param text the text to tokenize
   */
  public StringWordTokenizer(String text) {
    sentanceIterator = BreakIterator.getSentenceInstance();
    sentanceIterator.setText(text);
    // Wrap a string buffer to hopefully make things a bit easier and more
    // efficient when replacing words etc.
    this.text = new StringBuffer(text);
    // If no word is found the string held no letters or digits at all and
    // moreTokens is cleared.
    locateWordsFrom(0);
  }

  /**
   * Returns the start position of the next word in the buffer, searching
   * from the given start position.
   *
   * @param text the buffer to search
   * @param startPos the position at which to start searching
   * @return the index of the first letter or digit at or after startPos, or
   *         -1 if there is none
   */
  private static int getNextWordStart(StringBuffer text, int startPos) {
    int size = text.length();
    for (int i = startPos; i < size; i++) {
      if (Character.isLetterOrDigit(text.charAt(i))) {
        return i;
      }
    }
    return -1;
  }

  /**
   * Returns the end of the word that begins at the given start position.
   *
   * @param text the buffer to search
   * @param startPos the position at which to start searching
   * @return the index of the first character at or after startPos that is
   *         not a letter or digit, or the buffer length if there is none
   */
  private static int getNextWordEnd(StringBuffer text, int startPos) {
    int size = text.length();
    for (int i = startPos; i < size; i++) {
      if (!Character.isLetterOrDigit(text.charAt(i))) {
        return i;
      }
    }
    return size;
  }

  /**
   * Makes the first word found at or after searchStart the current word and
   * looks ahead for the word following it. Clears moreTokens when no word is
   * found.
   */
  private void locateWordsFrom(int searchStart) {
    currentWordPos = getNextWordStart(text, searchStart);
    if (currentWordPos != -1) {
      currentWordEnd = getNextWordEnd(text, currentWordPos);
      nextWordPos = getNextWordStart(text, currentWordEnd);
    } else {
      moreTokens = false;
    }
  }

  /**
   * Returns true if the word starting at wordPos is the first word that
   * follows a sentence boundary reported by the sentence iterator.
   */
  private boolean isFirstWordOfSentence(int wordPos) {
    int sentenceStart;
    if (sentanceIterator.isBoundary(wordPos)) {
      sentenceStart = wordPos;
    } else {
      sentenceStart = sentanceIterator.preceding(wordPos);
    }
    if (sentenceStart == BreakIterator.DONE) {
      sentenceStart = 0;
    }
    // Separators (white space, quotes, brackets) may sit between the
    // boundary and the word, so compare against the first word after it.
    return getNextWordStart(text, sentenceStart) == wordPos;
  }

  /**
   * Returns true if there are more words that can be processed in the string.
   */
  public boolean hasMoreWords() {
    return moreTokens;
  }

  /**
   * Returns the start character position of the current word in the text.
   */
  public int getCurrentWordPosition() {
    return currentWordPos;
  }

  /**
   * Returns the end character position of the current word in the text.
   */
  public int getCurrentWordEnd() {
    return currentWordEnd;
  }

  /**
   * Returns the next word in the text.
   *
   * @return the next word
   * @throws NoSuchElementException if there are no more words, i.e.
   *         {@link #hasMoreWords()} returns false
   */
  public String nextWord() {
    if (!moreTokens) {
      throw new NoSuchElementException("No more words in the text");
    }
    if (!first) {
      // nextWordPos is the start of a word here, so this simply advances the
      // current word to it and looks ahead again.
      locateWordsFrom(nextWordPos);
    }
    // Otherwise the positions have already been populated by the constructor
    // or by replaceWord.
    startsSentance = isFirstWordOfSentence(currentWordPos);
    String word = text.substring(currentWordPos, currentWordEnd);
    wordCount++;
    first = false;
    if (nextWordPos == -1) {
      moreTokens = false;
    }
    return word;
  }

  /**
   * Returns the number of words that have been returned so far.
   */
  public int getCurrentWordCount() {
    return wordCount;
  }

  /**
   * Replaces the current word token with the given text and positions the
   * tokenizer on the first word after the replacement, so the replacement
   * itself is never returned by nextWord. Does nothing when there is no
   * current word.
   *
   * @param newWord the text to put in place of the current word
   */
  public void replaceWord(String newWord) {
    if (currentWordPos != -1) {
      text.replace(currentWordPos, currentWordEnd, newWord);
      // Keep the sentence iterator in step with the mutated text.
      sentanceIterator.setText(text.toString());
      // Position after the newly replaced word(s)
      first = true;
      locateWordsFrom(currentWordPos + newWord.length());
    }
  }

  /**
   * Returns true if the word most recently returned by nextWord is at the
   * start of a sentence.
   */
  public boolean isNewSentance() {
    return startsSentance;
  }

  /**
   * Returns the current text that is being tokenized (includes any changes
   * that have been made).
   */
  public String getContext() {
    return text.toString();
  }

  /**
   * This method can be used to return the final text after the checking is
   * complete.
   */
  public String getFinalText() {
    return getContext();
  }

  // public static void main(String args[]) {
  //   StringWordTokenizer t = new StringWordTokenizer(" This is a test problem");
  //   while(t.hasMoreWords()) {
  //     String word = t.nextWord();
  //     System.out.println("Word is '"+word+"'");
  //     if ("test".equals(word)) t.replaceWord("mightly big");
  //   }
  //   System.out.println("End text is: '"+t.getFinalText()+"'");
  //
  //   t = new StringWordTokenizer(" README ");
  //   while(t.hasMoreWords()) {
  //     String word = t.nextWord();
  //     System.out.println("Word is '"+word+"'");
  //   }
  //   System.out.println("End text is: '"+t.getFinalText()+"'");
  //
  //   t = new StringWordTokenizer("This is a acronym (A.C.M.E). This is the second sentance.");
  //   while(t.hasMoreWords()) {
  //     String word = t.nextWord();
  //     System.out.println("Word is '"+word+"'. Starts Sentance?="+t.isNewSentance());
  //     if (word.equals("acronym"))
  //       t.replaceWord("test");
  //   }
  // }
}