/*
Jazzy - a Java library for Spell Checking
Copyright (C) 2001 Mindaugas Idzelis
Full text of license can be found in LICENSE.txt
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
package com.swabunga.spell.event;
import java.text.BreakIterator;
import java.text.CharacterIterator;
import javax.swing.text.AttributeSet;
import javax.swing.text.BadLocationException;
import javax.swing.text.Document;
import javax.swing.text.Segment;
import javax.swing.text.StyledDocument;
/**
 * Tokenizes the text of a Swing {@link Document} into words. It also allows the document to be changed when
 * corrections occur (see {@link #replaceWord(String)}).
 *
 * <p>A word is a run of letters and digits; a {@code '-'} or {@code '\''} is kept inside a word when it is
 * directly followed by a letter or digit.
 *
 * <p>Word positions are computed on a {@link Segment} filled from the document and are then used directly as
 * document offsets. NOTE(review): this assumes the segment's begin index is 0 - confirm for the Document
 * implementations in use.
 *
 * @author Jason Height (jheight@chariot.net.au)
 */
public class DocumentWordTokenizer implements WordTokenizer {

    /** Holds the start character position of the current word */
    private int currentWordPos = 0;
    /** Holds the end character position of the current word */
    private int currentWordEnd = 0;
    /** Holds the start character position of the next word, or -1 when no further word was found */
    private int nextWordPos = -1;
    /** The document whose text is being tokenized */
    private final Document document;
    /** The character iterator over the document text */
    private final Segment text;
    /** The cumulative number of words that have been processed */
    private int wordCount = 0;
    /** Flag indicating if there are any more tokens (words) left */
    private boolean moreTokens = true;
    /**
     * True when currentWordPos, currentWordEnd and nextWordPos have already been calculated for the word that
     * the next call to nextWord() returns, so that nextWord() must not advance before returning it.
     */
    private boolean first = true;
    /** Sentence boundary iterator running over the same text segment */
    private final BreakIterator sentenceIterator;
    /** Set by nextWord(): true when the returned word started at the sentence iterator's current boundary */
    private boolean startsSentence = true;

    /**
     * Creates a tokenizer over the full text of the given document and positions it on the first word. When the
     * text cannot be read, or contains no letter or digit, the tokenizer reports that it has no more words.
     *
     * @param document
     *            the document to tokenize
     */
    public DocumentWordTokenizer(Document document) {
        this.document = document;
        // Create a text segment over the entire document
        text = new Segment();
        sentenceIterator = BreakIterator.getSentenceInstance();
        try {
            document.getText(0, document.getLength(), text);
            sentenceIterator.setText(text);
            currentWordPos = getNextWordStart(text, text.getBeginIndex());
            // If the current word pos is -1 then the text contained no letter or digit at all
            if (currentWordPos != -1) {
                currentWordEnd = getNextWordEnd(text, currentWordPos);
                nextWordPos = getNextWordStart(text, currentWordEnd);
            } else {
                moreTokens = false;
            }
        } catch (BadLocationException ex) {
            // The document text could not be read, so there is nothing to tokenize
            moreTokens = false;
        }
    }

    /** Returns true for the characters that may join two parts of one word (hyphen and apostrophe). */
    private static boolean isWordJoiner(char ch) {
        return ch == '-' || ch == '\'';
    }

    /** Returns true if the character is a real character (not DONE) that is a letter or a digit. */
    private static boolean isWordChar(char ch) {
        return ch != CharacterIterator.DONE && Character.isLetterOrDigit(ch);
    }

    /** Returns the character after the segment's current index and leaves the index where it was. */
    private static char peekNext(Segment text) {
        int index = text.getIndex();
        char following = text.next();
        text.setIndex(index);
        return following;
    }

    /**
     * Returns the character before the segment's current index and leaves the index where it was. The index is
     * restored explicitly because previous() does not move when it is already at the begin index, so undoing it
     * with next() would advance the iterator by one.
     */
    private static char peekPrevious(Segment text) {
        int index = text.getIndex();
        char preceding = text.previous();
        text.setIndex(index);
        return preceding;
    }

    /**
     * This helper method will return the start character of the next word in the buffer from the start position
     *
     * @return the index of the first letter or digit at or after startPos, or -1 if there is none or startPos
     *         lies beyond the end of the text
     */
    private static int getNextWordStart(Segment text, int startPos) {
        if (startPos <= text.getEndIndex()) {
            for (char ch = text.setIndex(startPos); ch != CharacterIterator.DONE; ch = text.next()) {
                if (Character.isLetterOrDigit(ch)) {
                    return text.getIndex();
                }
            }
        }
        return -1;
    }

    /**
     * This helper method will return the end of the next word in the buffer.
     *
     * @return the index of the first character at or after startPos that does not belong to the word, or the end
     *         index of the text when the word runs up to the end
     */
    private static int getNextWordEnd(Segment text, int startPos) {
        for (char ch = text.setIndex(startPos); ch != CharacterIterator.DONE; ch = text.next()) {
            if (Character.isLetterOrDigit(ch)) {
                continue;
            }
            // handle ' and - inside words: they only count when a letter or digit follows directly
            if (isWordJoiner(ch) && isWordChar(peekNext(text))) {
                continue;
            }
            return text.getIndex();
        }
        return text.getEndIndex();
    }

    /**
     * Returns true if there are more words that can be processed in the text
     */
    @Override
    public boolean hasMoreWords() {
        return moreTokens;
    }

    /**
     * Sets the current word position at the start of the word containing the char at position pos. This way a call to nextWord() will
     * return this word.
     *
     * <p>NOTE(review): nextWord() only returns this word without advancing when the "first" flag is set, and this
     * method sets that flag only when the word starts at position 0 - confirm that callers always use a freshly
     * constructed tokenizer. NOTE(review): when pos is at (or is clamped to) the end index, the backward scan
     * appears not to run at all, leaving the position at the begin index - confirm this is intended.
     *
     * @param pos
     *            position in the word we want to set as current; values beyond the end of the text are clamped to
     *            the end index
     */
    public void posStartFullWordFrom(int pos) {
        currentWordPos = text.getBeginIndex();
        int scanFrom = Math.min(pos, text.getEndIndex());
        // Walk backwards until a character that cannot be part of the word is found
        for (char ch = text.setIndex(scanFrom); ch != CharacterIterator.DONE; ch = text.previous()) {
            if (Character.isLetterOrDigit(ch)) {
                continue;
            }
            // handle ' and - inside words: they only count when a letter or digit precedes them directly
            if (isWordJoiner(ch) && isWordChar(peekPrevious(text))) {
                continue;
            }
            currentWordPos = text.getIndex() + 1;
            break;
        }
        if (currentWordPos == 0) {
            first = true;
        }
        moreTokens = true;
        currentWordEnd = getNextWordEnd(text, currentWordPos);
        nextWordPos = getNextWordStart(text, currentWordEnd + 1);
    }

    /**
     * Returns the current character position in the text
     */
    @Override
    public int getCurrentWordPosition() {
        return currentWordPos;
    }

    /**
     * Returns the current end word position in the text
     */
    @Override
    public int getCurrentWordEnd() {
        return currentWordEnd;
    }

    /**
     * Returns the next word in the text and updates the word count, the sentence-start flag and the
     * "more words" flag. Callers are expected to check {@link #hasMoreWords()} first.
     *
     * @return the next word, or null if its text could not be read from the document
     */
    @Override
    public String nextWord() {
        if (!first) {
            // Advance to the word that was located on the previous call
            currentWordPos = nextWordPos;
            currentWordEnd = getNextWordEnd(text, currentWordPos);
            nextWordPos = getNextWordStart(text, currentWordEnd + 1);
        }
        // When "first" is set, the positions have already been populated

        int sentenceBoundary = sentenceIterator.current();
        startsSentence = sentenceBoundary == currentWordPos;
        if (!startsSentence && currentWordEnd > sentenceBoundary) {
            // The word lies past the current boundary, so move the sentence iterator forward
            sentenceIterator.next();
        }

        String word = null;
        try {
            word = document.getText(currentWordPos, currentWordEnd - currentWordPos);
        } catch (BadLocationException ex) {
            // The positions no longer fit the document: stop tokenizing and return null
            moreTokens = false;
        }
        wordCount++;
        first = false;
        if (nextWordPos == -1) {
            moreTokens = false;
        }
        return word;
    }

    /**
     * Returns the current number of words that have been processed
     */
    @Override
    public int getCurrentWordCount() {
        return wordCount;
    }

    /**
     * Replaces the current word token in the document with the given text and positions the tokenizer on the
     * first word after the replacement. For a StyledDocument the replacement is inserted with the character
     * attributes found at the start of the replaced word. Nothing happens when there is no current word.
     *
     * @param newWord
     *            the text that replaces the current word
     * @throws RuntimeException
     *             if the document rejects the edit; the BadLocationException is attached as the cause
     */
    @Override
    public void replaceWord(String newWord) {
        if (currentWordPos == -1) {
            return;
        }
        try {
            AttributeSet attr = null;
            if (document instanceof StyledDocument) {
                attr = ((StyledDocument) document).getCharacterElement(currentWordPos).getAttributes();
            }
            document.remove(currentWordPos, currentWordEnd - currentWordPos);
            document.insertString(currentWordPos, newWord, attr);
            // Need to reset the segment
            document.getText(0, document.getLength(), text);
        } catch (BadLocationException ex) {
            // Keep the original exception as the cause so that its stack trace is not lost
            throw new RuntimeException(ex.getMessage(), ex);
        }
        // Position after the newly replaced word(s)
        first = true;
        currentWordPos = getNextWordStart(text, currentWordPos + newWord.length());
        if (currentWordPos != -1) {
            currentWordEnd = getNextWordEnd(text, currentWordPos);
            nextWordPos = getNextWordStart(text, currentWordEnd);
            sentenceIterator.setText(text);
            sentenceIterator.following(currentWordPos);
        } else {
            moreTokens = false;
        }
    }

    /**
     * Returns the current text that is being tokenized (includes any changes that have been made)
     */
    @Override
    public String getContext() {
        return text.toString();
    }

    /**
     * Returns true if the current word is at the start of a sentence. This is the case when nextWord() found the
     * word on a sentence boundary, when the word starts before position 2, or when the two characters in front
     * of the word trim down to a single ".".
     */
    @Override
    public boolean isNewSentence() {
        // BreakIterator doesn't work when the first word in a sentence is not capitalised,
        // but we need to check for capitalisation
        if (startsSentence || currentWordPos < 2) {
            return true;
        }
        String textBefore = null;
        try {
            textBefore = document.getText(currentWordPos - 2, 2);
        } catch (BadLocationException ex) {
            return false;
        }
        return textBefore != null && ".".equals(textBefore.trim());
    }
}