/* Jazzy - a Java library for Spell Checking Copyright (C) 2001 Mindaugas Idzelis Full text of license can be found in LICENSE.txt This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ package com.swabunga.spell.event; import java.text.BreakIterator; import java.text.CharacterIterator; import javax.swing.text.AttributeSet; import javax.swing.text.BadLocationException; import javax.swing.text.Document; import javax.swing.text.Segment; import javax.swing.text.StyledDocument; /** * This class tokenizes a swing document model. It also allows for the document model to be changed when corrections occur. * * @author Jason Height (jheight@chariot.net.au) */ public class DocumentWordTokenizer implements WordTokenizer { /** Holds the start character position of the current word */ private int currentWordPos = 0; /** Holds the end character position of the current word */ private int currentWordEnd = 0; /** Holds the start character position of the next word */ private int nextWordPos = -1; /** The actual text that is being tokenized */ private Document document; /** The character iterator over the document */ private Segment text; /** The cumulative word count that have been processed */ private int wordCount = 0; /** Flag indicating if there are any more tokens (words) left */ private boolean moreTokens = true; /** * Is this a special case where the currentWordStart, currntWordEnd and nextWordPos have already been calculated. (see nextWord) */ private boolean first = true; private BreakIterator sentenceIterator; private boolean startsSentence = true; public DocumentWordTokenizer(Document document) { this.document = document; // Create a text segment over the etire document text = new Segment(); sentenceIterator = BreakIterator.getSentenceInstance(); try { document.getText(0, document.getLength(), text); sentenceIterator.setText(text); currentWordPos = getNextWordStart(text, text.getBeginIndex()); // If the current word pos is -1 then the string was all white space if (currentWordPos != -1) { currentWordEnd = getNextWordEnd(text, currentWordPos); nextWordPos = getNextWordStart(text, currentWordEnd); } else { moreTokens = false; } } catch (BadLocationException ex) { moreTokens = false; } } /** * This helper method will return the start character of the next word in the buffer from the start position */ private static int getNextWordStart(Segment text, int startPos) { if (startPos <= text.getEndIndex()) { for (char ch = text.setIndex(startPos); ch != CharacterIterator.DONE; ch = text.next()) { if (Character.isLetterOrDigit(ch)) { return text.getIndex(); } } } return -1; } /** * This helper method will return the end of the next word in the buffer. * */ private static int getNextWordEnd(Segment text, int startPos) { for (char ch = text.setIndex(startPos); ch != CharacterIterator.DONE; ch = text.next()) { if (!Character.isLetterOrDigit(ch)) { if (ch == '-' || ch == '\'') { // handle ' and - inside words char ch2 = text.next(); text.previous(); if (ch2 != CharacterIterator.DONE && Character.isLetterOrDigit(ch2)) { continue; } } return text.getIndex(); } } return text.getEndIndex(); } /** * Returns true if there are more words that can be processed in the string * */ @Override public boolean hasMoreWords() { return moreTokens; } /** * Sets the current word position at the start of the word containing the char at position pos. This way a call to nextWord() will * return this word. * * @param pos * position in the word we want to set as current. */ public void posStartFullWordFrom(int pos) { currentWordPos = text.getBeginIndex(); if (pos > text.getEndIndex()) { pos = text.getEndIndex(); } for (char ch = text.setIndex(pos); ch != CharacterIterator.DONE; ch = text.previous()) { if (!Character.isLetterOrDigit(ch)) { if (ch == '-' || ch == '\'') { // handle ' and - inside words char ch2 = text.previous(); text.next(); if (ch2 != CharacterIterator.DONE && Character.isLetterOrDigit(ch2)) { continue; } } currentWordPos = text.getIndex() + 1; break; } } // System.out.println("CurPos:"+currentWordPos); if (currentWordPos == 0) { first = true; } moreTokens = true; currentWordEnd = getNextWordEnd(text, currentWordPos); nextWordPos = getNextWordStart(text, currentWordEnd + 1); } /** * Returns the current character position in the text * */ @Override public int getCurrentWordPosition() { return currentWordPos; } /** * Returns the current end word position in the text * */ @Override public int getCurrentWordEnd() { return currentWordEnd; } /** * Returns the next word in the text * */ @Override public String nextWord() { if (!first) { currentWordPos = nextWordPos; currentWordEnd = getNextWordEnd(text, currentWordPos); nextWordPos = getNextWordStart(text, currentWordEnd + 1); } int current = sentenceIterator.current(); if (current == currentWordPos) { startsSentence = true; } else { startsSentence = false; if (currentWordEnd > current) { sentenceIterator.next(); } } // The nextWordPos has already been populated String word = null; try { word = document.getText(currentWordPos, currentWordEnd - currentWordPos); } catch (BadLocationException ex) { moreTokens = false; } wordCount++; first = false; if (nextWordPos == -1) { moreTokens = false; } return word; } /** * Returns the current number of words that have been processed * */ @Override public int getCurrentWordCount() { return wordCount; } /** Replaces the current word token */ @Override public void replaceWord(String newWord) { AttributeSet attr = null; if (currentWordPos != -1) { try { if (document instanceof StyledDocument) { attr = ((StyledDocument) document).getCharacterElement(currentWordPos).getAttributes(); } document.remove(currentWordPos, currentWordEnd - currentWordPos); document.insertString(currentWordPos, newWord, attr); // Need to reset the segment document.getText(0, document.getLength(), text); } catch (BadLocationException ex) { throw new RuntimeException(ex.getMessage()); } // Position after the newly replaced word(s) first = true; currentWordPos = getNextWordStart(text, currentWordPos + newWord.length()); if (currentWordPos != -1) { currentWordEnd = getNextWordEnd(text, currentWordPos); nextWordPos = getNextWordStart(text, currentWordEnd); sentenceIterator.setText(text); sentenceIterator.following(currentWordPos); } else { moreTokens = false; } } } /** * Returns the current text that is being tokenized (includes any changes that have been made) */ @Override public String getContext() { return text.toString(); } /** Returns true if the current word is at the start of a sentence */ @Override public boolean isNewSentence() { // BreakIterator doesn't work when the first word in a sentence is not capitalised, // but we need to check for capitalisation if (startsSentence || currentWordPos < 2) { return true; } String textBefore = null; try { textBefore = document.getText(currentWordPos - 2, 2); } catch (BadLocationException ex) { return false; } return textBefore != null && ".".equals(textBefore.trim()); } }