/**************************************************************************
 OmegaT - Computer Assisted Translation (CAT) tool
          with fuzzy matching, translation memory, keyword search,
          glossaries, and translation leveraging into updated projects.

 Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk
               Home page: http://www.omegat.org/
               Support center: http://groups.yahoo.com/group/OmegaT/

 This file is part of OmegaT.

 OmegaT is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 OmegaT is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 **************************************************************************/

package org.omegat.tokenizer;

import java.text.BreakIterator;
import java.util.LinkedList;

import org.omegat.util.PatternConsts;

/**
 * BreakIterator for word-breaks with OmegaT heuristics, based on an instance of
 * BreakIterator implementing word breaks.
 * <p>
 * Only forward iteration over a {@link String} is supported:
 * {@link #setText(String)}, {@link #first()}, {@link #current()} and
 * {@link #next()}. Every other operation throws
 * {@link UnsupportedOperationException}.
 *
 * @see java.text.BreakIterator#getWordInstance
 * @author Maxym Mykhalchuk
 */
public class WordIterator extends BreakIterator {

    /** The underlying word-break iterator whose boundaries are regrouped. */
    BreakIterator breaker;

    /** The text currently being scanned, as passed to {@link #setText(String)}. */
    String text;

    /**
     * Boundaries that the underlying iterator has already stepped over while
     * looking ahead, but that have not been returned to the caller yet. They
     * are handed out, in order, by subsequent calls to {@link #next()}.
     */
    LinkedList<Integer> nextItems = new LinkedList<Integer>();

    /**
     * The boundary most recently returned to the caller. Tracked separately
     * because the underlying iterator may be positioned further ahead while
     * {@link #nextItems} still holds unreturned boundaries.
     */
    private int lastBoundary;

    /** Creates a new instance of OmegaT's own word BreakIterator */
    public WordIterator() {
        breaker = BreakIterator.getWordInstance();
        lastBoundary = breaker.current();
    }

    /**
     * Set a new text string to be scanned. The current scan position is reset
     * to first().
     *
     * @param newText
     *            new text to scan.
     */
    @Override
    public void setText(String newText) {
        text = newText;
        breaker.setText(newText);
        nextItems.clear();
        lastBoundary = breaker.current();
    }

    /**
     * Return the first boundary. The iterator's current position is set to the
     * first boundary, and any look-ahead boundaries queued by a previous scan
     * are discarded.
     *
     * @return The character index of the first text boundary.
     */
    @Override
    public int first() {
        // Drop boundaries queued during an earlier scan; otherwise the next
        // call to next() would return them instead of restarting.
        nextItems.clear();
        lastBoundary = breaker.first();
        return lastBoundary;
    }

    /**
     * Return character index of the text boundary that was most recently
     * returned by next() or first() (or the initial position right after
     * setText()).
     *
     * @return The boundary most recently returned.
     */
    @Override
    public int current() {
        return lastBoundary;
    }

    /**
     * Return the boundary of the word following the current boundary.
     * <p>
     * Note: This iterator skips OmegaT-specific tags, and groups
     * [text-]mnemonics-text into a single token.
     *
     * @return The character index of the next text boundary or DONE if all
     *         boundaries have been returned. Equivalent to next(1).
     */
    @Override
    public int next() {
        int boundary;
        if (nextItems.isEmpty()) {
            boundary = scanNext();
        } else {
            boundary = nextItems.removeFirst();
        }
        if (DONE != boundary) {
            lastBoundary = boundary;
        }
        return boundary;
    }

    /**
     * Advances the underlying iterator by one raw word and applies the OmegaT
     * grouping heuristics to it.
     *
     * @return the end of the (possibly grouped) token, or DONE at end of text.
     */
    private int scanNext() {
        // When this is called the queue is empty, so the underlying iterator
        // is positioned exactly at the last boundary handed to the caller.
        int curr = breaker.current();
        int next = breaker.next();
        if (DONE == next) {
            return DONE;
        }

        String str = text.substring(curr, next);
        if (str.equals("<")) {
            return scanTag(next);
        }
        if (str.equals("&")) {
            return scanMnemonic(next);
        }
        if (Character.isLetterOrDigit(str.codePointAt(0))) {
            return scanWord(next);
        }
        return next;
    }

    /**
     * Tries to group an OmegaT tag that starts with the "&lt;" ending at
     * {@code next}: an opening tag ({@code <x>}), a closing tag
     * ({@code </x>}) or a standalone tag ({@code <x/>}), where {@code x}
     * matches {@link PatternConsts#OMEGAT_TAG_ONLY}.
     *
     * @param next
     *            end of the "&lt;" token.
     * @return the end of the whole tag if one was recognized, otherwise
     *         {@code next} (the underlying iterator is rewound, or the
     *         consumed boundaries are queued when the text ran out).
     */
    private int scanTag(int next) {
        int next2 = breaker.next();
        if (DONE == next2) {
            return next;
        }

        int next3 = breaker.next();
        if (DONE == next3) {
            nextItems.add(next2);
            return next;
        }

        // there're at least two maybe-words after "<"
        String str2 = text.substring(next, next2);
        String str3 = text.substring(next2, next3);

        if (str2.equals("/")) {
            // maybe closing tag: "<" "/" name ">"
            if (!isTagName(str3)) {
                rewind(2);
                return next;
            }
            return finishTag(next, next2, next3);
        }

        if (!isTagName(str2)) {
            rewind(2);
            return next;
        }

        if (str3.equals("/")) {
            // maybe standalone tag: "<" name "/" ">"
            return finishTag(next, next2, next3);
        }
        if (str3.equals(">")) {
            return next3; // yes, it's an OmegaT tag
        }

        rewind(2);
        return next;
    }

    /**
     * Reads the fourth raw token of a closing or standalone tag candidate and
     * checks that it is the terminating "&gt;".
     *
     * @param next
     *            end of the "&lt;" token.
     * @param next2
     *            end of the second raw token.
     * @param next3
     *            end of the third raw token.
     * @return the end of the "&gt;" token if the tag is complete, otherwise
     *         {@code next}.
     */
    private int finishTag(int next, int next2, int next3) {
        int next4 = breaker.next();
        if (DONE == next4) {
            // Text ended: hand out the already consumed boundaries one by one.
            nextItems.add(next2);
            nextItems.add(next3);
            return next;
        }

        // there're at least three maybe-words after "<"
        String str4 = text.substring(next3, next4);
        if (str4.equals(">")) {
            return next4; // yes, it's a closing or standalone tag
        }

        rewind(3);
        return next;
    }

    /**
     * Tries to group a leading mnemonic marker with the word that follows it
     * (like {@code &File}).
     *
     * @param next
     *            end of the "&amp;" token.
     * @return the end of the following token if it starts with a letter or
     *         digit, otherwise {@code next}.
     */
    private int scanMnemonic(int next) {
        int next2 = breaker.next();
        if (DONE == next2) {
            return next;
        }

        String str2 = text.substring(next, next2);
        if (Character.isLetterOrDigit(str2.codePointAt(0))) {
            return next2;
        }

        rewind(1);
        return next;
    }

    /**
     * Tries to group a word with an embedded mnemonic marker and the word
     * after it (like {@code Some&thing}).
     *
     * @param next
     *            end of the first word.
     * @return the end of the word after "&amp;" if the pattern matched,
     *         otherwise {@code next}.
     */
    private int scanWord(int next) {
        // trying to see whether the next "word" is a "&"
        int next2 = breaker.next();
        if (DONE == next2) {
            return next;
        }

        String str2 = text.substring(next, next2);
        if (!str2.equals("&")) {
            rewind(1);
            return next;
        }

        // yes, it's there
        int next3 = breaker.next();
        if (DONE == next3) {
            // Something&
            nextItems.add(next2);
            return next;
        }

        // is it followed by a word like Some&thing
        String str3 = text.substring(next2, next3);
        if (Character.isLetterOrDigit(str3.codePointAt(0))) {
            return next3; // oh yes
        }

        // oh no
        rewind(2);
        return next;
    }

    /**
     * Tells whether a raw token consists of an OmegaT tag name only.
     *
     * @param token
     *            raw token to test.
     * @return true if the whole token matches
     *         {@link PatternConsts#OMEGAT_TAG_ONLY}.
     */
    private boolean isTagName(String token) {
        return PatternConsts.OMEGAT_TAG_ONLY.matcher(token).matches();
    }

    /**
     * Moves the underlying iterator back by the given number of boundaries,
     * undoing look-ahead that did not lead to a grouped token.
     *
     * @param times
     *            how many boundaries to step back.
     */
    private void rewind(int times) {
        for (int i = 0; i < times; i++) {
            breaker.previous();
        }
    }

    // ////////////////////////////////////////////////////////////////////////
    // Not yet implemented
    // ////////////////////////////////////////////////////////////////////////

    /**
     * <b>Not yet implemented! Throws an UnsupportedOperationException (a
     * RuntimeException) if you try to call it.</b>
     *
     * Return the nth boundary from the current boundary
     *
     * @param n
     *            which boundary to return. A value of 0 does nothing. Negative
     *            values move to previous boundaries and positive values move to
     *            later boundaries.
     * @return The index of the nth boundary from the current position.
     * @throws UnsupportedOperationException
     *             always.
     */
    @Override
    public int next(int n) {
        throw new UnsupportedOperationException("Not Implemented");
    }

    /**
     * <b>Not yet implemented! Throws an UnsupportedOperationException (a
     * RuntimeException) if you try to call it.</b>
     *
     * Return the first boundary following the specified offset. The value
     * returned is always greater than the offset or the value
     * BreakIterator.DONE
     *
     * @param offset
     *            the offset to begin scanning. Valid values are determined by
     *            the CharacterIterator passed to setText(). Invalid values
     *            cause an IllegalArgumentException to be thrown.
     * @return The first boundary after the specified offset.
     * @throws UnsupportedOperationException
     *             always.
     */
    @Override
    public int following(int offset) {
        throw new UnsupportedOperationException("Not Implemented");
    }

    /**
     * <b>Not yet implemented! Throws an UnsupportedOperationException (a
     * RuntimeException) if you try to call it.</b>
     *
     * Set a new text for scanning. The current scan position is reset to
     * first().
     *
     * @param newText
     *            new text to scan.
     * @throws UnsupportedOperationException
     *             always.
     */
    @Override
    public void setText(java.text.CharacterIterator newText) {
        throw new UnsupportedOperationException("Not Implemented");
    }

    /**
     * <b>Not yet implemented! Throws an UnsupportedOperationException (a
     * RuntimeException) if you try to call it.</b>
     *
     * Get the text being scanned
     *
     * @return the text being scanned
     * @throws UnsupportedOperationException
     *             always.
     */
    @Override
    public java.text.CharacterIterator getText() {
        throw new UnsupportedOperationException("Not Implemented");
    }

    /**
     * <b>Not yet implemented! Throws an UnsupportedOperationException (a
     * RuntimeException) if you try to call it.</b>
     *
     * Return the boundary preceding the current boundary.
     *
     * @return The character index of the previous text boundary or DONE if all
     *         boundaries have been returned.
     * @throws UnsupportedOperationException
     *             always.
     */
    @Override
    public int previous() {
        throw new UnsupportedOperationException("Not Implemented");
    }

    /**
     * <b>Not yet implemented! Throws an UnsupportedOperationException (a
     * RuntimeException) if you try to call it.</b>
     *
     * Return the last boundary. The iterator's current position is set to the
     * last boundary.
     *
     * @return The character index of the last text boundary.
     * @throws UnsupportedOperationException
     *             always.
     */
    @Override
    public int last() {
        throw new UnsupportedOperationException("Not Implemented");
    }
}