WordIterator.java example

Explorer
OmegaT-master
/**************************************************************************
 OmegaT - Computer Assisted Translation (CAT) tool
          with fuzzy matching, translation memory, keyword search,
          glossaries, and translation leveraging into updated projects.

 Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk
               Home page: http://www.omegat.org/
               Support center: http://groups.yahoo.com/group/OmegaT/

 This file is part of OmegaT.

 OmegaT is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 OmegaT is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 **************************************************************************/

package org.omegat.tokenizer;

import java.text.BreakIterator;
import java.util.LinkedList;

import org.omegat.util.PatternConsts;

/**
 * BreakIterator for word-breaks with OmegaT heuristics, based on an instance of
 * BreakIterator implementing word breaks.
 *
 * @see java.text.BreakIterator#getWordInstance
 * @author Maxym Mykhalchuk
 */
public class WordIterator extends BreakIterator {
    /** Underlying word-break iterator that supplies the raw boundaries. */
    BreakIterator breaker;

    /** Text currently being scanned, as passed to {@link #setText(String)}. */
    String text;

    /**
     * Boundaries already consumed from {@link #breaker} during a look-ahead
     * that ran into the end of the text. They could not be rewound, so they are
     * handed out by the following calls to {@link #next()} before the
     * underlying iterator is consulted again.
     */
    LinkedList<Integer> nextItems = new LinkedList<Integer>();

    /**
     * Boundary most recently handed to the caller by {@link #first()} or
     * {@link #next()}. Kept separately because a look-ahead may leave
     * {@link #breaker} positioned past the boundary that was actually returned.
     */
    private int position;

    /** Creates a new instance of OmegaT's own word BreakIterator */
    public WordIterator() {
        breaker = BreakIterator.getWordInstance();
        position = breaker.current();
    }

    /**
     * Set a new text string to be scanned. The current scan position is reset
     * to first(), and any boundaries queued by an earlier look-ahead are
     * discarded.
     *
     * @param newText
     *            new text to scan.
     */
    @Override
    public void setText(String newText) {
        text = newText;
        breaker.setText(newText);
        nextItems.clear();
        position = breaker.current();
    }

    /**
     * Return the first boundary. The iterator's current position is set to the
     * first boundary, and any boundaries queued by an earlier look-ahead are
     * discarded so that they cannot leak into the restarted iteration.
     *
     * @return The character index of the first text boundary.
     */
    @Override
    public int first() {
        nextItems.clear();
        position = breaker.first();
        return position;
    }

    /**
     * Return character index of the text boundary that was most recently
     * returned by next() or first().
     *
     * @return The boundary most recently returned.
     */
    @Override
    public int current() {
        return position;
    }

    /**
     * Return the boundary of the word following the current boundary.
     * <p>
     * Note: This iterator groups tag-like sequences ({@code <name>},
     * {@code </name>} and {@code <name/>}, where {@code name} matches
     * {@code PatternConsts.OMEGAT_TAG_ONLY}) into a single token, and groups
     * [text-]mnemonics-text ({@code &text} and {@code text&text}) into a
     * single token.
     *
     * @return The character index of the next text boundary or DONE if all
     *         boundaries have been returned.
     */
    @Override
    public int next() {
        if (!nextItems.isEmpty()) {
            // boundaries left over from a look-ahead that reached end of text
            position = nextItems.removeFirst();
            return position;
        }

        int start = breaker.current();
        int end = breaker.next();
        if (DONE == end) {
            return DONE;
        }

        String token = text.substring(start, end);
        int boundary;
        if (token.equals("<")) {
            boundary = scanTag(end);
        } else if (token.equals("&")) {
            boundary = scanMnemonicAfterAmpersand(end);
        } else if (Character.isLetterOrDigit(token.codePointAt(0))) {
            boundary = scanMnemonicAfterWord(end);
        } else {
            boundary = end;
        }
        position = boundary;
        return boundary;
    }

    /**
     * Looks ahead after a "&lt;" token and decides whether it opens a tag.
     *
     * @param afterBracket
     *            boundary right after the "&lt;" token
     * @return the boundary after the closing "&gt;" when a whole tag was
     *         recognized, otherwise {@code afterBracket}
     */
    private int scanTag(int afterBracket) {
        int second = breaker.next();
        if (DONE == second) {
            return afterBracket;
        }

        int third = breaker.next();
        if (DONE == third) {
            nextItems.add(second);
            return afterBracket;
        }

        // there are at least two maybe-words after "<"
        String secondToken = text.substring(afterBracket, second);
        String thirdToken = text.substring(second, third);

        if (secondToken.equals("/")) {
            // maybe closing tag: "<" "/" name ">"
            if (!PatternConsts.OMEGAT_TAG_ONLY.matcher(thirdToken).matches()) {
                rewind(2);
                return afterBracket;
            }
            return finishSlashedTag(afterBracket, second, third);
        }

        if (!PatternConsts.OMEGAT_TAG_ONLY.matcher(secondToken).matches()) {
            rewind(2);
            return afterBracket;
        }

        if (thirdToken.equals("/")) {
            // maybe standalone tag: "<" name "/" ">"
            return finishSlashedTag(afterBracket, second, third);
        }
        if (thirdToken.equals(">")) {
            return third; // yes, it's an OmegaT tag
        }

        rewind(2);
        return afterBracket;
    }

    /**
     * Checks that the token following an already accepted three-token tag
     * prefix is the closing "&gt;".
     *
     * @param afterBracket
     *            boundary right after the "&lt;" token
     * @param second
     *            boundary after the second token of the candidate tag
     * @param third
     *            boundary after the third token of the candidate tag
     * @return the boundary after "&gt;" when the tag is complete, otherwise
     *         {@code afterBracket}
     */
    private int finishSlashedTag(int afterBracket, int second, int third) {
        int fourth = breaker.next();
        if (DONE == fourth) {
            // end of text: cannot rewind past DONE, so queue what was consumed
            nextItems.add(second);
            nextItems.add(third);
            return afterBracket;
        }

        // there are at least three maybe-words after "<"
        if (text.substring(third, fourth).equals(">")) {
            return fourth; // yes, the tag is complete
        }

        rewind(3);
        return afterBracket;
    }

    /**
     * Looks ahead after a "&amp;" token for a mnemonic such as {@code &text}.
     *
     * @param afterAmpersand
     *            boundary right after the "&amp;" token
     * @return the boundary after the following token when that token starts
     *         with a letter or digit, otherwise {@code afterAmpersand}
     */
    private int scanMnemonicAfterAmpersand(int afterAmpersand) {
        int second = breaker.next();
        if (DONE == second) {
            return afterAmpersand;
        }

        String secondToken = text.substring(afterAmpersand, second);
        if (Character.isLetterOrDigit(secondToken.codePointAt(0))) {
            return second;
        }

        rewind(1);
        return afterAmpersand;
    }

    /**
     * Looks ahead after a word token for an embedded mnemonic such as
     * {@code Some&thing}.
     *
     * @param afterWord
     *            boundary right after the word token
     * @return the boundary after the word that follows "&amp;" when the
     *         pattern word-"&amp;"-word is present, otherwise
     *         {@code afterWord}
     */
    private int scanMnemonicAfterWord(int afterWord) {
        int second = breaker.next();
        if (DONE == second) {
            return afterWord;
        }

        if (!text.substring(afterWord, second).equals("&")) {
            rewind(1);
            return afterWord;
        }

        int third = breaker.next();
        if (DONE == third) {
            // "Something&" at the end of the text
            nextItems.add(second);
            return afterWord;
        }

        // is it followed by a word like Some&thing
        String thirdToken = text.substring(second, third);
        if (Character.isLetterOrDigit(thirdToken.codePointAt(0))) {
            return third;
        }

        rewind(2);
        return afterWord;
    }

    /**
     * Moves the underlying iterator back by the given number of boundaries,
     * undoing a look-ahead that did not pan out.
     *
     * @param steps
     *            number of boundaries to step back
     */
    private void rewind(int steps) {
        for (int i = 0; i < steps; i++) {
            breaker.previous();
        }
    }

    // ////////////////////////////////////////////////////////////////////////
    // Not yet implemented
    // ////////////////////////////////////////////////////////////////////////

    /**
     * <b>Not yet implemented! Throws a RuntimeException if you try to call
     * it.</b>
     *
     * Return the nth boundary from the current boundary
     *
     * @param n
     *            which boundary to return. A value of 0 does nothing. Negative
     *            values move to previous boundaries and positive values move to
     *            later boundaries.
     * @return The index of the nth boundary from the current position.
     * @throws UnsupportedOperationException
     *             always
     */
    @Override
    public int next(int n) {
        throw new UnsupportedOperationException("Not Implemented");
    }

    /**
     * <b>Not yet implemented! Throws a RuntimeException if you try to call
     * it.</b>
     *
     * Return the first boundary following the specified offset. The value
     * returned is always greater than the offset or the value
     * BreakIterator.DONE
     *
     * @param offset
     *            the offset to begin scanning. Valid values are determined by
     *            the CharacterIterator passed to setText(). Invalid values
     *            cause an IllegalArgumentException to be thrown.
     * @return The first boundary after the specified offset.
     * @throws UnsupportedOperationException
     *             always
     */
    @Override
    public int following(int offset) {
        throw new UnsupportedOperationException("Not Implemented");
    }

    /**
     * <b>Not yet implemented! Throws a RuntimeException if you try to call
     * it.</b>
     *
     * Set a new text for scanning. The current scan position is reset to
     * first().
     *
     * @param newText
     *            new text to scan.
     * @throws UnsupportedOperationException
     *             always
     */
    @Override
    public void setText(java.text.CharacterIterator newText) {
        throw new UnsupportedOperationException("Not Implemented");
    }

    /**
     * <b>Not yet implemented! Throws a RuntimeException if you try to call
     * it.</b>
     *
     * Get the text being scanned
     *
     * @return the text being scanned
     * @throws UnsupportedOperationException
     *             always
     */
    @Override
    public java.text.CharacterIterator getText() {
        throw new UnsupportedOperationException("Not Implemented");
    }

    /**
     * <b>Not yet implemented! Throws a RuntimeException if you try to call
     * it.</b>
     *
     * Return the boundary preceding the current boundary.
     *
     * @return The character index of the previous text boundary or DONE if all
     *         boundaries have been returned.
     * @throws UnsupportedOperationException
     *             always
     */
    @Override
    public int previous() {
        throw new UnsupportedOperationException("Not Implemented");
    }

    /**
     * <b>Not yet implemented! Throws a RuntimeException if you try to call
     * it.</b>
     *
     * Return the last boundary. The iterator's current position is set to the
     * last boundary.
     *
     * @return The character index of the last text boundary.
     * @throws UnsupportedOperationException
     *             always
     */
    @Override
    public int last() {
        throw new UnsupportedOperationException("Not Implemented");
    }

}