/* * Copyright (c) 1996, 2006, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this * particular file as subject to the "Classpath" exception as provided * by Oracle in the LICENSE file that accompanied this code. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. */ /* * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved * * The original version of this source code and documentation * is copyrighted and owned by Taligent, Inc., a wholly-owned * subsidiary of IBM. These materials are provided under terms * of a License Agreement between Taligent and Sun. This technology * is protected by multiple US and International patents. * * This notice and attribution to Taligent may not be removed. * Taligent is a registered trademark of Taligent, Inc. * */ package java.text; import java.lang.ref.SoftReference; import java.net.URL; import java.io.InputStream; import java.io.IOException; import java.security.AccessController; import java.security.PrivilegedAction; import java.text.CharacterIterator; import java.text.StringCharacterIterator; import java.text.spi.BreakIteratorProvider; import java.util.Locale; import java.util.MissingResourceException; import java.util.ResourceBundle; import java.util.spi.LocaleServiceProvider; import sun.util.LocaleServiceProviderPool; import sun.util.resources.LocaleData; /** {@collect.stats} * The <code>BreakIterator</code> class implements methods for finding * the location of boundaries in text. Instances of <code>BreakIterator</code> * maintain a current position and scan over text * returning the index of characters where boundaries occur. * Internally, <code>BreakIterator</code> scans text using a * <code>CharacterIterator</code>, and is thus able to scan text held * by any object implementing that protocol. A <code>StringCharacterIterator</code> * is used to scan <code>String</code> objects passed to <code>setText</code>. * * <p> * You use the factory methods provided by this class to create * instances of various types of break iterators. In particular, * use <code>getWordInstance</code>, <code>getLineInstance</code>, * <code>getSentenceInstance</code>, and <code>getCharacterInstance</code> * to create <code>BreakIterator</code>s that perform * word, line, sentence, and character boundary analysis respectively. * A single <code>BreakIterator</code> can work only on one unit * (word, line, sentence, and so on). You must use a different iterator * for each unit boundary analysis you wish to perform. * * <p><a name="line"></a> * Line boundary analysis determines where a text string can be * broken when line-wrapping. The mechanism correctly handles * punctuation and hyphenated words. Actual line breaking needs * to also consider the available line width and is handled by * higher-level software. * * <p><a name="sentence"></a> * Sentence boundary analysis allows selection with correct interpretation * of periods within numbers and abbreviations, and trailing punctuation * marks such as quotation marks and parentheses. * * <p><a name="word"></a> * Word boundary analysis is used by search and replace functions, as * well as within text editing applications that allow the user to * select words with a double click. Word selection provides correct * interpretation of punctuation marks within and following * words. Characters that are not part of a word, such as symbols * or punctuation marks, have word-breaks on both sides. * * <p><a name="character"></a> * Character boundary analysis allows users to interact with characters * as they expect to, for example, when moving the cursor through a text * string. Character boundary analysis provides correct navigation * through character strings, regardless of how the character is stored. * The boundaries returned may be those of supplementary characters, * combining character sequences, or ligature clusters. * For example, an accented character might be stored as a base character * and a diacritical mark. What users consider to be a character can * differ between languages. * * <p> * The <code>BreakIterator</code> instances returned by the factory methods * of this class are intended for use with natural languages only, not for * programming language text. It is however possible to define subclasses * that tokenize a programming language. * * <P> * <strong>Examples</strong>:<P> * Creating and using text boundaries: * <blockquote> * <pre> * public static void main(String args[]) { * if (args.length == 1) { * String stringToExamine = args[0]; * //print each word in order * BreakIterator boundary = BreakIterator.getWordInstance(); * boundary.setText(stringToExamine); * printEachForward(boundary, stringToExamine); * //print each sentence in reverse order * boundary = BreakIterator.getSentenceInstance(Locale.US); * boundary.setText(stringToExamine); * printEachBackward(boundary, stringToExamine); * printFirst(boundary, stringToExamine); * printLast(boundary, stringToExamine); * } * } * </pre> * </blockquote> * * Print each element in order: * <blockquote> * <pre> * public static void printEachForward(BreakIterator boundary, String source) { * int start = boundary.first(); * for (int end = boundary.next(); * end != BreakIterator.DONE; * start = end, end = boundary.next()) { * System.out.println(source.substring(start,end)); * } * } * </pre> * </blockquote> * * Print each element in reverse order: * <blockquote> * <pre> * public static void printEachBackward(BreakIterator boundary, String source) { * int end = boundary.last(); * for (int start = boundary.previous(); * start != BreakIterator.DONE; * end = start, start = boundary.previous()) { * System.out.println(source.substring(start,end)); * } * } * </pre> * </blockquote> * * Print first element: * <blockquote> * <pre> * public static void printFirst(BreakIterator boundary, String source) { * int start = boundary.first(); * int end = boundary.next(); * System.out.println(source.substring(start,end)); * } * </pre> * </blockquote> * * Print last element: * <blockquote> * <pre> * public static void printLast(BreakIterator boundary, String source) { * int end = boundary.last(); * int start = boundary.previous(); * System.out.println(source.substring(start,end)); * } * </pre> * </blockquote> * * Print the element at a specified position: * <blockquote> * <pre> * public static void printAt(BreakIterator boundary, int pos, String source) { * int end = boundary.following(pos); * int start = boundary.previous(); * System.out.println(source.substring(start,end)); * } * </pre> * </blockquote> * * Find the next word: * <blockquote> * <pre> * public static int nextWordStartAfter(int pos, String text) { * BreakIterator wb = BreakIterator.getWordInstance(); * wb.setText(text); * int last = wb.following(pos); * int current = wb.next(); * while (current != BreakIterator.DONE) { * for (int p = last; p < current; p++) { * if (Character.isLetter(text.codePointAt(p))) * return last; * } * last = current; * current = wb.next(); * } * return BreakIterator.DONE; * } * </pre> * (The iterator returned by BreakIterator.getWordInstance() is unique in that * the break positions it returns don't represent both the start and end of the * thing being iterated over. That is, a sentence-break iterator returns breaks * that each represent the end of one sentence and the beginning of the next. * With the word-break iterator, the characters between two boundaries might be a * word, or they might be the punctuation or whitespace between two words. The * above code uses a simple heuristic to determine which boundary is the beginning * of a word: If the characters between this boundary and the next boundary * include at least one letter (this can be an alphabetical letter, a CJK ideograph, * a Hangul syllable, a Kana character, etc.), then the text between this boundary * and the next is a word; otherwise, it's the material between words.) * </blockquote> * * @see CharacterIterator * */ public abstract class BreakIterator implements Cloneable { /** {@collect.stats} * Constructor. BreakIterator is stateless and has no default behavior. */ protected BreakIterator() { } /** {@collect.stats} * Create a copy of this iterator * @return A copy of this */ public Object clone() { try { return super.clone(); } catch (CloneNotSupportedException e) { throw new InternalError(); } } /** {@collect.stats} * DONE is returned by previous(), next(), next(int), preceding(int) * and following(int) when either the first or last text boundary has been * reached. */ public static final int DONE = -1; /** {@collect.stats} * Returns the first boundary. The iterator's current position is set * to the first text boundary. * @return The character index of the first text boundary. */ public abstract int first(); /** {@collect.stats} * Returns the last boundary. The iterator's current position is set * to the last text boundary. * @return The character index of the last text boundary. */ public abstract int last(); /** {@collect.stats} * Returns the nth boundary from the current boundary. If either * the first or last text boundary has been reached, it returns * <code>BreakIterator.DONE</code> and the current position is set to either * the first or last text boundary depending on which one is reached. Otherwise, * the iterator's current position is set to the new boundary. * For example, if the iterator's current position is the mth text boundary * and three more boundaries exist from the current boundary to the last text * boundary, the next(2) call will return m + 2. The new text position is set * to the (m + 2)th text boundary. A next(4) call would return * <code>BreakIterator.DONE</code> and the last text boundary would become the * new text position. * @param n which boundary to return. A value of 0 * does nothing. Negative values move to previous boundaries * and positive values move to later boundaries. * @return The character index of the nth boundary from the current position * or <code>BreakIterator.DONE</code> if either first or last text boundary * has been reached. */ public abstract int next(int n); /** {@collect.stats} * Returns the boundary following the current boundary. If the current boundary * is the last text boundary, it returns <code>BreakIterator.DONE</code> and * the iterator's current position is unchanged. Otherwise, the iterator's * current position is set to the boundary following the current boundary. * @return The character index of the next text boundary or * <code>BreakIterator.DONE</code> if the current boundary is the last text * boundary. * Equivalent to next(1). * @see #next(int) */ public abstract int next(); /** {@collect.stats} * Returns the boundary preceding the current boundary. If the current boundary * is the first text boundary, it returns <code>BreakIterator.DONE</code> and * the iterator's current position is unchanged. Otherwise, the iterator's * current position is set to the boundary preceding the current boundary. * @return The character index of the previous text boundary or * <code>BreakIterator.DONE</code> if the current boundary is the first text * boundary. */ public abstract int previous(); /** {@collect.stats} * Returns the first boundary following the specified character offset. If the * specified offset equals to the last text boundary, it returns * <code>BreakIterator.DONE</code> and the iterator's current position is unchanged. * Otherwise, the iterator's current position is set to the returned boundary. * The value returned is always greater than the offset or the value * <code>BreakIterator.DONE</code>. * @param offset the character offset to begin scanning. * @return The first boundary after the specified offset or * <code>BreakIterator.DONE</code> if the last text boundary is passed in * as the offset. * @exception IllegalArgumentException if the specified offset is less than * the first text boundary or greater than the last text boundary. */ public abstract int following(int offset); /** {@collect.stats} * Returns the last boundary preceding the specified character offset. If the * specified offset equals to the first text boundary, it returns * <code>BreakIterator.DONE</code> and the iterator's current position is unchanged. * Otherwise, the iterator's current position is set to the returned boundary. * The value returned is always less than the offset or the value * <code>BreakIterator.DONE</code>. * @param offset the characater offset to begin scanning. * @return The last boundary before the specified offset or * <code>BreakIterator.DONE</code> if the first text boundary is passed in * as the offset. * @exception IllegalArgumentException if the specified offset is less than * the first text boundary or greater than the last text boundary. * @since 1.2 */ public int preceding(int offset) { // NOTE: This implementation is here solely because we can't add new // abstract methods to an existing class. There is almost ALWAYS a // better, faster way to do this. int pos = following(offset); while (pos >= offset && pos != DONE) pos = previous(); return pos; } /** {@collect.stats} * Returns true if the specified character offset is a text boundary. * @param offset the character offset to check. * @return <code>true</code> if "offset" is a boundary position, * <code>false</code> otherwise. * @since 1.2 */ public boolean isBoundary(int offset) { // NOTE: This implementation probably is wrong for most situations // because it fails to take into account the possibility that a // CharacterIterator passed to setText() may not have a begin offset // of 0. But since the abstract BreakIterator doesn't have that // knowledge, it assumes the begin offset is 0. If you subclass // BreakIterator, copy the SimpleTextBoundary implementation of this // function into your subclass. [This should have been abstract at // this level, but it's too late to fix that now.] if (offset == 0) return true; else return following(offset - 1) == offset; } /** {@collect.stats} * Returns character index of the text boundary that was most * recently returned by next(), next(int), previous(), first(), last(), * following(int) or preceding(int). If any of these methods returns * <code>BreakIterator.DONE</code> because either first or last text boundary * has been reached, it returns the first or last text boundary depending on * which one is reached. * @return The text boundary returned from the above methods, first or last * text boundary. * @see #next() * @see #next(int) * @see #previous() * @see #first() * @see #last() * @see #following(int) * @see #preceding(int) */ public abstract int current(); /** {@collect.stats} * Get the text being scanned * @return the text being scanned */ public abstract CharacterIterator getText(); /** {@collect.stats} * Set a new text string to be scanned. The current scan * position is reset to first(). * @param newText new text to scan. */ public void setText(String newText) { setText(new StringCharacterIterator(newText)); } /** {@collect.stats} * Set a new text for scanning. The current scan * position is reset to first(). * @param newText new text to scan. */ public abstract void setText(CharacterIterator newText); private static final int CHARACTER_INDEX = 0; private static final int WORD_INDEX = 1; private static final int LINE_INDEX = 2; private static final int SENTENCE_INDEX = 3; private static final SoftReference[] iterCache = new SoftReference[4]; /** {@collect.stats} * Returns a new <code>BreakIterator</code> instance * for <a href="#word">word breaks</a> * for the {@linkplain Locale#getDefault() default locale}. * @return A break iterator for word breaks */ public static BreakIterator getWordInstance() { return getWordInstance(Locale.getDefault()); } /** {@collect.stats} * Returns a new <code>BreakIterator</code> instance * for <a href="#word">word breaks</a> * for the given locale. * @param locale the desired locale * @return A break iterator for word breaks * @exception NullPointerException if <code>locale</code> is null */ public static BreakIterator getWordInstance(Locale locale) { return getBreakInstance(locale, WORD_INDEX, "WordData", "WordDictionary"); } /** {@collect.stats} * Returns a new <code>BreakIterator</code> instance * for <a href="#line">line breaks</a> * for the {@linkplain Locale#getDefault() default locale}. * @return A break iterator for line breaks */ public static BreakIterator getLineInstance() { return getLineInstance(Locale.getDefault()); } /** {@collect.stats} * Returns a new <code>BreakIterator</code> instance * for <a href="#line">line breaks</a> * for the given locale. * @param locale the desired locale * @return A break iterator for line breaks * @exception NullPointerException if <code>locale</code> is null */ public static BreakIterator getLineInstance(Locale locale) { return getBreakInstance(locale, LINE_INDEX, "LineData", "LineDictionary"); } /** {@collect.stats} * Returns a new <code>BreakIterator</code> instance * for <a href="#character">character breaks</a> * for the {@linkplain Locale#getDefault() default locale}. * @return A break iterator for character breaks */ public static BreakIterator getCharacterInstance() { return getCharacterInstance(Locale.getDefault()); } /** {@collect.stats} * Returns a new <code>BreakIterator</code> instance * for <a href="#character">character breaks</a> * for the given locale. * @param locale the desired locale * @return A break iterator for character breaks * @exception NullPointerException if <code>locale</code> is null */ public static BreakIterator getCharacterInstance(Locale locale) { return getBreakInstance(locale, CHARACTER_INDEX, "CharacterData", "CharacterDictionary"); } /** {@collect.stats} * Returns a new <code>BreakIterator</code> instance * for <a href="#sentence">sentence breaks</a> * for the {@linkplain Locale#getDefault() default locale}. * @return A break iterator for sentence breaks */ public static BreakIterator getSentenceInstance() { return getSentenceInstance(Locale.getDefault()); } /** {@collect.stats} * Returns a new <code>BreakIterator</code> instance * for <a href="#sentence">sentence breaks</a> * for the given locale. * @param locale the desired locale * @return A break iterator for sentence breaks * @exception NullPointerException if <code>locale</code> is null */ public static BreakIterator getSentenceInstance(Locale locale) { return getBreakInstance(locale, SENTENCE_INDEX, "SentenceData", "SentenceDictionary"); } private static BreakIterator getBreakInstance(Locale locale, int type, String dataName, String dictionaryName) { if (iterCache[type] != null) { BreakIteratorCache cache = (BreakIteratorCache) iterCache[type].get(); if (cache != null) { if (cache.getLocale().equals(locale)) { return cache.createBreakInstance(); } } } BreakIterator result = createBreakInstance(locale, type, dataName, dictionaryName); BreakIteratorCache cache = new BreakIteratorCache(locale, result); iterCache[type] = new SoftReference(cache); return result; } private static ResourceBundle getBundle(final String baseName, final Locale locale) { return (ResourceBundle) AccessController.doPrivileged(new PrivilegedAction() { public Object run() { return ResourceBundle.getBundle(baseName, locale); } }); } private static BreakIterator createBreakInstance(Locale locale, int type, String dataName, String dictionaryName) { // Check whether a provider can provide an implementation that's closer // to the requested locale than what the Java runtime itself can provide. LocaleServiceProviderPool pool = LocaleServiceProviderPool.getPool(BreakIteratorProvider.class); if (pool.hasProviders()) { BreakIterator providersInstance = pool.getLocalizedObject( BreakIteratorGetter.INSTANCE, locale, type); if (providersInstance != null) { return providersInstance; } } ResourceBundle bundle = getBundle( "sun.text.resources.BreakIteratorInfo", locale); String[] classNames = bundle.getStringArray("BreakIteratorClasses"); String dataFile = bundle.getString(dataName); try { if (classNames[type].equals("RuleBasedBreakIterator")) { return new RuleBasedBreakIterator(dataFile); } else if (classNames[type].equals("DictionaryBasedBreakIterator")) { String dictionaryFile = bundle.getString(dictionaryName); return new DictionaryBasedBreakIterator(dataFile, dictionaryFile); } else { throw new IllegalArgumentException("Invalid break iterator class \"" + classNames[type] + "\""); } } catch (Exception e) { throw new InternalError(e.toString()); } } /** {@collect.stats} * Returns an array of all locales for which the * <code>get*Instance</code> methods of this class can return * localized instances. * The returned array represents the union of locales supported by the Java * runtime and by installed * {@link java.text.spi.BreakIteratorProvider BreakIteratorProvider} implementations. * It must contain at least a <code>Locale</code> * instance equal to {@link java.util.Locale#US Locale.US}. * * @return An array of locales for which localized * <code>BreakIterator</code> instances are available. */ public static synchronized Locale[] getAvailableLocales() { LocaleServiceProviderPool pool = LocaleServiceProviderPool.getPool(BreakIteratorProvider.class); return pool.getAvailableLocales(); } private static final class BreakIteratorCache { private BreakIterator iter; private Locale locale; BreakIteratorCache(Locale locale, BreakIterator iter) { this.locale = locale; this.iter = (BreakIterator) iter.clone(); } Locale getLocale() { return locale; } BreakIterator createBreakInstance() { return (BreakIterator) iter.clone(); } } static long getLong(byte[] buf, int offset) { long num = buf[offset]&0xFF; for (int i = 1; i < 8; i++) { num = num<<8 | (buf[offset+i]&0xFF); } return num; } static int getInt(byte[] buf, int offset) { int num = buf[offset]&0xFF; for (int i = 1; i < 4; i++) { num = num<<8 | (buf[offset+i]&0xFF); } return num; } static short getShort(byte[] buf, int offset) { short num = (short)(buf[offset]&0xFF); num = (short)(num<<8 | (buf[offset+1]&0xFF)); return num; } /** {@collect.stats} * Obtains a BreakIterator instance from a BreakIteratorProvider * implementation. */ private static class BreakIteratorGetter implements LocaleServiceProviderPool.LocalizedObjectGetter<BreakIteratorProvider, BreakIterator> { private static final BreakIteratorGetter INSTANCE = new BreakIteratorGetter(); public BreakIterator getObject(BreakIteratorProvider breakIteratorProvider, Locale locale, String key, Object... params) { assert params.length == 1; switch ((Integer)params[0]) { case CHARACTER_INDEX: return breakIteratorProvider.getCharacterInstance(locale); case WORD_INDEX: return breakIteratorProvider.getWordInstance(locale); case LINE_INDEX: return breakIteratorProvider.getLineInstance(locale); case SENTENCE_INDEX: return breakIteratorProvider.getSentenceInstance(locale); default: assert false : "should not happen"; } return null; } } }