package org.apache.lucene.util;

import java.io.IOException;
import java.io.Reader;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * {@link CharacterUtils} provides a unified interface to Character-related
 * operations to implement backwards compatible character operations based on a
 * {@link Version} instance.
 *
 * @lucene.internal
 */
public abstract class CharacterUtils {
  private static final Java4CharacterUtils JAVA_4 = new Java4CharacterUtils();
  private static final Java5CharacterUtils JAVA_5 = new Java5CharacterUtils();

  /**
   * Returns a {@link CharacterUtils} implementation according to the given
   * {@link Version} instance.
   *
   * @param matchVersion
   *          a version instance
   * @return the supplementary-character-aware implementation if the given
   *         version is on or after {@link Version#LUCENE_31}, otherwise the
   *         implementation that treats every <code>char</code> as one code
   *         point.
   */
  public static CharacterUtils getInstance(final Version matchVersion) {
    return matchVersion.onOrAfter(Version.LUCENE_31) ? JAVA_5 : JAVA_4;
  }

  /**
   * Returns the code point at the given index of the char array.
   * Depending on the {@link Version} passed to
   * {@link CharacterUtils#getInstance(Version)} this method mimics the behavior
   * of {@link Character#codePointAt(char[], int)} as it would have been
   * available on a Java 1.4 JVM or on a later virtual machine version.
   *
   * @param chars
   *          a character array
   * @param offset
   *          the offset to the char values in the chars array to be converted
   *
   * @return the Unicode code point at the given index
   * @throws NullPointerException
   *           - if the array is null.
   * @throws IndexOutOfBoundsException
   *           - if the value offset is negative or not less than the length of
   *           the char array.
   */
  public abstract int codePointAt(final char[] chars, final int offset);

  /**
   * Returns the code point at the given index of the {@link CharSequence}.
   * Depending on the {@link Version} passed to
   * {@link CharacterUtils#getInstance(Version)} this method mimics the behavior
   * of {@link Character#codePointAt(CharSequence, int)} as it would have been
   * available on a Java 1.4 JVM or on a later virtual machine version.
   *
   * @param seq
   *          a character sequence
   * @param offset
   *          the offset to the char values in the sequence to be converted
   *
   * @return the Unicode code point at the given index
   * @throws NullPointerException
   *           - if the sequence is null.
   * @throws IndexOutOfBoundsException
   *           - if the value offset is negative or not less than the length of
   *           the character sequence.
   */
  public abstract int codePointAt(final CharSequence seq, final int offset);

  /**
   * Returns the code point at the given index of the char array where only
   * elements with index less than the limit are used.
   * Depending on the {@link Version} passed to
   * {@link CharacterUtils#getInstance(Version)} this method mimics the behavior
   * of {@link Character#codePointAt(char[], int, int)} as it would have been
   * available on a Java 1.4 JVM or on a later virtual machine version.
   *
   * @param chars
   *          a character array
   * @param offset
   *          the offset to the char values in the chars array to be converted
   * @param limit
   *          the index after the last element that should be used to calculate
   *          the code point.
   *
   * @return the Unicode code point at the given index
   * @throws NullPointerException
   *           - if the array is null.
   * @throws IndexOutOfBoundsException
   *           - if the value offset is negative or not less than the limit.
   */
  public abstract int codePointAt(final char[] chars, final int offset, final int limit);

  /**
   * Creates a new {@link CharacterBuffer} and allocates a <code>char[]</code>
   * of the given bufferSize.
   *
   * @param bufferSize
   *          the internal char buffer size, must be <code>&gt;= 2</code>
   * @return a new {@link CharacterBuffer} instance.
   * @throws IllegalArgumentException
   *           if bufferSize is less than 2.
   */
  public static CharacterBuffer newCharacterBuffer(final int bufferSize) {
    if (bufferSize < 2) {
      throw new IllegalArgumentException("buffersize must be >= 2");
    }
    return new CharacterBuffer(new char[bufferSize], 0, 0);
  }

  /**
   * Fills the {@link CharacterBuffer} with characters read from the given
   * reader {@link Reader}. Each call issues a single read on the reader and
   * starts filling the buffer from offset <code>0</code>, using at most the
   * size of the internal character array.
   * <p>
   * Depending on the {@link Version} passed to
   * {@link CharacterUtils#getInstance(Version)} this method implements
   * supplementary character awareness when filling the given buffer. For all
   * {@link Version} &gt; 3.0 {@link #fill(CharacterBuffer, Reader)} guarantees
   * that the given {@link CharacterBuffer} will never contain a high surrogate
   * character as the last element in the buffer unless it is the only
   * character in the buffer. In other words, high and low surrogate pairs will
   * always be preserved across buffer borders.
   * </p>
   *
   * @param buffer
   *          the buffer to fill.
   * @param reader
   *          the reader to read characters from.
   * @return <code>false</code> if and only if the reader signalled end of
   *         stream and no characters were placed into the buffer (the buffer
   *         length is then <code>0</code>), otherwise <code>true</code>.
   * @throws IOException
   *           if the reader throws an {@link IOException}.
   */
  public abstract boolean fill(CharacterBuffer buffer, Reader reader) throws IOException;

  /**
   * Supplementary-character-aware implementation that delegates to the
   * <code>java.lang.Character</code> code point methods.
   */
  private static final class Java5CharacterUtils extends CharacterUtils {
    Java5CharacterUtils() {
    }

    @Override
    public final int codePointAt(final char[] chars, final int offset) {
      return Character.codePointAt(chars, offset);
    }

    @Override
    public int codePointAt(final CharSequence seq, final int offset) {
      return Character.codePointAt(seq, offset);
    }

    @Override
    public int codePointAt(final char[] chars, final int offset, final int limit) {
      return Character.codePointAt(chars, offset, limit);
    }

    @Override
    public boolean fill(final CharacterBuffer buffer, final Reader reader) throws IOException {
      final char[] charBuffer = buffer.buffer;
      buffer.offset = 0;
      // Re-insert the high surrogate held back by the previous call (if any)
      // at the front of the buffer; a value of 0 means nothing was held back.
      charBuffer[0] = buffer.lastTrailingHighSurrogate;
      final int offset = buffer.lastTrailingHighSurrogate == 0 ? 0 : 1;
      buffer.lastTrailingHighSurrogate = 0;
      final int read = reader.read(charBuffer, offset, charBuffer.length - offset);
      if (read == -1) {
        // End of stream: the only possible content is the carried-over
        // surrogate, which must still be handed to the caller.
        buffer.length = offset;
        return offset != 0;
      }
      buffer.length = read + offset;
      // Hold back a trailing high surrogate so that its low surrogate ends up
      // in the same buffer on the next call. This is skipped when it is the
      // only char in the buffer, otherwise the buffer would be left empty.
      if (buffer.length > 1
          && Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
        buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length];
      }
      return true;
    }
  }

  /**
   * Implementation without supplementary character support: every
   * <code>char</code> is returned as its own code point.
   */
  private static final class Java4CharacterUtils extends CharacterUtils {
    Java4CharacterUtils() {
    }

    @Override
    public final int codePointAt(final char[] chars, final int offset) {
      return chars[offset];
    }

    @Override
    public int codePointAt(final CharSequence seq, final int offset) {
      return seq.charAt(offset);
    }

    @Override
    public int codePointAt(final char[] chars, final int offset, final int limit) {
      if (offset >= limit) {
        throw new IndexOutOfBoundsException("offset must be less than limit");
      }
      return chars[offset];
    }

    @Override
    public boolean fill(final CharacterBuffer buffer, final Reader reader) throws IOException {
      buffer.offset = 0;
      final int read = reader.read(buffer.buffer);
      if (read == -1) {
        // End of stream: report an empty buffer instead of leaving the length
        // of the previous fill in place, matching the Java 5 implementation.
        buffer.length = 0;
        return false;
      }
      buffer.length = read;
      return true;
    }
  }

  /**
   * A simple IO buffer to use with
   * {@link CharacterUtils#fill(CharacterBuffer, Reader)}.
   */
  public static final class CharacterBuffer {

    private final char[] buffer;
    private int offset;
    private int length;
    // High surrogate held back by the supplementary-aware fill until the next
    // call; 0 means none is pending.
    private char lastTrailingHighSurrogate = 0;

    CharacterBuffer(char[] buffer, int offset, int length) {
      this.buffer = buffer;
      this.offset = offset;
      this.length = length;
    }

    /**
     * Returns the internal buffer
     *
     * @return the buffer
     */
    public char[] getBuffer() {
      return buffer;
    }

    /**
     * Returns the data offset in the internal buffer.
     *
     * @return the offset
     */
    public int getOffset() {
      return offset;
    }

    /**
     * Return the length of the data in the internal buffer starting at
     * {@link #getOffset()}
     *
     * @return the length
     */
    public int getLength() {
      return length;
    }

    /**
     * Resets the CharacterBuffer. Offset, length and any pending trailing
     * high surrogate are reset to their default values; the contents of the
     * internal char array are left untouched.
     */
    public void reset() {
      offset = 0;
      length = 0;
      lastTrailingHighSurrogate = 0;
    }
  }

}