package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Random;

import org.apache.lucene.util.Version;
import org.apache.lucene.util._TestUtil;

/**
 * Testcase for {@link CharTokenizer} subclasses
 */
public class TestCharTokenizers extends BaseTokenStreamTestCase {

  /*
   * test to read surrogate pairs without losing the pairing
   * if the surrogate pair is at the border of the internal IO buffer
   */
  public void testReadSupplementaryChars() throws IOException {
    StringBuilder builder = new StringBuilder();
    Random newRandom = newRandom();
    // create random input
    int num = 1024 + newRandom.nextInt(1024);
    num *= RANDOM_MULTIPLIER;
    for (int i = 1; i < num; i++) {
      builder.append("\ud801\udc1cabc");
      if ((i % 10) == 0)
        builder.append(" ");
    }
    // the internal buffer size is 1024; make sure we have a surrogate pair right at the border
    builder.insert(1023, "\ud801\udc1c");
    MockTokenizer tokenizer = new MockTokenizer(new StringReader(builder.toString()), MockTokenizer.SIMPLE, true);
    assertTokenStreamContents(tokenizer, builder.toString().toLowerCase().split(" "));
  }
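  /*
   * Illustrative sanity sketch, not part of the original test (the method
   * name is hypothetical): a supplementary codepoint such as U+1041C (a
   * Deseret capital letter) occupies two chars, so a char-based
   * isTokenChar(char) can only ever see a lone surrogate, while the
   * int-based isTokenChar(int) sees the whole letter.
   */
  public void testSupplementaryCodepointSpansTwoChars() {
    String s = "\ud801\udc1c";                        // U+1041C as a surrogate pair
    assertEquals(2, s.length());                      // two chars...
    assertEquals(1, s.codePointCount(0, s.length())); // ...but a single codepoint
    assertTrue(Character.isLetter(s.codePointAt(0))); // codepoint-based check: a letter
    assertFalse(Character.isLetter(s.charAt(0)));     // char-based check: a lone surrogate is not a letter
  }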
  /*
   * test that the TermAttribute buffer is extended correctly. If the internal
   * algorithm that grows the char array extends it by only one char and the
   * next char to be filled in is a supplementary codepoint (using 2 chars),
   * an index out of bounds exception is triggered.
   */
  public void testExtendCharBuffer() throws IOException {
    for (int i = 0; i < 40; i++) {
      StringBuilder builder = new StringBuilder();
      for (int j = 0; j < 1 + i; j++) {
        builder.append("a");
      }
      builder.append("\ud801\udc1cabc");
      MockTokenizer tokenizer = new MockTokenizer(new StringReader(builder.toString()), MockTokenizer.SIMPLE, true);
      assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase()});
    }
  }

  /*
   * tests the max word length of 255 - the tokenizer will split at the 255th char no matter what happens
   */
  public void testMaxWordLength() throws IOException {
    StringBuilder builder = new StringBuilder();
    for (int i = 0; i < 255; i++) {
      builder.append("A");
    }
    MockTokenizer tokenizer = new MockTokenizer(new StringReader(builder.toString() + builder.toString()), MockTokenizer.SIMPLE, true);
    assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(), builder.toString().toLowerCase()});
  }

  /*
   * tests the max word length of 255 with a surrogate pair at position 255
   */
  public void testMaxWordLengthWithSupplementary() throws IOException {
    StringBuilder builder = new StringBuilder();
    for (int i = 0; i < 254; i++) {
      builder.append("A");
    }
    builder.append("\ud801\udc1c");
    MockTokenizer tokenizer = new MockTokenizer(new StringReader(builder.toString() + builder.toString()), MockTokenizer.SIMPLE, true);
    assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(), builder.toString().toLowerCase()});
  }

  public void testIsTokenCharCharInSubclass() {
    new TestingCharTokenizer(Version.LUCENE_30, new StringReader(""));
    try {
      new TestingCharTokenizer(TEST_VERSION_CURRENT, new StringReader(""));
      fail("version 3.1 is not permitted if char based method is implemented");
    } catch (IllegalArgumentException e) {
      // expected
    }
  }

  public void testNormalizeCharInSubclass() {
    new TestingCharTokenizerNormalize(Version.LUCENE_30, new StringReader(""));
    try {
      new TestingCharTokenizerNormalize(TEST_VERSION_CURRENT, new StringReader(""));
      fail("version 3.1 is not permitted if char based method is implemented");
    } catch (IllegalArgumentException e) {
      // expected
    }
  }

  public void testNormalizeAndIsTokenCharCharInSubclass() {
    new TestingCharTokenizerNormalizeIsTokenChar(Version.LUCENE_30, new StringReader(""));
    try {
      new TestingCharTokenizerNormalizeIsTokenChar(TEST_VERSION_CURRENT, new StringReader(""));
      fail("version 3.1 is not permitted if char based method is implemented");
    } catch (IllegalArgumentException e) {
      // expected
    }
  }
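  /*
   * Minimal sketch (the class name IntOnlyCharTokenizer is hypothetical, not
   * part of the original test): a subclass that overrides only the int-based
   * (codepoint) hooks is accepted for TEST_VERSION_CURRENT, in contrast to
   * the deprecated char-based overrides in the subclasses below, which the
   * tests above expect to be rejected with an IllegalArgumentException.
   */
  static final class IntOnlyCharTokenizer extends CharTokenizer {
    public IntOnlyCharTokenizer(Version matchVersion, Reader input) {
      super(matchVersion, input);
    }

    @Override
    protected boolean isTokenChar(int c) {
      return Character.isLetter(c); // operates on full codepoints, including supplementary ones
    }

    @Override
    protected int normalize(int c) {
      return Character.toLowerCase(c); // codepoint-aware lowercasing
    }
  }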
  static final class TestingCharTokenizer extends CharTokenizer {
    public TestingCharTokenizer(Version matchVersion, Reader input) {
      super(matchVersion, input);
    }

    @Override
    protected boolean isTokenChar(int c) {
      return Character.isLetter(c);
    }

    @Deprecated
    @Override
    protected boolean isTokenChar(char c) {
      return Character.isLetter(c);
    }
  }

  static final class TestingCharTokenizerNormalize extends CharTokenizer {
    public TestingCharTokenizerNormalize(Version matchVersion, Reader input) {
      super(matchVersion, input);
    }

    @Deprecated
    @Override
    protected char normalize(char c) {
      return c;
    }

    @Override
    protected int normalize(int c) {
      return c;
    }
  }

  static final class TestingCharTokenizerNormalizeIsTokenChar extends CharTokenizer {
    public TestingCharTokenizerNormalizeIsTokenChar(Version matchVersion, Reader input) {
      super(matchVersion, input);
    }

    @Deprecated
    @Override
    protected char normalize(char c) {
      return c;
    }

    @Override
    protected int normalize(int c) {
      return c;
    }

    @Override
    protected boolean isTokenChar(int c) {
      return Character.isLetter(c);
    }

    @Deprecated
    @Override
    protected boolean isTokenChar(char c) {
      return Character.isLetter(c);
    }
  }
}