/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.util;

import java.io.IOException;
import java.io.StringReader;
import java.util.Locale;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.TestUtil;

/**
 * Test case for {@link CharTokenizer} subclasses.
 */
public class TestCharTokenizers extends BaseTokenStreamTestCase {

  /*
   * test to read surrogate pairs without losing the pairing
   * if the surrogate pair is at the border of the internal IO buffer
   */
  public void testReadSupplementaryChars() throws IOException {
    StringBuilder builder = new StringBuilder();
    // create random input
    int num = 1024 + random().nextInt(1024);
    num *= RANDOM_MULTIPLIER;
    for (int i = 1; i < num; i++) {
      builder.append("\ud801\udc1cabc");
      if ((i % 10) == 0) {
        builder.append(" ");
      }
    }
    // internal buffer size is 1024; make sure we have a surrogate pair right at the border
    builder.insert(1023, "\ud801\udc1c");
    Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory());
    tokenizer.setReader(new StringReader(builder.toString()));
    assertTokenStreamContents(tokenizer, builder.toString().toLowerCase(Locale.ROOT).split(" "));
  }

  /*
   * test to extend the TermAttribute buffer internally. If the internal
   * algorithm that extends the size of the char array only extends by 1 char and the
   * next char to be filled in is a supplementary codepoint (using 2 chars) an
   * index out of bounds exception is triggered.
   */
  public void testExtendCharBuffer() throws IOException {
    for (int i = 0; i < 40; i++) {
      StringBuilder builder = new StringBuilder();
      for (int j = 0; j < 1 + i; j++) {
        builder.append("a");
      }
      builder.append("\ud801\udc1cabc");
      Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory());
      tokenizer.setReader(new StringReader(builder.toString()));
      assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(Locale.ROOT)});
    }
  }

  /*
   * tests the max word length of 255 - tokenizer will split at the 255th char no matter what happens
   */
  public void testMaxWordLength() throws IOException {
    StringBuilder builder = new StringBuilder();
    for (int i = 0; i < 255; i++) {
      builder.append("A");
    }
    Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory());
    tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
    assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)});
  }

  /*
   * tests the max word length of 255 with a surrogate pair at position 255
   */
  public void testMaxWordLengthWithSupplementary() throws IOException {
    StringBuilder builder = new StringBuilder();
    for (int i = 0; i < 254; i++) {
      builder.append("A");
    }
    builder.append("\ud801\udc1c");
    Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory());
    tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
    assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)});
  }

  // LUCENE-3642: normalize SMP->BMP and check that offsets are correct
  public void testCrossPlaneNormalization() throws IOException {
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory()) {
          @Override
          protected int normalize(int c) {
            if (c > 0xffff) {
              return 'δ';
            } else {
              return c;
            }
          }
        };
        return new TokenStreamComponents(tokenizer, tokenizer);
      }
    };
    int num = 1000 * RANDOM_MULTIPLIER;
    for (int i = 0; i < num; i++) {
      String s = TestUtil.randomUnicodeString(random());
      try (TokenStream ts = analyzer.tokenStream("foo", s)) {
        ts.reset();
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        while (ts.incrementToken()) {
          String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
          for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) {
            cp = highlightedText.codePointAt(j);
            assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp));
          }
        }
        ts.end();
      }
    }
    // just for fun
    checkRandomData(random(), analyzer, num);
    analyzer.close();
  }

  // LUCENE-3642: normalize BMP->SMP and check that offsets are correct
  public void testCrossPlaneNormalization2() throws IOException {
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory()) {
          @Override
          protected int normalize(int c) {
            if (c <= 0xffff) {
              return 0x1043C;
            } else {
              return c;
            }
          }
        };
        return new TokenStreamComponents(tokenizer, tokenizer);
      }
    };
    int num = 1000 * RANDOM_MULTIPLIER;
    for (int i = 0; i < num; i++) {
      String s = TestUtil.randomUnicodeString(random());
      try (TokenStream ts = analyzer.tokenStream("foo", s)) {
        ts.reset();
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        while (ts.incrementToken()) {
          String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
          for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) {
            cp = highlightedText.codePointAt(j);
            assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp));
          }
        }
        ts.end();
      }
    }
    // just for fun
    checkRandomData(random(), analyzer, num);
    analyzer.close();
  }

  // defines a CharTokenizer via a method reference used as the separator predicate
  public void testDefinitionUsingMethodReference1() throws Exception {
    final StringReader reader = new StringReader("Tokenizer Test");
    final Tokenizer tokenizer = CharTokenizer.fromSeparatorCharPredicate(Character::isWhitespace);
    tokenizer.setReader(reader);
    assertTokenStreamContents(tokenizer, new String[] { "Tokenizer", "Test" });
  }

  // defines a CharTokenizer via a token-char predicate and a normalizer method reference
  public void testDefinitionUsingMethodReference2() throws Exception {
    final StringReader reader = new StringReader("Tokenizer(Test)");
    final Tokenizer tokenizer = CharTokenizer.fromTokenCharPredicate(Character::isLetter, Character::toUpperCase);
    tokenizer.setReader(reader);
    assertTokenStreamContents(tokenizer, new String[] { "TOKENIZER", "TEST" });
  }

  // defines a CharTokenizer via a lambda separator predicate and a lowercasing normalizer
  public void testDefinitionUsingLambda() throws Exception {
    final StringReader reader = new StringReader("Tokenizer\u00A0Test Foo");
    final Tokenizer tokenizer = CharTokenizer.fromSeparatorCharPredicate(c -> c == '\u00A0' || Character.isWhitespace(c), Character::toLowerCase);
    tokenizer.setReader(reader);
    assertTokenStreamContents(tokenizer, new String[] { "tokenizer", "test", "foo" });
  }
}