/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.util.automaton; import java.nio.charset.StandardCharsets; import java.util.HashSet; import java.util.Random; import java.util.Set; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.TestUtil; import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.fst.Util; public class TestUTF32ToUTF8 extends LuceneTestCase { @Override public void setUp() throws Exception { super.setUp(); } private static final int MAX_UNICODE = 0x10FFFF; private boolean matches(ByteRunAutomaton a, int code) { char[] chars = Character.toChars(code); byte[] b = new byte[UnicodeUtil.maxUTF8Length(chars.length)]; final int len = UnicodeUtil.UTF16toUTF8(chars, 0, chars.length, b); return a.run(b, 0, len); } private void testOne(Random r, ByteRunAutomaton a, int startCode, int endCode, int iters) { // Verify correct ints are accepted final int nonSurrogateCount; final boolean ovSurStart; if (endCode < UnicodeUtil.UNI_SUR_HIGH_START || startCode > UnicodeUtil.UNI_SUR_LOW_END) { // no overlap w/ surrogates nonSurrogateCount = endCode - startCode + 1; ovSurStart = false; } else if (isSurrogate(startCode)) { // start of range overlaps surrogates nonSurrogateCount = endCode - startCode + 1 - (UnicodeUtil.UNI_SUR_LOW_END - startCode + 1); ovSurStart = false; } else if (isSurrogate(endCode)) { // end of range overlaps surrogates ovSurStart = true; nonSurrogateCount = endCode - startCode + 1 - (endCode - UnicodeUtil.UNI_SUR_HIGH_START + 1); } else { // range completely subsumes surrogates ovSurStart = true; nonSurrogateCount = endCode - startCode + 1 - (UnicodeUtil.UNI_SUR_LOW_END - UnicodeUtil.UNI_SUR_HIGH_START + 1); } assert nonSurrogateCount > 0; for(int iter=0;iter<iters;iter++) { // pick random code point in-range int code = startCode + r.nextInt(nonSurrogateCount); if (isSurrogate(code)) { if (ovSurStart) { code = UnicodeUtil.UNI_SUR_LOW_END + 1 + (code - UnicodeUtil.UNI_SUR_HIGH_START); } else { code = UnicodeUtil.UNI_SUR_LOW_END + 1 + (code - startCode); } } assert code >= startCode && code <= endCode: "code=" + code + " start=" + startCode + " end=" + endCode; assert !isSurrogate(code); assertTrue("DFA for range " + startCode + "-" + endCode + " failed to match code=" + code, matches(a, code)); } // Verify invalid ints are not accepted final int invalidRange = MAX_UNICODE - (endCode - startCode + 1); if (invalidRange > 0) { for(int iter=0;iter<iters;iter++) { int x = TestUtil.nextInt(r, 0, invalidRange - 1); final int code; if (x >= startCode) { code = endCode + 1 + x - startCode; } else { code = x; } if ((code >= UnicodeUtil.UNI_SUR_HIGH_START && code <= UnicodeUtil.UNI_SUR_HIGH_END) | (code >= UnicodeUtil.UNI_SUR_LOW_START && code <= UnicodeUtil.UNI_SUR_LOW_END)) { iter--; continue; } assertFalse("DFA for range " + startCode + "-" + endCode + " matched invalid code=" + code, matches(a, code)); } } } // Evenly picks random code point from the 4 "buckets" // (bucket = same #bytes when encoded to utf8) private int getCodeStart(Random r) { switch(r.nextInt(4)) { case 0: return TestUtil.nextInt(r, 0, 128); case 1: return TestUtil.nextInt(r, 128, 2048); case 2: return TestUtil.nextInt(r, 2048, 65536); default: return TestUtil.nextInt(r, 65536, 1 + MAX_UNICODE); } } private static boolean isSurrogate(int code) { return code >= UnicodeUtil.UNI_SUR_HIGH_START && code <= UnicodeUtil.UNI_SUR_LOW_END; } public void testRandomRanges() throws Exception { final Random r = random(); int ITERS = atLeast(10); int ITERS_PER_DFA = atLeast(100); for(int iter=0;iter<ITERS;iter++) { int x1 = getCodeStart(r); int x2 = getCodeStart(r); final int startCode, endCode; if (x1 < x2) { startCode = x1; endCode = x2; } else { startCode = x2; endCode = x1; } if (isSurrogate(startCode) && isSurrogate(endCode)) { iter--; continue; } Automaton a = Automata.makeCharRange(startCode, endCode); testOne(r, new ByteRunAutomaton(a), startCode, endCode, ITERS_PER_DFA); } } public void testSpecialCase() { RegExp re = new RegExp(".?"); Automaton automaton = re.toAutomaton(); CharacterRunAutomaton cra = new CharacterRunAutomaton(automaton); ByteRunAutomaton bra = new ByteRunAutomaton(automaton); // make sure character dfa accepts empty string assertTrue(cra.isAccept(0)); assertTrue(cra.run("")); assertTrue(cra.run(new char[0], 0, 0)); // make sure byte dfa accepts empty string assertTrue(bra.isAccept(0)); assertTrue(bra.run(new byte[0], 0, 0)); } public void testSpecialCase2() throws Exception { RegExp re = new RegExp(".+\u0775"); String input = "\ufadc\ufffd\ub80b\uda5a\udc68\uf234\u0056\uda5b\udcc1\ufffd\ufffd\u0775"; Automaton automaton = re.toAutomaton(); CharacterRunAutomaton cra = new CharacterRunAutomaton(automaton); ByteRunAutomaton bra = new ByteRunAutomaton(automaton); assertTrue(cra.run(input)); byte[] bytes = input.getBytes(StandardCharsets.UTF_8); assertTrue(bra.run(bytes, 0, bytes.length)); // this one fails! } public void testSpecialCase3() throws Exception { RegExp re = new RegExp("(\\鯺)*(.)*\\Ӕ"); String input = "\u5cfd\ufffd\ub2f7\u0033\ue304\u51d7\u3692\udb50\udfb3\u0576\udae2\udc62\u0053\u0449\u04d4"; Automaton automaton = re.toAutomaton(); CharacterRunAutomaton cra = new CharacterRunAutomaton(automaton); ByteRunAutomaton bra = new ByteRunAutomaton(automaton); assertTrue(cra.run(input)); byte[] bytes = input.getBytes(StandardCharsets.UTF_8); assertTrue(bra.run(bytes, 0, bytes.length)); } public void testRandomRegexes() throws Exception { int num = atLeast(250); for (int i = 0; i < num; i++) { assertAutomaton(new RegExp(AutomatonTestUtil.randomRegexp(random()), RegExp.NONE).toAutomaton()); } } public void testSingleton() throws Exception { int iters = atLeast(100); for(int iter=0;iter<iters;iter++) { String s = TestUtil.randomRealisticUnicodeString(random()); Automaton a = Automata.makeString(s); Automaton utf8 = new UTF32ToUTF8().convert(a); IntsRefBuilder ints = new IntsRefBuilder(); Util.toIntsRef(new BytesRef(s), ints); Set<IntsRef> set = new HashSet<>(); set.add(ints.get()); assertEquals(set, TestOperations.getFiniteStrings(utf8)); } } private void assertAutomaton(Automaton automaton) throws Exception { CharacterRunAutomaton cra = new CharacterRunAutomaton(automaton); ByteRunAutomaton bra = new ByteRunAutomaton(automaton); final AutomatonTestUtil.RandomAcceptedStrings ras = new AutomatonTestUtil.RandomAcceptedStrings(automaton); int num = atLeast(1000); for (int i = 0; i < num; i++) { final String string; if (random().nextBoolean()) { // likely not accepted string = TestUtil.randomUnicodeString(random()); } else { // will be accepted int[] codepoints = ras.getRandomAcceptedString(random()); try { string = UnicodeUtil.newString(codepoints, 0, codepoints.length); } catch (Exception e) { System.out.println(codepoints.length + " codepoints:"); for(int j=0;j<codepoints.length;j++) { System.out.println(" " + Integer.toHexString(codepoints[j])); } throw e; } } byte bytes[] = string.getBytes(StandardCharsets.UTF_8); assertEquals(cra.run(string), bra.run(bytes, 0, bytes.length)); } } }