/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.util.automaton; import java.util.Arrays; import java.util.ArrayList; import java.util.List; // TODO // - do we really need the .bits...? if not we can make util in UnicodeUtil to convert 1 char into a BytesRef /** * Converts UTF-32 automata to the equivalent UTF-8 representation. * @lucene.internal */ public final class UTF32ToUTF8 { // Unicode boundaries for UTF8 bytes 1,2,3,4 private static final int[] startCodes = new int[] {0, 128, 2048, 65536}; private static final int[] endCodes = new int[] {127, 2047, 65535, 1114111}; static int[] MASKS = new int[32]; static { int v = 2; for(int i=0;i<32;i++) { MASKS[i] = v-1; v *= 2; } } // Represents one of the N utf8 bytes that (in sequence) // define a code point. value is the byte value; bits is // how many bits are "used" by utf8 at that byte private static class UTF8Byte { int value; // TODO: change to byte byte bits; } // Holds a single code point, as a sequence of 1-4 utf8 bytes: // TODO: maybe move to UnicodeUtil? private static class UTF8Sequence { private final UTF8Byte[] bytes; private int len; public UTF8Sequence() { bytes = new UTF8Byte[4]; for(int i=0;i<4;i++) { bytes[i] = new UTF8Byte(); } } public int byteAt(int idx) { return bytes[idx].value; } public int numBits(int idx) { return bytes[idx].bits; } private void set(int code) { if (code < 128) { // 0xxxxxxx bytes[0].value = code; bytes[0].bits = 7; len = 1; } else if (code < 2048) { // 110yyyxx 10xxxxxx bytes[0].value = (6 << 5) | (code >> 6); bytes[0].bits = 5; setRest(code, 1); len = 2; } else if (code < 65536) { // 1110yyyy 10yyyyxx 10xxxxxx bytes[0].value = (14 << 4) | (code >> 12); bytes[0].bits = 4; setRest(code, 2); len = 3; } else { // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx bytes[0].value = (30 << 3) | (code >> 18); bytes[0].bits = 3; setRest(code, 3); len = 4; } } private void setRest(int code, int numBytes) { for(int i=0;i<numBytes;i++) { bytes[numBytes-i].value = 128 | (code & MASKS[5]); bytes[numBytes-i].bits = 6; code = code >> 6; } } @Override public String toString() { StringBuilder b = new StringBuilder(); for(int i=0;i<len;i++) { if (i > 0) { b.append(' '); } b.append(Integer.toBinaryString(bytes[i].value)); } return b.toString(); } } /** Sole constructor. */ public UTF32ToUTF8() { } private final UTF8Sequence startUTF8 = new UTF8Sequence(); private final UTF8Sequence endUTF8 = new UTF8Sequence(); private final UTF8Sequence tmpUTF8a = new UTF8Sequence(); private final UTF8Sequence tmpUTF8b = new UTF8Sequence(); // Builds necessary utf8 edges between start & end void convertOneEdge(int start, int end, int startCodePoint, int endCodePoint) { startUTF8.set(startCodePoint); endUTF8.set(endCodePoint); build(start, end, startUTF8, endUTF8, 0); } private void build(int start, int end, UTF8Sequence startUTF8, UTF8Sequence endUTF8, int upto) { // Break into start, middle, end: if (startUTF8.byteAt(upto) == endUTF8.byteAt(upto)) { // Degen case: lead with the same byte: if (upto == startUTF8.len-1 && upto == endUTF8.len-1) { // Super degen: just single edge, one UTF8 byte: utf8.addTransition(start, end, startUTF8.byteAt(upto), endUTF8.byteAt(upto)); return; } else { assert startUTF8.len > upto+1; assert endUTF8.len > upto+1; int n = utf8.createState(); // Single value leading edge utf8.addTransition(start, n, startUTF8.byteAt(upto)); //start.addTransition(new Transition(startUTF8.byteAt(upto), n)); // type=single // Recurse for the rest build(n, end, startUTF8, endUTF8, 1+upto); } } else if (startUTF8.len == endUTF8.len) { if (upto == startUTF8.len-1) { //start.addTransition(new Transition(startUTF8.byteAt(upto), endUTF8.byteAt(upto), end)); // type=startend utf8.addTransition(start, end, startUTF8.byteAt(upto), endUTF8.byteAt(upto)); } else { start(start, end, startUTF8, upto, false); if (endUTF8.byteAt(upto) - startUTF8.byteAt(upto) > 1) { // There is a middle all(start, end, startUTF8.byteAt(upto)+1, endUTF8.byteAt(upto)-1, startUTF8.len-upto-1); } end(start, end, endUTF8, upto, false); } } else { // start start(start, end, startUTF8, upto, true); // possibly middle, spanning multiple num bytes int byteCount = 1+startUTF8.len-upto; final int limit = endUTF8.len-upto; while (byteCount < limit) { // wasteful: we only need first byte, and, we should // statically encode this first byte: tmpUTF8a.set(startCodes[byteCount-1]); tmpUTF8b.set(endCodes[byteCount-1]); all(start, end, tmpUTF8a.byteAt(0), tmpUTF8b.byteAt(0), tmpUTF8a.len - 1); byteCount++; } // end end(start, end, endUTF8, upto, true); } } private void start(int start, int end, UTF8Sequence startUTF8, int upto, boolean doAll) { if (upto == startUTF8.len-1) { // Done recursing utf8.addTransition(start, end, startUTF8.byteAt(upto), startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto)-1]); // type=start //start.addTransition(new Transition(startUTF8.byteAt(upto), startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto)-1], end)); // type=start } else { int n = utf8.createState(); utf8.addTransition(start, n, startUTF8.byteAt(upto)); //start.addTransition(new Transition(startUTF8.byteAt(upto), n)); // type=start start(n, end, startUTF8, 1+upto, true); int endCode = startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto)-1]; if (doAll && startUTF8.byteAt(upto) != endCode) { all(start, end, startUTF8.byteAt(upto)+1, endCode, startUTF8.len-upto-1); } } } private void end(int start, int end, UTF8Sequence endUTF8, int upto, boolean doAll) { if (upto == endUTF8.len-1) { // Done recursing //start.addTransition(new Transition(endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)-1]), endUTF8.byteAt(upto), end)); // type=end utf8.addTransition(start, end, endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)-1]), endUTF8.byteAt(upto)); } else { final int startCode; if (endUTF8.numBits(upto) == 5) { // special case -- avoid created unused edges (endUTF8 // doesn't accept certain byte sequences) -- there // are other cases we could optimize too: startCode = 194; } else { startCode = endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)-1]); } if (doAll && endUTF8.byteAt(upto) != startCode) { all(start, end, startCode, endUTF8.byteAt(upto)-1, endUTF8.len-upto-1); } int n = utf8.createState(); //start.addTransition(new Transition(endUTF8.byteAt(upto), n)); // type=end utf8.addTransition(start, n, endUTF8.byteAt(upto)); end(n, end, endUTF8, 1+upto, true); } } private void all(int start, int end, int startCode, int endCode, int left) { if (left == 0) { //start.addTransition(new Transition(startCode, endCode, end)); // type=all utf8.addTransition(start, end, startCode, endCode); } else { int lastN = utf8.createState(); //start.addTransition(new Transition(startCode, endCode, lastN)); // type=all utf8.addTransition(start, lastN, startCode, endCode); while (left > 1) { int n = utf8.createState(); //lastN.addTransition(new Transition(128, 191, n)); // type=all* utf8.addTransition(lastN, n, 128, 191); // type=all* left--; lastN = n; } //lastN.addTransition(new Transition(128, 191, end)); // type = all* utf8.addTransition(lastN, end, 128, 191); // type = all* } } Automaton.Builder utf8; /** Converts an incoming utf32 automaton to an equivalent * utf8 one. The incoming automaton need not be * deterministic. Note that the returned automaton will * not in general be deterministic, so you must * determinize it if that's needed. */ public Automaton convert(Automaton utf32) { if (utf32.getNumStates() == 0) { return utf32; } int[] map = new int[utf32.getNumStates()]; Arrays.fill(map, -1); List<Integer> pending = new ArrayList<>(); int utf32State = 0; pending.add(utf32State); utf8 = new Automaton.Builder(); int utf8State = utf8.createState(); utf8.setAccept(utf8State, utf32.isAccept(utf32State)); map[utf32State] = utf8State; Transition scratch = new Transition(); while (pending.size() != 0) { utf32State = pending.remove(pending.size()-1); utf8State = map[utf32State]; assert utf8State != -1; int numTransitions = utf32.getNumTransitions(utf32State); utf32.initTransition(utf32State, scratch); for(int i=0;i<numTransitions;i++) { utf32.getNextTransition(scratch); int destUTF32 = scratch.dest; int destUTF8 = map[destUTF32]; if (destUTF8 == -1) { destUTF8 = utf8.createState(); utf8.setAccept(destUTF8, utf32.isAccept(destUTF32)); map[destUTF32] = destUTF8; pending.add(destUTF32); } // Writes new transitions into pendingTransitions: convertOneEdge(utf8State, destUTF8, scratch.min, scratch.max); } } return utf8.finish(); } }