/** * diqube: Distributed Query Base. * * Copyright (C) 2015 Bastian Gloeckle * * This file is part of diqube. * * diqube is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package org.diqube.data.types.str.dict; /** * Utility class for handling tries (see {@link TrieNode}). * * @author Bastian Gloeckle */ public class TrieUtil { /** * Inspects a trie that is built by the given node and finds the {@link TerminalNode#getTerminalId()} of a specific * value. * * @param str * The value to be searched. * @param fromIndex * The number of leading characters to ignore in str. If this is > 0 then the corresponding characters are * not inspected in this method, but it pretends that the str array starts at the given index. * @param curNode * The node that starts the trie. * @return If >= 0, then it is the ID of {@link TerminalNode} that corresponds to the searched value. If < 0, then the * searched value is not contained in the trie. The result is then -(ip +1), where <code>ip</code> is the * insertion point at which the searched value should be inserted (-> ip = ID of next greater terminal id). */ public static long findIdOfValue(char[] str, int fromIndex, TrieNode<?> curNode) { if (curNode instanceof TerminalNode) { if (fromIndex == str.length) return ((TerminalNode) curNode).getTerminalId(); // we are at a terminal node but not all of our input string was matched -> string is not contained. // Insert Point would be the "ID of the terminal node we found +1" (as the terminal node is < than our string). return -(((TerminalNode) curNode).getTerminalId() + 2); } ParentNode parent = (ParentNode) curNode; // TODO #6 do not compare to all children, but do some sort of binary search. for (int i = 0; i < parent.getChildNodes().length; i++) { char[] subSeq = parent.getChildChars()[i]; int compareRes = compareChars(subSeq, str, fromIndex); if (compareRes == 0) { // matched whole subSeq, continue recursively. return findIdOfValue(str, fromIndex + subSeq.length, parent.getChildNodes()[i]); } if (compareRes > 0) { // as soon as the subSeq gets greater than the searched string, we can stop, as subSeq are sorted! // find insertion point of the searched string, as it is not contained in the dict. long insertionPoint; if (parent.getChildNodes()[i] instanceof TerminalNode) insertionPoint = ((TerminalNode) parent.getChildNodes()[i]).getTerminalId(); else insertionPoint = ((ParentNode) parent.getChildNodes()[i]).getMinId(); return -(insertionPoint + 1); } } // we get here only, if all child nodes chars were < than the searched string. // -> InsertionPoint would be "lastChild.getMaxId() + 1". long insertionPoint; int lastChild = parent.getChildNodes().length - 1; if (parent.getChildNodes()[lastChild] instanceof TerminalNode) insertionPoint = ((TerminalNode) parent.getChildNodes()[lastChild]).getTerminalId() + 1; else insertionPoint = ((ParentNode) parent.getChildNodes()[lastChild]).getMaxId() + 1; return -(insertionPoint + 1); } /** * Compare two character arrays with at most comparing array1.length number of characters. * * @param array1 * The first character array that will be inspected * @param array2 * The second character array that will be inspected * @param array2StartIdx * If the second array should not be inspected from the beginning, this denotes the offset to start at. * @return <code>0</code> if all values of array1 are at the beginning of array2 (= they are equal up to the length of * array1). Return value is positive if array1 is bigger than array2, the result value is then (eq+1) where * <code>eq</code> is the number of equal characters found before identifying a character that was greater in * array1. Return value is negative, if array1 is smaller than array2, the result value is then -(eq+1) where * <code>eq</code> is the number of equal characters found before identifying a character that was smaller in * array1. */ public static int compareChars(char[] array1, char[] array2, int array2StartIdx) { if (array1.length == 0 && array2.length - array2StartIdx > 0) return -1; for (int i = 0; i < array1.length; i++) { if (array2StartIdx + i >= array2.length) return i + 1; if (array1[i] != array2[array2StartIdx + i]) { if (array1[i] < array2[array2StartIdx + i]) return -(i + 1); return i + 1; } } return 0; } }