/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.facet.taxonomy.writercache; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.IOException; import java.io.OutputStream; import java.nio.file.Files; import java.nio.file.Path; import java.util.Iterator; import org.apache.lucene.facet.taxonomy.FacetLabel; /** * This is a very efficient LabelToOrdinal implementation that uses a * CharBlockArray to store all labels and a configurable number of HashArrays to * reference the labels. * <p> * Since the HashArrays don't handle collisions, a {@link CollisionMap} is used * to store the colliding labels. * <p> * This data structure grows by adding a new HashArray whenever the number of * collisions in the {@link CollisionMap} exceeds {@code loadFactor} * * {@link #getMaxOrdinal()}. Growing also includes reinserting all colliding * labels into the HashArrays to possibly reduce the number of collisions. * * For setting the {@code loadFactor} see * {@link #CompactLabelToOrdinal(int, float, int)}. * * <p> * This data structure has a much lower memory footprint (~30%) compared to a * Java HashMap<String, Integer>. It also only uses a small fraction of objects * a HashMap would use, thus limiting the GC overhead. Ingestion speed was also * ~50% faster compared to a HashMap for 3M unique labels. * * @lucene.experimental */ public class CompactLabelToOrdinal extends LabelToOrdinal { /** Default maximum load factor. */ public static final float DefaultLoadFactor = 0.15f; static final char TERMINATOR_CHAR = 0xffff; private static final int COLLISION = -5; private HashArray[] hashArrays; private CollisionMap collisionMap; private CharBlockArray labelRepository; private int capacity; private int threshold; private float loadFactor; /** How many labels. */ public int sizeOfMap() { return this.collisionMap.size(); } private CompactLabelToOrdinal() { } /** Sole constructor. */ public CompactLabelToOrdinal(int initialCapacity, float loadFactor, int numHashArrays) { this.hashArrays = new HashArray[numHashArrays]; this.capacity = determineCapacity((int) Math.pow(2, numHashArrays), initialCapacity); init(); this.collisionMap = new CollisionMap(this.labelRepository); this.counter = 0; this.loadFactor = loadFactor; this.threshold = (int) (this.loadFactor * this.capacity); } static int determineCapacity(int minCapacity, int initialCapacity) { int capacity = minCapacity; while (capacity < initialCapacity) { capacity <<= 1; } return capacity; } private void init() { labelRepository = new CharBlockArray(); CategoryPathUtils.serialize(new FacetLabel(), labelRepository); int c = this.capacity; for (int i = 0; i < this.hashArrays.length; i++) { this.hashArrays[i] = new HashArray(c); c /= 2; } } @Override public void addLabel(FacetLabel label, int ordinal) { if (collisionMap.size() > threshold) { grow(); } int hash = CompactLabelToOrdinal.stringHashCode(label); for (int i = 0; i < this.hashArrays.length; i++) { if (addLabel(this.hashArrays[i], label, hash, ordinal)) { return; } } int prevVal = collisionMap.addLabel(label, hash, ordinal); if (prevVal != ordinal) { throw new IllegalArgumentException("Label already exists: " + label + " prev ordinal " + prevVal); } } @Override public int getOrdinal(FacetLabel label) { if (label == null) { return LabelToOrdinal.INVALID_ORDINAL; } int hash = CompactLabelToOrdinal.stringHashCode(label); for (int i = 0; i < this.hashArrays.length; i++) { int ord = getOrdinal(this.hashArrays[i], label, hash); if (ord != COLLISION) { return ord; } } return this.collisionMap.get(label, hash); } private void grow() { HashArray temp = this.hashArrays[this.hashArrays.length - 1]; for (int i = this.hashArrays.length - 1; i > 0; i--) { this.hashArrays[i] = this.hashArrays[i - 1]; } this.capacity *= 2; this.hashArrays[0] = new HashArray(this.capacity); for (int i = 1; i < this.hashArrays.length; i++) { int[] sourceOffsetArray = this.hashArrays[i].offsets; int[] sourceCidsArray = this.hashArrays[i].cids; for (int k = 0; k < sourceOffsetArray.length; k++) { for (int j = 0; j < i && sourceOffsetArray[k] != 0; j++) { int[] targetOffsetArray = this.hashArrays[j].offsets; int[] targetCidsArray = this.hashArrays[j].cids; int newIndex = indexFor(stringHashCode( this.labelRepository, sourceOffsetArray[k]), targetOffsetArray.length); if (targetOffsetArray[newIndex] == 0) { targetOffsetArray[newIndex] = sourceOffsetArray[k]; targetCidsArray[newIndex] = sourceCidsArray[k]; sourceOffsetArray[k] = 0; } } } } for (int i = 0; i < temp.offsets.length; i++) { int offset = temp.offsets[i]; if (offset > 0) { int hash = stringHashCode(this.labelRepository, offset); addLabelOffset(hash, temp.cids[i], offset); } } CollisionMap oldCollisionMap = this.collisionMap; this.collisionMap = new CollisionMap(oldCollisionMap.capacity(), this.labelRepository); this.threshold = (int) (this.capacity * this.loadFactor); Iterator<CollisionMap.Entry> it = oldCollisionMap.entryIterator(); while (it.hasNext()) { CollisionMap.Entry e = it.next(); addLabelOffset(stringHashCode(this.labelRepository, e.offset), e.cid, e.offset); } } private boolean addLabel(HashArray a, FacetLabel label, int hash, int ordinal) { int index = CompactLabelToOrdinal.indexFor(hash, a.offsets.length); int offset = a.offsets[index]; if (offset == 0) { a.offsets[index] = this.labelRepository.length(); CategoryPathUtils.serialize(label, labelRepository); a.cids[index] = ordinal; return true; } return false; } private void addLabelOffset(int hash, int cid, int knownOffset) { for (int i = 0; i < this.hashArrays.length; i++) { if (addLabelOffsetToHashArray(this.hashArrays[i], hash, cid, knownOffset)) { return; } } this.collisionMap.addLabelOffset(hash, knownOffset, cid); if (this.collisionMap.size() > this.threshold) { grow(); } } private boolean addLabelOffsetToHashArray(HashArray a, int hash, int ordinal, int knownOffset) { int index = CompactLabelToOrdinal.indexFor(hash, a.offsets.length); int offset = a.offsets[index]; if (offset == 0) { a.offsets[index] = knownOffset; a.cids[index] = ordinal; return true; } return false; } private int getOrdinal(HashArray a, FacetLabel label, int hash) { if (label == null) { return LabelToOrdinal.INVALID_ORDINAL; } int index = indexFor(hash, a.offsets.length); int offset = a.offsets[index]; if (offset == 0) { return LabelToOrdinal.INVALID_ORDINAL; } if (CategoryPathUtils.equalsToSerialized(label, labelRepository, offset)) { return a.cids[index]; } return COLLISION; } /** Returns index for hash code h. */ static int indexFor(int h, int length) { return h & (length - 1); } // static int stringHashCode(String label) { // int len = label.length(); // int hash = 0; // int i; // for (i = 0; i < len; ++i) // hash = 33 * hash + label.charAt(i); // // hash = hash ^ ((hash >>> 20) ^ (hash >>> 12)); // hash = hash ^ (hash >>> 7) ^ (hash >>> 4); // // return hash; // // } static int stringHashCode(FacetLabel label) { int hash = label.hashCode(); hash = hash ^ ((hash >>> 20) ^ (hash >>> 12)); hash = hash ^ (hash >>> 7) ^ (hash >>> 4); return hash; } static int stringHashCode(CharBlockArray labelRepository, int offset) { int hash = CategoryPathUtils.hashCodeOfSerialized(labelRepository, offset); hash = hash ^ ((hash >>> 20) ^ (hash >>> 12)); hash = hash ^ (hash >>> 7) ^ (hash >>> 4); return hash; } // public static boolean equals(CharSequence label, CharBlockArray array, // int offset) { // // CONTINUE HERE // int len = label.length(); // int bi = array.blockIndex(offset); // CharBlockArray.Block b = array.blocks.get(bi); // int index = array.indexInBlock(offset); // // for (int i = 0; i < len; i++) { // if (label.charAt(i) != b.chars[index]) { // return false; // } // index++; // if (index == b.length) { // b = array.blocks.get(++bi); // index = 0; // } // } // // return b.chars[index] == TerminatorChar; // } /** * Returns an estimate of the amount of memory used by this table. Called only in * this package. Memory is consumed mainly by three structures: the hash arrays, * label repository and collision map. */ int getMemoryUsage() { int memoryUsage = 0; if (this.hashArrays != null) { // HashArray capacity is instance-specific. for (HashArray ha : this.hashArrays) { // Each has 2 capacity-length arrays of ints. memoryUsage += ( ha.capacity * 2 * 4 ) + 4; } } if (this.labelRepository != null) { // All blocks are the same size. int blockSize = this.labelRepository.blockSize; // Each block has room for blockSize UTF-16 chars. int actualBlockSize = ( blockSize * 2 ) + 4; memoryUsage += this.labelRepository.blocks.size() * actualBlockSize; memoryUsage += 8; // Two int values for array as a whole. } if (this.collisionMap != null) { memoryUsage += this.collisionMap.getMemoryUsage(); } return memoryUsage; } /** * Opens the file and reloads the CompactLabelToOrdinal. The file it expects * is generated from the {@link #flush(Path)} command. */ static CompactLabelToOrdinal open(Path file, float loadFactor, int numHashArrays) throws IOException { /** * Part of the file is the labelRepository, which needs to be rehashed * and label offsets re-added to the object. I am unsure as to why we * can't just store these off in the file as well, but in keeping with * the spirit of the original code, I did it this way. (ssuppe) */ CompactLabelToOrdinal l2o = new CompactLabelToOrdinal(); l2o.loadFactor = loadFactor; l2o.hashArrays = new HashArray[numHashArrays]; DataInputStream dis = null; try { dis = new DataInputStream(new BufferedInputStream( Files.newInputStream(file))); // TaxiReader needs to load the "counter" or occupancy (L2O) to know // the next unique facet. we used to load the delimiter too, but // never used it. l2o.counter = dis.readInt(); l2o.capacity = determineCapacity((int) Math.pow(2, l2o.hashArrays.length), l2o.counter); l2o.init(); // now read the chars l2o.labelRepository = CharBlockArray.open(dis); l2o.collisionMap = new CollisionMap(l2o.labelRepository); // Calculate hash on the fly based on how CategoryPath hashes // itself. Maybe in the future we can call some static based methods // in CategoryPath so that this doesn't break again? I don't like // having code in two different places... int cid = 0; // Skip the initial offset, it's the CategoryPath(0,0), which isn't // a hashed value. int offset = 1; int lastStartOffset = offset; // This loop really relies on a well-formed input (assumes pretty blindly // that array offsets will work). Since the initial file is machine // generated, I think this should be OK. while (offset < l2o.labelRepository.length()) { // identical code to CategoryPath.hashFromSerialized. since we need to // advance offset, we cannot call the method directly. perhaps if we // could pass a mutable Integer or something... int length = (short) l2o.labelRepository.charAt(offset++); int hash = length; if (length != 0) { for (int i = 0; i < length; i++) { int len = (short) l2o.labelRepository.charAt(offset++); hash = hash * 31 + l2o.labelRepository.subSequence(offset, offset + len).hashCode(); offset += len; } } // Now that we've hashed the components of the label, do the // final part of the hash algorithm. hash = hash ^ ((hash >>> 20) ^ (hash >>> 12)); hash = hash ^ (hash >>> 7) ^ (hash >>> 4); // Add the label, and let's keep going l2o.addLabelOffset(hash, cid, lastStartOffset); cid++; lastStartOffset = offset; } } catch (ClassNotFoundException cnfe) { throw new IOException("Invalid file format. Cannot deserialize."); } finally { if (dis != null) { dis.close(); } } l2o.threshold = (int) (l2o.loadFactor * l2o.capacity); return l2o; } void flush(Path file) throws IOException { OutputStream fos = Files.newOutputStream(file); try { BufferedOutputStream os = new BufferedOutputStream(fos); DataOutputStream dos = new DataOutputStream(os); dos.writeInt(this.counter); // write the labelRepository this.labelRepository.flush(dos); // Closes the data output stream dos.close(); } finally { fos.close(); } } private static final class HashArray { int[] offsets; int[] cids; int capacity; HashArray(int c) { this.capacity = c; this.offsets = new int[this.capacity]; this.cids = new int[this.capacity]; } } }