/* * Copyright (C) 2014 Indeed Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except * in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed under the * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either * express or implied. See the License for the specific language governing permissions and * limitations under the License. */ package com.indeed.imhotep; import com.google.common.base.Charsets; import com.indeed.imhotep.api.FTGSIterator; import com.indeed.imhotep.api.RawFTGSIterator; import javax.annotation.Nullable; import java.io.Closeable; import java.nio.ByteBuffer; import java.nio.charset.CharacterCodingException; import java.nio.charset.CharsetDecoder; import java.util.Arrays; import java.util.Collection; /** * @author jsgroth */ public final class RawFTGSMerger extends AbstractFTGSMerger implements RawFTGSIterator { private final CharsetDecoder decoder = Charsets.UTF_8.newDecoder(); private final RawFTGSIterator[] rawIteratorRefs; private byte[] currentTermBytes; private ByteBuffer byteBuffer; private int currentTermLength; private String termStringVal; public RawFTGSMerger(Collection<? extends RawFTGSIterator> iterators, int numStats, @Nullable Closeable doneCallback) { super(iterators, numStats, doneCallback); rawIteratorRefs = iterators.toArray(new RawFTGSIterator[iterators.size()]); currentTermBytes = new byte[100]; byteBuffer = ByteBuffer.wrap(currentTermBytes); currentTermLength = 0; termStringVal = null; } @Override public String termStringVal() { if (termStringVal == null) { try { termStringVal = decoder.decode((ByteBuffer)byteBuffer.position(0).limit(currentTermLength)).toString(); } catch (CharacterCodingException e) { throw new RuntimeException(e); } } return termStringVal; } @Override public byte[] termStringBytes() { return currentTermBytes; } @Override public int termStringLength() { return currentTermLength; } @Override public boolean nextTerm() { for (int i = 0; i < numTermIterators; ++i) { final FTGSIterator itr = iterators[termIterators[i]]; if (!itr.nextTerm()) { final int fi = termIteratorIndexes[i]; swap(fieldIterators, fi, --numFieldIterators); for (int j = 0; j < numTermIterators; ++j) { if (termIteratorIndexes[j] == numFieldIterators) { termIteratorIndexes[j] = fi; } } } } numTermIterators = 0; if (numFieldIterators == 0) return false; int newNumTermIterators = 0; if (fieldIsIntType) { long min = Long.MAX_VALUE; for (int i = 0; i < numFieldIterators; ++i) { final FTGSIterator itr = iterators[fieldIterators[i]]; final long term = itr.termIntVal(); if (term < min) { newNumTermIterators = 1; termIteratorIndexes[0] = i; min = term; } else if (term == min) { termIteratorIndexes[newNumTermIterators++] = i; } } termIntVal = min; } else { byte[] minTermBytes = null; int minTermLength = -1; for (int i = 0; i < numFieldIterators; ++i) { final RawFTGSIterator itr = rawIteratorRefs[fieldIterators[i]]; final byte[] termBytes = itr.termStringBytes(); final int termLength = itr.termStringLength(); final int c; if (minTermBytes == null || (c = compareBytes(termBytes, termLength, minTermBytes, minTermLength)) < 0) { newNumTermIterators = 1; termIteratorIndexes[0] = i; minTermBytes = termBytes; minTermLength = termLength; } else if (c == 0) { termIteratorIndexes[newNumTermIterators++] = i; } } if (currentTermBytes.length < minTermLength) { currentTermBytes = Arrays.copyOf(minTermBytes, Math.max(minTermLength, 2 * currentTermBytes.length)); byteBuffer = ByteBuffer.wrap(currentTermBytes); } else { System.arraycopy(minTermBytes, 0, currentTermBytes, 0, minTermLength); } currentTermLength = minTermLength; termStringVal = null; } for (int i = 0; i < newNumTermIterators; ++i) { final int fi = termIteratorIndexes[i]; final int index = fieldIterators[fi]; termIterators[numTermIterators] = index; termIteratorIndexes[numTermIterators++] = fi; } termIteratorsRemaining = numTermIterators; for (int i = 0; i < termIteratorsRemaining; ++i) { final FTGSIterator itr = iterators[termIterators[i]]; if (!itr.nextGroup()) { swap(termIterators, i, --termIteratorsRemaining); swap(termIteratorIndexes, i, termIteratorsRemaining); --i; } } accumulatedVec.reset(); return true; } // this is a comparison of UTF-8 bytes that is wrong in the same way String.compareTo(String) is wrong public static int compareBytes(final byte[] b1, final int l1, final byte[] b2, final int l2) { for (int i = 0, j = 0; i < l1 && j < l2; i++, j++) { final int v1 = b1[i] & 0xFF; final int v2 = b2[j] & 0xFF; if (v1 != v2) { if (((v1 & 0xF0) == 0xF0 || (v2 & 0xF0) == 0xF0) && ((v1 & 0xF0) != (v2 & 0xF0))) { if ((v1 & 0xF0) == 0xF0) return UTF8ToCodePoint(v2, b2, j + 1, l2) > 0xDFFF ? -1 : 1; return UTF8ToCodePoint(v1, b1, i + 1, l1) > 0xDFFF ? 1 : -1; } return v1 - v2; } } return l1 - l2; } static int UTF8ToCodePoint(final int firstByte, final byte[] b, int off, final int len) { if (firstByte < 128) return firstByte; int cp; final int cpl; if ((firstByte & 0x20) == 0) { cpl = 1; cp = (firstByte & 0x1F) << 6; } else if ((firstByte & 0x10) == 0) { cpl = 2; cp = (firstByte & 0x0F) << 12; } else if ((firstByte & 0x08) == 0) { cpl = 3; cp = (firstByte & 0x07) << 18; } else if ((firstByte & 0x04) == 0) { cpl = 4; cp = (firstByte & 0x03) << 24; } else if ((firstByte & 0x02) == 0) { cpl = 5; cp = (firstByte & 0x01) << 30; } else { throw new RuntimeException("invalid UTF-8"); } if (off + cpl > len) throw new RuntimeException("invalid UTF-8"); for (int k = cpl - 1; k >= 0; --k) { cp |= ((b[off++] & 0x3F) << (6*k)); } return cp; } }