/*
* Copyright (C) 2014 Indeed Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.indeed.imhotep.service;
import com.google.common.base.Charsets;
import com.indeed.imhotep.api.FTGSIterator;
import com.indeed.imhotep.api.RawFTGSIterator;
import com.indeed.util.io.VIntUtils;
import java.io.Closeable;
import java.io.IOException;
import java.io.OutputStream;
/**
 * Writes a stream of fields, terms, groups and stats to an {@link OutputStream}.
 *
 * <p>Layout produced by this class, in call order:
 * <ul>
 *   <li>field start: one byte ({@code 1} for an int field, {@code 2} otherwise), the UTF-8 length
 *       of the field name via {@code VIntUtils.writeVInt64}, then the name bytes;</li>
 *   <li>int term: the delta from the previous term of the field (the previous term starts at
 *       {@code -1}); string term: (bytes to drop from the previous term) + 1, the length of the
 *       new suffix, then the suffix bytes; either form is followed by the term doc frequency via
 *       {@code VIntUtils.writeSVInt64};</li>
 *   <li>group: the delta from the previous group id of the term (the previous id starts at
 *       {@code -1}), followed by whatever the caller emits through {@link #addStat(long)};</li>
 *   <li>end of a written term: a single {@code 0} byte;</li>
 *   <li>end of field: a {@code 0} byte, plus a second {@code 0} byte for non-int fields;</li>
 *   <li>end of stream: a {@code 0} byte.</li>
 * </ul>
 *
 * <p>A term is written lazily on its first {@link #switchGroup(int)} call, so a term that never
 * receives a group produces no output at all.
 */
public final class FTGSOutputStreamWriter implements Closeable {
    private final OutputStream out;

    private boolean fieldIsIntType;

    // Bytes of the last string term actually written; used for prefix compression.
    private byte[] previousTermBytes = new byte[100];
    private int previousTermLength;

    // Pending string term, copied from the caller's buffer in switchBytesTerm().
    private byte[] currentTermBytes = new byte[100];
    private int currentTermLength;

    private long previousTermInt;
    private long currentTermInt;

    private long currentTermDocFreq;

    // True once the pending term's header has been emitted by writeTerm().
    private boolean termWritten;
    private boolean fieldWritten = false;

    private int previousGroupId = -1;

    // Guards close() so that repeated calls do not emit the end-of-stream markers again.
    private boolean closed = false;

    /**
     * @param out destination stream; it is flushed by {@link #close()} but never closed by this class
     */
    public FTGSOutputStreamWriter(final OutputStream out) {
        this.out = out;
    }

    /**
     * Terminates the current field (if any) and writes the header of a new one.
     *
     * @param field     field name, written as UTF-8
     * @param isIntType whether terms of this field are supplied through {@link #switchIntTerm}
     * @throws IOException if the underlying stream fails
     */
    public void switchField(String field, boolean isIntType) throws IOException {
        endField();
        fieldIsIntType = isIntType;
        startField(fieldIsIntType, field, out);
        fieldWritten = true;
        // Reset the delta / prefix baselines for the new field.
        previousTermLength = 0;
        previousTermInt = -1;
    }

    /**
     * Terminates the current term (if written) and stages a new string term. The bytes are copied,
     * so the caller may reuse {@code termBytes} afterwards.
     *
     * @param termBytes   buffer holding the term
     * @param termLength  number of valid bytes at the start of {@code termBytes}
     * @param termDocFreq document frequency stored with the term
     * @throws IOException if the underlying stream fails
     */
    public void switchBytesTerm(byte[] termBytes, int termLength, long termDocFreq) throws IOException {
        endTerm();
        currentTermBytes = copyInto(termBytes, termLength, currentTermBytes);
        currentTermLength = termLength;
        currentTermDocFreq = termDocFreq;
    }

    /**
     * Terminates the current term (if written) and stages a new int term.
     *
     * @param term        term value
     * @param termDocFreq document frequency stored with the term
     * @throws IOException if the underlying stream fails
     */
    public void switchIntTerm(long term, long termDocFreq) throws IOException {
        endTerm();
        currentTermInt = term;
        currentTermDocFreq = termDocFreq;
    }

    /**
     * Writes the staged term if it has not been written yet, then the delta between
     * {@code groupId} and the previously written group id of this term.
     *
     * @param groupId group id to write
     * @throws IOException if the underlying stream fails
     */
    public void switchGroup(int groupId) throws IOException {
        if (!termWritten) {
            writeTerm();
        }
        // Widen before subtracting: previousGroupId starts at -1, so an int subtraction would
        // overflow for groupId == Integer.MAX_VALUE.
        writeVLong((long) groupId - previousGroupId, out);
        previousGroupId = groupId;
    }

    /** Emits the staged term (delta or prefix encoded against the previous term) and its doc frequency. */
    private void writeTerm() throws IOException {
        if (fieldIsIntType) {
            if (previousTermInt == -1 && currentTermInt == previousTermInt) {
                //still decodes to 0 but allows reader to distinguish between end of field and delta of zero
                out.write(0x80);
                out.write(0);
            } else {
                writeVLong(currentTermInt - previousTermInt, out);
            }
            previousTermInt = currentTermInt;
        } else {
            final int pLen = prefixLen(previousTermBytes, currentTermBytes, Math.min(previousTermLength, currentTermLength));
            // The +1 keeps this value non-zero; a lone 0 byte is what endField() writes.
            writeVLong((previousTermLength - pLen) + 1, out);
            writeVLong(currentTermLength - pLen, out);
            out.write(currentTermBytes, pLen, currentTermLength - pLen);
            previousTermBytes = copyInto(currentTermBytes, currentTermLength, previousTermBytes);
            previousTermLength = currentTermLength;
        }
        writeSVLong(currentTermDocFreq, out);
        termWritten = true;
    }

    /**
     * Writes a single stat value via {@code VIntUtils.writeSVInt64}.
     *
     * @param stat value to write
     * @throws IOException if the underlying stream fails
     */
    public void addStat(long stat) throws IOException {
        writeSVLong(stat, out);
    }

    /**
     * Terminates the current field (if any), writes the end-of-stream marker and flushes the
     * underlying stream. The underlying stream itself is not closed. Calling this method again
     * has no effect, as required by {@link Closeable#close()}; this matters because
     * {@link #write(FTGSIterator, int)} already closes the writer when it finishes.
     *
     * @throws IOException if the underlying stream fails
     */
    @Override
    public void close() throws IOException {
        if (closed) {
            return;
        }
        closed = true;
        endField();
        out.write(0);
        out.flush();
    }

    /** Writes the end-of-field marker; a no-op until the first field has been started. */
    private void endField() throws IOException {
        if (!fieldWritten) {
            return;
        }
        endTerm();
        out.write(0);
        if (!fieldIsIntType) {
            out.write(0);
        }
    }

    /** Writes the end-of-term marker if the term was written, then resets the per-term state. */
    private void endTerm() throws IOException {
        if (termWritten) {
            out.write(0);
        }
        termWritten = false;
        previousGroupId = -1;
    }

    /**
     * Convenience wrapper: serializes {@code buffer} to {@code out} with a fresh writer.
     *
     * @param buffer   iterator to drain
     * @param numStats number of stats read per group
     * @param out      destination stream
     * @throws IOException if the underlying stream fails
     */
    public static void write(final FTGSIterator buffer, final int numStats, final OutputStream out) throws IOException {
        final FTGSOutputStreamWriter writer = new FTGSOutputStreamWriter(out);
        writer.write(buffer, numStats);
    }

    /**
     * Drains {@code buffer} into this writer and then calls {@link #close()}.
     *
     * <p>When {@code buffer} is a {@link RawFTGSIterator}, string terms are taken from
     * {@code termStringBytes()} / {@code termStringLength()}; otherwise they are obtained by
     * UTF-8 encoding {@code termStringVal()}.
     *
     * @param buffer   iterator to drain
     * @param numStats number of stats read per group
     * @throws IOException if the underlying stream fails
     */
    public void write(FTGSIterator buffer, int numStats) throws IOException {
        final long[] stats = new long[numStats];
        // Non-null only when the iterator can hand out term bytes directly.
        final RawFTGSIterator rawBuffer =
                (buffer instanceof RawFTGSIterator) ? (RawFTGSIterator) buffer : null;
        while (buffer.nextField()) {
            final boolean isIntField = buffer.fieldIsIntType();
            switchField(buffer.fieldName(), isIntField);
            while (buffer.nextTerm()) {
                if (isIntField) {
                    switchIntTerm(buffer.termIntVal(), buffer.termDocFreq());
                } else if (rawBuffer != null) {
                    // termStringBytes() returns a reference so this copies the bytes instead of hanging on to it
                    switchBytesTerm(rawBuffer.termStringBytes(), rawBuffer.termStringLength(), buffer.termDocFreq());
                } else {
                    final byte[] bytes = buffer.termStringVal().getBytes(Charsets.UTF_8);
                    switchBytesTerm(bytes, bytes.length, buffer.termDocFreq());
                }
                while (buffer.nextGroup()) {
                    switchGroup(buffer.group());
                    buffer.groupStats(stats);
                    for (final long stat : stats) {
                        addStat(stat);
                    }
                }
                endTerm();
            }
        }
        close();
    }

    private static void writeVLong(long i, final OutputStream out) throws IOException {
        VIntUtils.writeVInt64(out, i);
    }

    private static void writeSVLong(long i, final OutputStream out) throws IOException {
        VIntUtils.writeSVInt64(out, i);
    }

    /** Writes the field type byte (1 = int, 2 = other) followed by the length-prefixed UTF-8 field name. */
    private static void startField(boolean isIntType, String field, final OutputStream out) throws IOException {
        if (isIntType) {
            out.write(1);
        } else {
            out.write(2);
        }
        final byte[] fieldBytes = field.getBytes(Charsets.UTF_8);
        writeVLong(fieldBytes.length, out);
        out.write(fieldBytes);
    }

    /**
     * Copies the first {@code srcLen} bytes of {@code src} into {@code dest}, replacing
     * {@code dest} with a larger array when it is too small.
     *
     * @return the array that now holds the copy; callers must use it in place of {@code dest}
     */
    private static byte[] copyInto(final byte[] src, final int srcLen, byte[] dest) {
        dest = ensureCap(dest, srcLen);
        System.arraycopy(src, 0, dest, 0, srcLen);
        return dest;
    }

    /**
     * Returns {@code b} if it can hold {@code len} bytes, otherwise a new, larger array.
     * The old contents are NOT carried over; the only caller overwrites them.
     */
    private static byte[] ensureCap(final byte[] b, final int len) {
        if (b == null) {
            return new byte[Math.max(len, 16)];
        }
        if (b.length >= len) {
            return b;
        }
        return new byte[Math.max(b.length * 2, len)];
    }

    /** Returns the length of the common prefix of {@code a} and {@code b}, examining at most {@code max} bytes. */
    private static int prefixLen(final byte[] a, final byte[] b, final int max) {
        for (int i = 0; i < max; i++) {
            if (a[i] != b[i]) {
                return i;
            }
        }
        return max;
    }
}