/*
* Copyright (C) 2014 Indeed Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.indeed.imhotep;
import com.google.common.base.Charsets;
import com.google.common.base.Function;
import com.google.common.base.Throwables;
import com.google.common.collect.Iterables;
import com.indeed.imhotep.io.LimitedBufferedOutputStream;
import com.indeed.imhotep.io.TempFileSizeLimitExceededException;
import com.indeed.imhotep.io.WriteLimitExceededException;
import com.indeed.util.core.Throwables2;
import com.indeed.util.core.hash.MurmurHash;
import com.indeed.imhotep.api.FTGSIterator;
import com.indeed.imhotep.api.RawFTGSIterator;
import com.indeed.imhotep.service.FTGSOutputStreamWriter;
import com.indeed.util.core.io.Closeables2;
import org.apache.log4j.Logger;
import java.io.BufferedInputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Arrays;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
/**
* @author jplaisance
*/
public final class FTGSSplitter implements Runnable, Closeable {
private static final Logger log = Logger.getLogger(FTGSSplitter.class);
// Source FTGS stream that gets demultiplexed into numSplits temp-file-backed streams.
private final FTGSIterator iterator;
// Number of output partitions; each term is routed to exactly one split by hash.
private final int numSplits;
// Per-split FTGS writers, one per temp file.
private final FTGSOutputStreamWriter[] outputs;
// Per-split temp files holding the serialized FTGS stream for that split.
private final File[] files;
// Per-split buffered, size-limited streams backing the writers.
private final OutputStream[] outputStreams;
// Per-split reader iterators handed out to consumers via getFtgsIterators().
private final RawFTGSIterator[] ftgsIterators;
// Flipped exactly once by close() (via CAS) so teardown runs at most once.
private final AtomicBoolean done = new AtomicBoolean(false);
// Daemon thread executing run(); readers join() it before their first read.
private final Thread runThread;
// Number of stats per (term, group) pair in the stream.
private final int numStats;
// Multiplier used to scramble term hashes before partitioning; expected to be a large prime.
private final int largePrime;
/**
 * Starts a background daemon thread that drains {@code ftgsIterator} and partitions its
 * terms into {@code numSplits} temp-file-backed streams, readable via
 * {@link #getFtgsIterators()}.
 *
 * @param ftgsIterator          source stream to split; closed when splitting finishes or fails
 * @param numSplits             number of output partitions
 * @param numStats              number of stats per group
 * @param threadNameSuffix      suffix for the splitter thread's name (diagnostics)
 * @param largePrime            multiplier used when hashing terms to a split index
 * @param tempFileSizeBytesLeft shared budget of temp-file bytes; writes fail once exhausted
 * @throws IOException if a temp file or its streams cannot be created
 */
public FTGSSplitter(FTGSIterator ftgsIterator, final int numSplits, final int numStats, final String threadNameSuffix, final int largePrime, final AtomicLong tempFileSizeBytesLeft) throws IOException {
this.iterator = ftgsIterator;
this.numSplits = numSplits;
this.numStats = numStats;
this.largePrime = largePrime;
outputs = new FTGSOutputStreamWriter[numSplits];
files = new File[numSplits];
outputStreams = new OutputStream[numSplits];
ftgsIterators = new RawFTGSIterator[numSplits];
// Counts split readers that have been closed; the last one closes the whole splitter.
final AtomicInteger doneCounter = new AtomicInteger();
runThread = new Thread(this, "FTGSSplitterThread-"+threadNameSuffix);
runThread.setDaemon(true);
try {
for (int i = 0; i < numSplits; i++) {
files[i] = File.createTempFile("ftgsSplitter", ".tmp");
// Writes draw from the shared tempFileSizeBytesLeft budget; 64KB buffer.
outputStreams[i] = new LimitedBufferedOutputStream(new FileOutputStream(files[i]), tempFileSizeBytesLeft, 65536);
outputs[i] = new FTGSOutputStreamWriter(outputStreams[i]);
ftgsIterators[i] = new SplitterRawFTGSIterator(i, numStats, doneCounter, numSplits);
}
} catch (Throwable t) {
// Partially constructed: release whatever was opened, then rethrow
// (unwrapped as IOException when possible).
try {
close();
} finally {
throw Throwables2.propagate(t, IOException.class);
}
}
// Start the writer thread only after every split's file/stream/iterator is in place.
runThread.start();
}
/** Returns the per-split reader iterators; each blocks until the writer thread finishes. */
public RawFTGSIterator[] getFtgsIterators() {
return ftgsIterators;
}
/**
 * Writer-thread body: drains the source iterator and routes every term (with its groups
 * and stats) to the split selected by hashing the term, then closes all split writers.
 * On any failure it tears the splitter down and rethrows (unchecked).
 */
public void run() {
try {
// Use the raw byte-level term accessors when available to avoid String decode/encode.
final RawFTGSIterator rawIterator;
if (iterator instanceof RawFTGSIterator) {
rawIterator = (RawFTGSIterator) iterator;
} else {
rawIterator = null;
}
// Reusable buffer for per-group stats.
final long[] statBuf = new long[numStats];
while (iterator.nextField()) {
final boolean fieldIsIntType = iterator.fieldIsIntType();
// Every split sees every field header, even if no terms get routed to it.
for (final FTGSOutputStreamWriter output : outputs) {
output.switchField(iterator.fieldName(), fieldIsIntType);
}
while (iterator.nextTerm()) {
final FTGSOutputStreamWriter output;
final int split;
if (fieldIsIntType) {
final long term = iterator.termIntVal();
// Scramble by largePrime, mask to non-negative, drop low bits, then mod numSplits.
split = (int)((term*largePrime+12345 & Integer.MAX_VALUE) >> 16) % numSplits;
output = outputs[split];
output.switchIntTerm(term, iterator.termDocFreq());
} else {
if (rawIterator != null) {
split = hashStringTerm(rawIterator.termStringBytes(), rawIterator.termStringLength());
output = outputs[split];
output.switchBytesTerm(rawIterator.termStringBytes(), rawIterator.termStringLength(), rawIterator.termDocFreq());
} else {
// Fallback: encode the term to UTF-8 once, then hash and write those bytes.
final byte[] termStringBytes = iterator.termStringVal().getBytes(Charsets.UTF_8);
split = hashStringTerm(termStringBytes, termStringBytes.length);
output = outputs[split];
output.switchBytesTerm(termStringBytes, termStringBytes.length, iterator.termDocFreq());
}
}
// Copy all (group, stats) pairs for this term to the chosen split.
while (iterator.nextGroup()) {
output.switchGroup(iterator.group());
iterator.groupStats(statBuf);
for (long stat : statBuf) {
output.addStat(stat);
}
}
}
}
for (final FTGSOutputStreamWriter output : outputs) {
output.close();
}
} catch (Throwable t) {
// Tear everything down first so blocked readers observe closed streams.
close();
if(t instanceof WriteLimitExceededException) {
// Surface temp-file budget exhaustion under its dedicated exception type.
throw new TempFileSizeLimitExceededException(t);
}
throw Throwables.propagate(t);
} finally {
Closeables2.closeQuietly(iterator, log);
}
}
/** Maps a (byte[], length) term to a split index: MurmurHash scrambled by largePrime. */
private int hashStringTerm(byte[] termStringBytes, int termStringLength) {
return ((MurmurHash.hash32(termStringBytes, 0, termStringLength)*largePrime+12345 & 0x7FFFFFFF) >> 16) % numSplits;
}
/**
 * Idempotent shutdown: stops the writer thread (unless called from it), deletes the
 * temp files, and closes the source iterator, split writers, readers and streams.
 */
@Override
public void close() {
// CAS ensures the teardown below runs at most once, no matter who calls close().
if (done.compareAndSet(false, true)) {
try {
if (Thread.currentThread() != runThread) {
// Repeatedly interrupt and briefly join until the writer thread dies;
// the short join timeout retries interrupts that got consumed elsewhere.
while (true) {
try {
runThread.interrupt();
runThread.join(10);
if (!runThread.isAlive()) break;
} catch (InterruptedException e) {
//ignore
}
}
}
} finally {
// Close everything, logging (not throwing) individual failures: the source
// iterator, one deleter per temp file, then writers, readers and raw streams.
Closeables2.closeAll(log, iterator, Closeables2.forIterable(log, Iterables.transform(Arrays.asList(files), new Function<File, Closeable>() {
public Closeable apply(final File input) {
return new Closeable() {
public void close() throws IOException {
input.delete();
}
};
}
})), Closeables2.forArray(log, outputs), Closeables2.forArray(log, ftgsIterators), Closeables2.forArray(log, outputStreams));
}
}
}
/** True once close() has begun (or completed). */
public boolean isClosed() {
return done.get();
}
/**
 * Reader side of one split: delegates to an InputStreamFTGSIterator over the split's
 * temp file, but waits for the writer thread to finish before serving the first read.
 */
private class SplitterRawFTGSIterator implements RawFTGSIterator {
private final InputStreamFTGSIterator delegate;
// Whether we have already joined the writer thread (lazily done in getDelegate()).
private boolean initialized = false;
public SplitterRawFTGSIterator(int splitIndex, int numStats, final AtomicInteger doneCounter, final int numSplits) throws FileNotFoundException {
delegate = new InputStreamFTGSIterator(new BufferedInputStream(new FileInputStream(files[splitIndex]), 65536), numStats) {
boolean closed = false;
@Override
public void close() {
if (!closed) {
closed = true;
super.close();
// When the last split reader is closed, shut down the whole splitter.
if (doneCounter.incrementAndGet() == numSplits) {
FTGSSplitter.this.close();
}
}
}
};
// Unlink the file now that it is open; the open stream keeps the data readable.
// NOTE(review): relies on POSIX unlink-while-open semantics — on Windows this
// delete would fail while the file is held open; confirm target platforms.
files[splitIndex].delete();
}
// Blocks until the writer thread has finished producing this split's data.
private InputStreamFTGSIterator getDelegate() {
if (!initialized) {
try {
runThread.join();
} catch (InterruptedException e) {
throw Throwables.propagate(e);
}
initialized = true;
}
return delegate;
}
@Override
public boolean nextField() {
return getDelegate().nextField();
}
@Override
public String fieldName() {
return getDelegate().fieldName();
}
@Override
public boolean fieldIsIntType() {
return getDelegate().fieldIsIntType();
}
@Override
public boolean nextTerm() {
return getDelegate().nextTerm();
}
@Override
public long termDocFreq() {
return getDelegate().termDocFreq();
}
@Override
public long termIntVal() {
return getDelegate().termIntVal();
}
@Override
public String termStringVal() {
return getDelegate().termStringVal();
}
@Override
public byte[] termStringBytes() {
return getDelegate().termStringBytes();
}
@Override
public int termStringLength() {
return getDelegate().termStringLength();
}
@Override
public boolean nextGroup() {
return getDelegate().nextGroup();
}
@Override
public int group() {
return getDelegate().group();
}
@Override
public void groupStats(final long[] stats) {
getDelegate().groupStats(stats);
}
@Override
public void close() {
// Deliberately bypasses getDelegate(): closing must not block on the writer thread.
delegate.close();
}
}
}