package hadoop.extensions;
import java.io.ByteArrayOutputStream;
import java.io.EOFException;
import java.io.IOException;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
/**
 * Record reader that walks a gzip-compressed tar archive and emits one record
 * per archive entry whose name ends with {@code ".csv"}.
 *
 * <p>The key is {@code <archive file name>/<entry name>} and the value holds
 * the complete bytes of that entry. Entries with any other suffix are skipped.
 * Each entry is buffered fully in memory before it is returned.
 */
public class WebTableRecordReader extends
        RecordReader<Text, BytesWritable> {

    /** Size of the scratch buffer used when copying an entry's content. */
    private static final int BUFFER_SIZE = 8192;

    private boolean isFinished = false;
    private FSDataInputStream fin;
    private GzipCompressorInputStream gzIn;
    private TarArchiveInputStream tarIn;
    private BytesWritable currentValue;
    private Text currentKey;
    private Path filePath;

    /**
     * Closes the tar, gzip and file streams, outermost first.
     *
     * <p>Closing is best-effort: every stream is attempted independently, so a
     * failure (or a stream that was never opened) does not prevent the
     * remaining streams from being closed. Failures are not propagated.
     */
    @Override
    public void close() throws IOException {
        closeQuietly(tarIn);
        closeQuietly(gzIn);
        closeQuietly(fin);
        tarIn = null;
        gzIn = null;
        fin = null;
    }

    /**
     * Closes a single resource, ignoring {@code null} and any failure.
     *
     * @param resource the resource to close; may be {@code null}
     */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (Exception ignored) {
            // Best-effort cleanup: there is nothing useful the caller could do.
        }
    }

    /**
     * @return the key of the most recently read record, or {@code null} if no
     *         record has been read yet
     */
    @Override
    public Text getCurrentKey() throws IOException,
            InterruptedException {
        return currentKey;
    }

    /**
     * @return the value of the most recently read record, or {@code null} if
     *         no record has been read yet
     */
    @Override
    public BytesWritable getCurrentValue() throws IOException,
            InterruptedException {
        return currentValue;
    }

    /**
     * Reports coarse progress only.
     *
     * @return {@code 1} once reading has stopped, {@code 0} before that
     */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        return isFinished ? 1 : 0;
    }

    /**
     * Opens the split's file and layers gzip and tar decoding on top of it.
     *
     * <p>The split is read from the start of the file; its offset and length
     * are not consulted. If any stream in the chain fails to open, the ones
     * already opened are closed before the exception propagates.
     *
     * @param arg0 the input split; must be a {@link FileSplit}
     * @param arg1 task context supplying the configuration
     * @throws IOException if the file cannot be opened or decoded
     */
    @Override
    public void initialize(InputSplit arg0, TaskAttemptContext arg1)
            throws IOException, InterruptedException {
        FileSplit split = (FileSplit) arg0;
        Configuration conf = arg1.getConfiguration();
        filePath = split.getPath();
        FileSystem fs = filePath.getFileSystem(conf);
        isFinished = false;

        boolean opened = false;
        try {
            fin = fs.open(filePath);
            gzIn = new GzipCompressorInputStream(fin);
            tarIn = new TarArchiveInputStream(gzIn);
            opened = true;
        } finally {
            if (!opened) {
                // Do not leak the underlying file handle when a wrapper
                // constructor rejects the input (e.g. not a gzip stream).
                close();
            }
        }
    }

    /**
     * Advances to the next {@code .csv} entry in the archive.
     *
     * <p>If the entry's content ends prematurely ({@link EOFException}) the
     * behaviour depends on {@code WebTableInputFormat.getLenient()}: when it
     * returns {@code false} the exception is rethrown; otherwise the partial
     * entry is discarded and reading stops as if the archive had ended.
     *
     * @return {@code true} if a record was read and is available through
     *         {@link #getCurrentKey()} / {@link #getCurrentValue()};
     *         {@code false} when there are no more records
     * @throws IOException if the archive cannot be read
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        while (true) {
            TarArchiveEntry entry = tarIn.getNextTarEntry();
            if (entry == null) {
                isFinished = true;
                return false;
            }

            String entryName = entry.getName();
            if (!entryName.endsWith(".csv")) {
                continue;
            }

            ByteArrayOutputStream bos = new ByteArrayOutputStream();
            byte[] temp = new byte[BUFFER_SIZE];
            while (true) {
                int bytesRead;
                try {
                    bytesRead = tarIn.read(temp, 0, temp.length);
                } catch (EOFException e) {
                    if (!WebTableInputFormat.getLenient()) {
                        throw e;
                    }
                    // Lenient mode: treat a truncated archive as end of input
                    // and make sure progress reflects that reading stopped.
                    isFinished = true;
                    return false;
                }
                if (bytesRead <= 0) {
                    break;
                }
                bos.write(temp, 0, bytesRead);
            }

            currentValue = new BytesWritable(bos.toByteArray());
            currentKey = new Text(filePath.getName() + "/" + entryName);
            return true;
        }
    }
}