package org.archive.wayback.hadoop;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.zip.GZIPInputStream;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.archive.wayback.util.ByteOp;
/**
 * Record reader that dereferences lines of the form
 * {@code <file> <start-offset> <length>} (single-space separated), reads the
 * named byte range from the file, decompresses it as gzip, and emits one
 * record per decompressed text line.
 *
 * <p>Keys have the form {@code file:start:lineNumber}, where the line number
 * is zero-based within the current range; values are the decompressed lines.
 */
public class GZIPRangeLineDereferencingRecordReader extends LineDereferencingRecordReader{
	/** The most recent pointer line read from the internal reader. */
	String curInputLine = null;
	/** Open stream on the current file; reused while consecutive ranges name the same file. */
	FSDataInputStream fsdis = null;
	/** Start offset of the range currently being read. */
	long curStart = 0;
	/** Scratch buffer holding the compressed range; replaced only when too small. */
	byte[] buffer = null;

	/**
	 * Advances to the next decompressed line, opening the next byte range
	 * whenever the current one is exhausted.
	 *
	 * @return {@code true} if a key/value pair was produced, {@code false}
	 *         once the internal reader has no more pointer lines
	 * @throws IOException if a pointer line is malformed, or the range cannot
	 *         be read in full or decompressed
	 * @throws InterruptedException propagated from the internal reader
	 */
	@Override
	public boolean nextKeyValue() throws IOException, InterruptedException {
		if(key == null) {
			key = new Text();
		}
		if(value == null) {
			value = new Text();
		}
		while(true) {
			if(curReader == null) {
				// are there more?
				if(!internal.nextKeyValue()) {
					// all done:
					return false;
				}
				progress = internal.getProgress();
				curInputLine = internal.getCurrentValue().toString();
				openRange(curInputLine);
			}
			// try to read another line:
			String nextLine = curReader.readLine();
			if(nextLine != null) {
				key.set(curFile+":"+curStart+":"+curLine);
				value.set(nextLine);
				curLine++;
				return true;
			}
			// current range exhausted; loop around to fetch the next one
			curReader.close();
			curReader = null;
		}
	}

	/**
	 * Parses one pointer line, loads the referenced byte range into
	 * {@link #buffer}, and points {@code curReader} at its gzip-decoded text.
	 *
	 * @param inputLine pointer line: {@code <file> <start-offset> <length>}
	 * @throws IOException if the line is malformed or the range cannot be
	 *         read in full
	 */
	private void openRange(String inputLine) throws IOException {
		String[] parts = inputLine.split(" ");
		if(parts.length != 3) {
			throw new IOException("Bad format line(" + inputLine +")");
		}
		String newFile = parts[0];
		long start;
		int length;
		try {
			start = Long.parseLong(parts[1]);
			length = Integer.parseInt(parts[2]);
		} catch(NumberFormatException e) {
			throw new IOException("Bad format line(" + inputLine +")", e);
		}
		if(start < 0 || length < 0) {
			throw new IOException("Bad format line(" + inputLine +")");
		}

		if(fsdis != null && !newFile.equals(curFile)) {
			// different file: drop the old stream. Clear the field first so a
			// failure in close() or the following open() cannot leave us
			// holding a closed stream.
			FSDataInputStream old = fsdis;
			fsdis = null;
			old.close();
		}
		if(fsdis == null) {
			fsdis = fileSystem.open(new Path(newFile));
		}
		curFile = newFile;
		curStart = start;

		if(buffer == null || buffer.length < length) {
			buffer = new byte[length];
		}
		// A plain positional read() may return fewer bytes than requested;
		// readFully() either fills the whole range or throws.
		fsdis.readFully(curStart,buffer,0,length);
		// the whole chunk is now in buffer:
		InputStream is =
			new GZIPInputStream(new ByteArrayInputStream(buffer,0,length));
		curReader = new BufferedReader(new InputStreamReader(is,ByteOp.UTF8));
		curLine = 0;
	}
}