package org.archive.hadoop.cdx;
import java.io.IOException;
import java.util.logging.Logger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.archive.util.binsearch.SortedTextFile;
import org.archive.util.binsearch.impl.HDFSSeekableLineReaderFactory;
import org.archive.util.iterator.BoundedStringIterator;
import org.archive.util.iterator.CloseableIterator;
public class CDXCluster {
private final static Logger LOGGER =
Logger.getLogger(CDXCluster.class.getName());
private Path clusterPath;
SortedTextFile summary;
private FileSystem fs;
private BlockLoader loader;
public CDXCluster(Configuration conf, Path clusterPath) throws IOException {
this.clusterPath = clusterPath;
fs = clusterPath.getFileSystem(conf);
loader = new HDFSBlockLoader(fs);
Path summaryPath = new Path(clusterPath,"ALL.summary");
HDFSSeekableLineReaderFactory factory =
new HDFSSeekableLineReaderFactory(fs, summaryPath);
summary = new SortedTextFile(factory);
}
public CloseableIterator<String> getRangeBlockIterator(String start, String end) throws IOException {
CloseableIterator<String> blocks =
summary.getRecordIterator(start, true);
return new BoundedStringIterator(blocks, end);
}
public byte[] loadBlock(ZipNumBlock block) throws IOException {
return loadBlock(block.shard,block.start,block.length);
}
public byte[] loadBlock(String name, long start, int length) throws IOException {
LOGGER.warning(String.format("Loading(%s,%d,%d",name,start,length));
Path shardPath = new Path(clusterPath,name + ".gz");
return loader.readBlock(shardPath.toUri().toASCIIString(), start, length);
}
public CloseableIterator<String> getRange(String start, String end) throws IOException {
return new ClusterRange(this,start,end);
}
}