package org.archive.util.binsearch;
import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.archive.util.iterator.AbstractPeekableIterator;
import org.archive.util.iterator.CloseableIterator;
public class SortedTextFile {
private final static Logger LOGGER =
Logger.getLogger(SortedTextFile.class.getName());
private SeekableLineReaderFactory factory;
/**
*
*/
public SortedTextFile(SeekableLineReaderFactory factory) {
this.factory = factory;
}
public CloseableIterator<String> getRecordIteratorLT(final String prefix)
throws IOException {
return getRecordIterator(prefix, true);
}
public CloseableIterator<String> getRecordIterator(final String prefix)
throws IOException {
return getRecordIterator(prefix, false);
}
public CloseableIterator<String> getRecordIterator(final String prefix,
boolean lessThan) throws IOException {
return search(factory.get(),prefix,lessThan);
}
private CloseableIterator<String> search(SeekableLineReader slr,
final String key, boolean lessThan) throws IOException {
int blockSize = 8192;
long fileSize = slr.getSize();
long min = 0;
long max = (long) fileSize / blockSize;
long mid;
String line;
// TODO: implement a cache of midpoints - will make a HUGE difference
// on both HTTP and HDFS
while (max - min > 1) {
mid = min + (long)((max - min) / 2);
slr.seek(mid * blockSize);
if(mid > 0) line = slr.readLine(); // probably a partial line
line = slr.readLine();
if (key.compareTo(line) > 0) {
if(LOGGER.isLoggable(Level.FINE)) {
LOGGER.fine(String.format("Search(%d) (%s)/(%s) : After",
mid * blockSize, key,line));
}
min = mid;
} else {
if(LOGGER.isLoggable(Level.FINE)) {
LOGGER.fine(String.format("Search(%d) (%s)/(%s) : Before",
mid * blockSize, key,line));
}
max = mid;
}
}
// find the right line
min = min * blockSize;
if(LOGGER.isLoggable(Level.FINE)) {
LOGGER.fine(String.format("Aligning(%d)",min));
}
slr.seek(min);
if(min > 0) line = slr.readLine();
String prev = null;
while(true) {
line = slr.readLine();
if(line == null) break;
if(line.compareTo(key) >= 0) break;
prev = line;
}
if(!lessThan) {
prev = null;
}
return new CachedStringIterator(slr, prev, line);
}
public class SeekableLineReaderIterator extends AbstractPeekableIterator<String> {
SeekableLineReader slr;
public SeekableLineReaderIterator(SeekableLineReader slr) {
this.slr = slr;
}
@Override
public String getNextInner() {
String next = null;
if(slr != null) {
try {
next = slr.readLine();
} catch (IOException e) {
e.printStackTrace();
throw new RuntimeException(e);
}
}
return next;
}
@Override
public void close() throws IOException {
slr.close();
}
}
public class CachedStringIterator implements CloseableIterator<String> {
private String first;
private String second;
private SeekableLineReader slr;
private SeekableLineReaderIterator it;
public CachedStringIterator(SeekableLineReader slr, String first, String second) {
this.slr = slr;
this.first = first;
this.second = second;
it = new SeekableLineReaderIterator(slr);
}
public boolean hasNext() {
if(first != null) {
return true;
}
if(second != null) {
return true;
}
return it.hasNext();
}
public String next() {
if(first != null) {
String tmp = first;
first = null;
return tmp;
}
if(second != null) {
String tmp = second;
second = null;
return tmp;
}
return it.next();
}
public void remove() {
throw new UnsupportedOperationException();
}
public void close() throws IOException {
slr.close();
}
}
}