package org.archive.util.binsearch;
import java.io.IOException;
import java.util.Comparator;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.archive.util.GeneralURIStreamFactory;
import org.archive.util.iterator.CloseableIterator;
public class SortedTextFile {
public static class NumericComparator implements Comparator<String>
{
@Override
public int compare(String arg0, String arg1) {
long val0 = Long.parseLong(arg0);
long val1 = Long.parseLong(arg1);
if (val0 < val1) {
return -1;
} else if (val0 == val1) {
return 0;
} else {
return 1;
}
}
};
public static class DefaultComparator implements Comparator<String>
{
@Override
public int compare(String arg0, String arg1) {
return arg0.compareTo(arg1);
}
};
public final static Comparator<String> numericComparator = new NumericComparator();
public final static Comparator<String> defaultComparator = new DefaultComparator();
private final static Logger LOGGER =
Logger.getLogger(SortedTextFile.class.getName());
protected SeekableLineReaderFactory factory;
protected int binsearchBlockSize = SeekableLineReaderFactory.BINSEARCH_BLOCK_SIZE;
public SortedTextFile(SeekableLineReaderFactory factory) {
setFactory(factory);
}
public SortedTextFile(String filename) throws IOException
{
this(filename, true);
}
public SortedTextFile(String filename, boolean useNio) throws IOException
{
this.factory = GeneralURIStreamFactory.createSeekableStreamFactory(filename, useNio);
}
protected SortedTextFile()
{
this.factory = null;
}
protected void setFactory(SeekableLineReaderFactory factory)
{
this.factory = factory;
}
public void reloadFactory()
{
try {
this.factory.reload();
} catch (IOException e) {
LOGGER.warning(e.toString());
}
}
public int getBinsearchBlockSize() {
return binsearchBlockSize;
}
public void setBinsearchBlockSize(int binsearchBlockSize) {
this.binsearchBlockSize = binsearchBlockSize;
}
public CloseableIterator<String> getRecordIteratorLT(final String prefix)
throws IOException {
return getRecordIterator(prefix, true);
}
public CloseableIterator<String> getRecordIterator(final String prefix)
throws IOException {
return getRecordIterator(prefix, false);
}
public SeekableLineReader getSLR() throws IOException
{
return factory.get();
}
public CloseableIterator<String> getRecordIterator(final long offset) throws IOException
{
SeekableLineReader slr = factory.get();
slr.seek(offset);
return new SeekableLineReaderIterator(slr);
}
public CloseableIterator<String> getRecordIterator(final String prefix,
boolean lessThan) throws IOException {
SeekableLineReader slr = factory.get();
try {
return search(slr, prefix, lessThan, defaultComparator);
} catch (IOException io) {
if (slr != null) {
slr.close();
}
throw io;
}
}
public long binaryFindOffset(SeekableLineReader slr, final String key, Comparator<String> comparator) throws IOException
{
int blockSize = binsearchBlockSize;
long fileSize = slr.getSize();
long min = 0;
long max = (long) fileSize / blockSize;
long mid;
String line;
// TODO: implement a cache of midpoints - will make a HUGE difference
// on both HTTP and HDFS
while (max - min > 1) {
mid = min + (long)((max - min) / 2);
slr.seek(mid * blockSize);
if(mid > 0) slr.skipLine(); // probably a partial line
line = slr.readLine();
if (comparator.compare(key, line) > 0) {
if(LOGGER.isLoggable(Level.FINE)) {
LOGGER.fine(String.format("Search(%d) (%s)/(%s) : After",
mid * blockSize, key,line));
}
min = mid;
} else {
if(LOGGER.isLoggable(Level.FINE)) {
LOGGER.fine(String.format("Search(%d) (%s)/(%s) : Before",
mid * blockSize, key,line));
}
max = mid;
}
}
// find the right line
min = min * blockSize;
return min;
}
public long[] getStartEndOffsets(SeekableLineReader slr, String start, String end) throws IOException
{
long endOffset = 0;
if ((end != null) && !end.isEmpty()) {
//endOffset = this.findOffset(slr, end);
endOffset = this.searchOffset(slr, end, false, defaultComparator);
} else {
endOffset = slr.getSize();
}
long startOffset = 0;
if ((start != null) && !start.isEmpty()) {
startOffset = this.searchOffset(slr, start, true, defaultComparator);
}
return new long[]{startOffset, endOffset};
}
// public CloseableIterator<String> getSplitIterator(long startOffset, long endOffset, int numSplits) throws IOException
// {
// SeekableLineReader slr = factory.get();
// return new StepSeekingIterator(slr, startOffset, endOffset, numSplits);
// }
public CloseableIterator<String> getSplitIterator(String start, String end, int numSplits) throws IOException
{
SeekableLineReader slr = factory.get();
long[] offsets = getStartEndOffsets(slr, start, end);
return new StepSeekingIterator(slr, offsets[0], offsets[1], numSplits);
}
public String[] getRange(String start, String end) throws IOException
{
SeekableLineReader slr = null;
String startLine = null;
String endLine = null;
try {
slr = factory.get();
if (start.isEmpty()) {
slr.seek(0);
startLine = slr.readLine();
} else {
startLine = search(slr, start, true, defaultComparator).next();
}
if (end.isEmpty()) {
endLine = getLastLine(slr);
} else {
endLine = search(slr, end, true, defaultComparator).next();
}
} finally {
if (slr != null) {
slr.close();
}
}
return new String[]{startLine, endLine};
}
// end exclusive
public String[] getNthSplit(String start, String end, int split, int numSplits) throws IOException
{
SeekableLineReader slr = null;
String startLine = null;
String endLine = null;
try {
slr = factory.get();
long[] offsets = getStartEndOffsets(slr, start, end);
long startOffset = offsets[0];
long diff = offsets[1] - offsets[0];
long seekDiff = (diff * split) / numSplits;
slr.seek(startOffset + seekDiff);
if ((startOffset + seekDiff) > 0) {
slr.skipLine();
}
startLine = slr.readLine();
endLine = null;
if (split <= (numSplits - 1)) {
seekDiff = (diff * (split + 1)) / numSplits;
slr.seek(startOffset + seekDiff);
slr.skipLine();
endLine = slr.readLine();
} else {
endLine = end;
}
// Last line
if (endLine == null) {
endLine = getLastLine(slr);
}
} finally {
if (slr != null) {
slr.close();
}
}
return new String[]{startLine, endLine};
}
public String getLastLine(SeekableLineReader slr) throws IOException
{
int lastLineLenTest = 0;
int lastLineLenInc = 400;
String endLine = null;
do {
lastLineLenTest += lastLineLenInc;
slr.seek(slr.getSize() - lastLineLenTest); // TODO: assume larger buffer
slr.readLine(); // skip partial line
String nextLine = null;
endLine = null;
while ((nextLine = slr.readLine()) != null) {
endLine = nextLine;
}
} while (endLine == null);
return endLine;
}
class StepSeekingIterator implements CloseableIterator<String>
{
long startOffset;
int numSplits;
long endOffset;
int currSplit;
SeekableLineReader slr;
public StepSeekingIterator(SeekableLineReader slr, long startOffset, long endOffset, int numSplits) throws IOException
{
this.slr = slr;
this.currSplit = 0;
this.startOffset = startOffset;
this.numSplits = numSplits;
this.endOffset = endOffset;
slr.seek(startOffset);
}
public boolean hasNext() {
return (currSplit < numSplits);
}
public String next() {
String line = null;
try {
if (startOffset + currSplit != 0) {
slr.skipLine();
}
line = slr.readLine();
currSplit++;
long seekDiff = ((endOffset - startOffset) * currSplit) / numSplits;
slr.seek(startOffset + seekDiff);
} catch (IOException io) {
io.printStackTrace();
}
return line;
}
public void remove() {
throw new UnsupportedOperationException();
}
public void close() throws IOException {
slr.close();
}
}
private long searchOffset(SeekableLineReader slr,
final String key, boolean lessThan, Comparator<String> comparator) throws IOException {
long offset = binaryFindOffset(slr, key, comparator);
slr.seek(offset);
String line = null;
if (offset > 0) {
slr.skipLine();
}
String prev = null;
while(true) {
if (line != null) {
offset += line.getBytes().length + 1;
}
line = slr.readLine();
if(line == null) break;
if(comparator.compare(line, key) >= 0) break;
prev = line;
}
if (lessThan && prev != null) {
offset -= prev.getBytes().length + 1;
}
return offset;
}
private CloseableIterator<String> search(SeekableLineReader slr,
final String key, boolean lessThan, Comparator<String> comparator) throws IOException {
long min = binaryFindOffset(slr, key, comparator);
if (LOGGER.isLoggable(Level.FINE)) {
LOGGER.fine(String.format("Aligning(%d)",min));
}
slr.seek(min);
String line;
if (min > 0) {
slr.skipLine();
}
String prev = null;
while(true) {
line = slr.readLine();
if (line == null) break;
if (comparator.compare(line, key) >= 0) break;
prev = line;
}
if (!lessThan) {
prev = null;
}
return new CachedStringIterator(slr, prev, line);
}
public static class CachedStringIterator implements CloseableIterator<String> {
private String first;
private String second;
private SeekableLineReader slr;
private SeekableLineReaderIterator it;
public CachedStringIterator(String first, String second) {
this.slr = null;
this.first = first;
this.second = second;
}
public CachedStringIterator(SeekableLineReader slr, String first, String second) {
this.slr = slr;
this.first = first;
this.second = second;
if (slr != null) {
it = new SeekableLineReaderIterator(slr);
}
}
public boolean hasNext() {
if(first != null) {
return true;
}
if(second != null) {
return true;
}
if (it == null) {
return false;
}
return it.hasNext();
}
public String next() {
if(first != null) {
String tmp = first;
first = null;
return tmp;
}
if(second != null) {
String tmp = second;
second = null;
return tmp;
}
if (it == null) {
return null;
}
return it.next();
}
public void remove() {
throw new UnsupportedOperationException();
}
public void close() throws IOException {
if (slr != null) {
slr.close();
}
}
}
}