package org.archive.util.binsearch;

import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.util.GeneralURIStreamFactory;
import org.archive.util.iterator.CloseableIterator;

/**
 * Line-oriented access to a text file through a {@link SeekableLineReader},
 * using a block-level binary search to locate the first line that is
 * greater than or equal to a given key.
 *
 * <p>Lines are compared with {@link String#compareTo(String)}, so the
 * searches only give meaningful results when the lines of the file are
 * sorted in that order.
 */
public class SortedTextFile {

    private static final Logger LOGGER = Logger.getLogger(SortedTextFile.class.getName());

    /** Source of the readers used for every search and iteration. */
    protected SeekableLineReaderFactory factory;

    /**
     * Creates an instance backed by the given reader factory.
     *
     * @param factory factory that supplies a new reader for each operation
     */
    public SortedTextFile(SeekableLineReaderFactory factory) {
        setFactory(factory);
    }

    /**
     * Creates an instance for the given file name or URI, with NIO enabled.
     *
     * @param filename location handed to {@link GeneralURIStreamFactory}
     * @throws IOException if the stream factory cannot be created
     */
    public SortedTextFile(String filename) throws IOException {
        this(filename, true);
    }

    /**
     * Creates an instance for the given file name or URI.
     *
     * @param filename location handed to {@link GeneralURIStreamFactory}
     * @param useNio   flag passed through to the stream factory
     * @throws IOException if the stream factory cannot be created
     */
    public SortedTextFile(String filename, boolean useNio) throws IOException {
        this.factory = GeneralURIStreamFactory.createSeekableStreamFactory(filename, useNio);
    }

    /** Creates an instance without a factory; subclasses must call {@link #setFactory}. */
    protected SortedTextFile() {
        this.factory = null;
    }

    /**
     * Replaces the reader factory.
     *
     * @param factory factory that supplies a new reader for each operation
     */
    protected void setFactory(SeekableLineReaderFactory factory) {
        this.factory = factory;
    }

    /**
     * Iterates from the last line that sorts before {@code prefix} (when the
     * search saw one), followed by the first line that is {@code >= prefix}
     * and everything after it.
     *
     * @param prefix search key
     * @return iterator owning an open reader; the caller must close it
     * @throws IOException if reading or seeking fails
     */
    public CloseableIterator<String> getRecordIteratorLT(final String prefix) throws IOException {
        return getRecordIterator(prefix, true);
    }

    /**
     * Iterates from the first line that is {@code >= prefix}.
     *
     * @param prefix search key
     * @return iterator owning an open reader; the caller must close it
     * @throws IOException if reading or seeking fails
     */
    public CloseableIterator<String> getRecordIterator(final String prefix) throws IOException {
        return getRecordIterator(prefix, false);
    }

    /**
     * Obtains a new reader from the factory.
     *
     * @return the reader; the caller is responsible for closing it
     * @throws IOException if the factory cannot supply a reader
     */
    public SeekableLineReader getSLR() throws IOException {
        return factory.get();
    }

    /**
     * Iterates over the lines read after seeking to {@code offset}.
     *
     * @param offset position to seek to before iterating
     * @return iterator owning an open reader; the caller must close it
     * @throws IOException if seeking fails; the reader is closed in that case
     */
    public CloseableIterator<String> getRecordIterator(final long offset) throws IOException {
        SeekableLineReader slr = factory.get();
        boolean handedOff = false;
        try {
            slr.seek(offset);
            CloseableIterator<String> result = new SeekableLineReaderIterator(slr);
            handedOff = true;
            return result;
        } finally {
            // Ownership only moves to the iterator on success; otherwise release here.
            if (!handedOff && slr != null) {
                slr.close();
            }
        }
    }

    /**
     * Iterates from the first line that is {@code >= prefix}, optionally
     * preceded by the last line seen that sorts before it.
     *
     * @param prefix   search key
     * @param lessThan when {@code true}, the line immediately before the
     *                 first match is returned first (if the search saw one)
     * @return iterator owning an open reader; the caller must close it
     * @throws IOException if reading or seeking fails; the reader is closed
     *                     in that case
     */
    public CloseableIterator<String> getRecordIterator(final String prefix, boolean lessThan)
            throws IOException {
        SeekableLineReader slr = factory.get();
        boolean handedOff = false;
        try {
            CloseableIterator<String> result = search(slr, prefix, lessThan);
            handedOff = true;
            return result;
        } finally {
            // Covers unchecked failures too, not only IOException.
            if (!handedOff && slr != null) {
                slr.close();
            }
        }
    }

    /**
     * Binary-searches the file in units of
     * {@link SeekableLineReaderFactory#BINSEARCH_BLOCK_SIZE} bytes.
     *
     * @param slr reader to search; its position is changed
     * @param key search key
     * @return byte offset (a multiple of the block size) of a block from
     *         which a forward scan will reach the first line {@code >= key}
     * @throws IOException if reading or seeking fails
     */
    protected long findOffset(SeekableLineReader slr, final String key) throws IOException {
        int blockSize = SeekableLineReaderFactory.BINSEARCH_BLOCK_SIZE;
        long fileSize = slr.getSize();
        long min = 0;
        long max = fileSize / blockSize;

        // TODO: implement a cache of midpoints - will make a HUGE difference
        // on both HTTP and HDFS
        while (max - min > 1) {
            long mid = min + ((max - min) / 2);
            slr.seek(mid * blockSize);
            if (mid > 0) {
                slr.readLine(); // probably a partial line
            }
            String line = slr.readLine();

            // A null line means no complete line starts at or after this block,
            // so the key cannot lie after it; treat that like "before".
            boolean keyIsAfter = (line != null) && (key.compareTo(line) > 0);
            if (keyIsAfter) {
                if (LOGGER.isLoggable(Level.FINE)) {
                    LOGGER.fine(String.format("Search(%d) (%s)/(%s) : After",
                            mid * blockSize, key, line));
                }
                min = mid;
            } else {
                if (LOGGER.isLoggable(Level.FINE)) {
                    LOGGER.fine(String.format("Search(%d) (%s)/(%s) : Before",
                            mid * blockSize, key, line));
                }
                max = mid;
            }
        }

        // Convert the block index back into a byte offset.
        return min * blockSize;
    }

    /**
     * Computes the byte range that brackets the keys {@code start} and
     * {@code end}.
     *
     * @param slr   reader to search; its position is changed
     * @param start lower key, or {@code null}/empty to start at offset 0
     * @param end   upper key, or {@code null}/empty to end at the file size
     * @return two-element array {@code {startOffset, endOffset}}
     * @throws IOException if reading or seeking fails
     */
    public long[] getStartEndOffsets(SeekableLineReader slr, String start, String end)
            throws IOException {
        long endOffset;
        if ((end != null) && !end.isEmpty()) {
            endOffset = this.searchOffset(slr, end, false);
        } else {
            endOffset = slr.getSize();
        }

        long startOffset = 0;
        if ((start != null) && !start.isEmpty()) {
            startOffset = this.searchOffset(slr, start, true);
        }

        return new long[]{startOffset, endOffset};
    }

    /**
     * Returns an iterator that yields one line per split, sampled at evenly
     * spaced byte offsets between the positions of {@code start} and
     * {@code end}.
     *
     * @param start     lower key, or {@code null}/empty for the file start
     * @param end       upper key, or {@code null}/empty for the file end
     * @param numSplits number of samples to produce
     * @return iterator owning an open reader; the caller must close it
     * @throws IOException if reading or seeking fails; the reader is closed
     *                     in that case
     */
    public CloseableIterator<String> getSplitIterator(String start, String end, int numSplits)
            throws IOException {
        SeekableLineReader slr = factory.get();
        boolean handedOff = false;
        try {
            long[] offsets = getStartEndOffsets(slr, start, end);
            CloseableIterator<String> result =
                    new StepSeekingIterator(slr, offsets[0], offsets[1], numSplits);
            handedOff = true;
            return result;
        } finally {
            if (!handedOff && slr != null) {
                slr.close();
            }
        }
    }

    /**
     * Returns the boundary lines of one split of the range between
     * {@code start} and {@code end}.
     *
     * @param start     lower key, or {@code null}/empty for the file start
     * @param end       upper key, or {@code null}/empty for the file end
     * @param split     zero-based index of the requested split
     * @param numSplits total number of splits; must not be zero
     * @return two-element array {@code {startLine, endLine}}; either element
     *         may be {@code null} when the reader hits end of input
     * @throws IOException if reading or seeking fails
     */
    public String[] getNthSplit(String start, String end, int split, int numSplits)
            throws IOException {
        SeekableLineReader slr = null;
        String startLine = null;
        String endLine = null;

        try {
            slr = factory.get();

            long[] offsets = getStartEndOffsets(slr, start, end);
            long startOffset = offsets[0];
            long diff = offsets[1] - offsets[0];

            long seekDiff = (diff * split) / numSplits;
            slr.seek(startOffset + seekDiff);

            // Anywhere but the very start we may have landed mid-line: skip the remainder.
            if ((startOffset + seekDiff) > 0) {
                slr.readLine();
            }
            startLine = slr.readLine();

            // NOTE(review): this condition also holds for the last split
            // (split == numSplits - 1), so 'end' is only used verbatim when
            // split >= numSplits - confirm that is intended.
            if (split <= (numSplits - 1)) {
                seekDiff = (diff * (split + 1)) / numSplits;
                slr.seek(startOffset + seekDiff);
                slr.readLine();
                endLine = slr.readLine();
            } else {
                endLine = end;
            }
        } finally {
            if (slr != null) {
                slr.close();
            }
        }

        return new String[]{startLine, endLine};
    }

    /**
     * Iterator that returns one line per split, re-seeking the reader to the
     * next evenly spaced offset after every line.
     */
    class StepSeekingIterator implements CloseableIterator<String> {
        long startOffset;
        int numSplits;
        long endOffset;
        int currSplit;
        SeekableLineReader slr;

        /** Offset the reader was last positioned at by this iterator. */
        private long position;

        /**
         * @param slr         reader to sample from; closed by {@link #close()}
         * @param startOffset offset of the first sample
         * @param endOffset   offset marking the end of the sampled range
         * @param numSplits   number of samples to produce
         * @throws IOException if the initial seek fails
         */
        public StepSeekingIterator(SeekableLineReader slr, long startOffset, long endOffset,
                int numSplits) throws IOException {
            this.slr = slr;
            this.currSplit = 0;
            this.startOffset = startOffset;
            this.numSplits = numSplits;
            this.endOffset = endOffset;
            this.position = startOffset;
            slr.seek(startOffset);
        }

        public boolean hasNext() {
            return (currSplit < numSplits);
        }

        /**
         * Reads the line for the current split and positions the reader for
         * the next one. If an I/O error occurs it is logged and the iteration
         * ends, so a failing reader cannot keep {@link #hasNext()} true forever.
         *
         * @return the sampled line, or {@code null} at end of input or when
         *         the read failed
         */
        public String next() {
            String line = null;
            try {
                // Skip the (probably partial) line unless positioned at the file start.
                if (position > 0) {
                    slr.readLine();
                }
                line = slr.readLine();

                currSplit++;
                long seekDiff = ((endOffset - startOffset) * currSplit) / numSplits;
                position = startOffset + seekDiff;
                slr.seek(position);
            } catch (IOException io) {
                LOGGER.log(Level.WARNING, "I/O error while stepping through splits", io);
                // The reader position is unknown now; stop instead of looping on errors.
                currSplit = numSplits;
            }
            return line;
        }

        public void remove() {
            throw new UnsupportedOperationException();
        }

        public void close() throws IOException {
            slr.close();
        }
    }

    /**
     * Finds a byte offset just before the first line that is {@code >= key},
     * or, when {@code lessThan} is set, just before the line preceding it.
     *
     * <p>The result is two bytes short of the line start so that a caller
     * who seeks there and discards one line ends up on the boundary; it is
     * never negative.
     *
     * @param slr      reader to search; its position is changed
     * @param key      search key
     * @param lessThan whether to back up by the last line that sorts before
     *                 {@code key}, when the scan saw one
     * @return non-negative byte offset
     * @throws IOException if reading or seeking fails
     */
    private long searchOffset(SeekableLineReader slr, final String key, boolean lessThan)
            throws IOException {
        long offset = findOffset(slr, key);
        slr.seek(offset);

        String line = null;
        if (offset > 0) {
            line = slr.readLine(); // probably a partial line
        }

        String prev = null;
        while (true) {
            // Advance past the line consumed on the previous pass.
            // NOTE(review): getBytes() uses the platform charset and the "+ 1"
            // assumes a one-byte line terminator - confirm both match the reader.
            if (line != null) {
                offset += line.getBytes().length + 1;
            }

            line = slr.readLine();
            if (line == null || line.compareTo(key) >= 0) {
                break;
            }
            prev = line;
        }

        // prev is null when the first complete line already matched; nothing to back up over.
        if (lessThan && prev != null) {
            offset -= prev.getBytes().length + 1;
        }

        // To allow for skipping the line, in case we're not on the boundary.
        // Clamped so a match at the very start never yields a negative seek target.
        return Math.max(0L, offset - 2);
    }

    /**
     * Positions the reader on the first line that is {@code >= key} and wraps
     * it in an iterator.
     *
     * @param slr      reader to search; owned by the returned iterator
     * @param key      search key
     * @param lessThan whether the iterator should first return the last line
     *                 seen that sorts before {@code key}
     * @return iterator over the optional previous line, the first matching
     *         line, and the rest of the reader
     * @throws IOException if reading or seeking fails
     */
    private CloseableIterator<String> search(SeekableLineReader slr, final String key,
            boolean lessThan) throws IOException {
        long min = findOffset(slr, key);
        if (LOGGER.isLoggable(Level.FINE)) {
            LOGGER.fine(String.format("Aligning(%d)", min));
        }

        slr.seek(min);
        if (min > 0) {
            slr.readLine(); // probably a partial line
        }

        String line;
        String prev = null;
        while (true) {
            line = slr.readLine();
            if (line == null || line.compareTo(key) >= 0) {
                break;
            }
            prev = line;
        }

        if (!lessThan) {
            prev = null;
        }

        return new CachedStringIterator(slr, prev, line);
    }

    /**
     * Iterator that returns up to two already-read lines before continuing
     * with the remaining lines of the reader.
     */
    public class CachedStringIterator implements CloseableIterator<String> {
        private String first;
        private String second;
        private SeekableLineReader slr;
        private SeekableLineReaderIterator it;

        /**
         * @param slr    reader supplying the remaining lines; closed by {@link #close()}
         * @param first  line returned first, or {@code null} to skip it
         * @param second line returned next, or {@code null} to skip it
         */
        public CachedStringIterator(SeekableLineReader slr, String first, String second) {
            this.slr = slr;
            this.first = first;
            this.second = second;
            it = new SeekableLineReaderIterator(slr);
        }

        public boolean hasNext() {
            if (first != null || second != null) {
                return true;
            }
            return it.hasNext();
        }

        public String next() {
            if (first != null) {
                String tmp = first;
                first = null;
                return tmp;
            }
            if (second != null) {
                String tmp = second;
                second = null;
                return tmp;
            }
            return it.next();
        }

        public void remove() {
            throw new UnsupportedOperationException();
        }

        public void close() throws IOException {
            slr.close();
        }
    }
}