package edu.berkeley.cs.succinct.streams;

import edu.berkeley.cs.succinct.SuccinctIndexedFile;
import edu.berkeley.cs.succinct.regex.RegExMatch;
import edu.berkeley.cs.succinct.regex.parser.RegExParsingException;
import edu.berkeley.cs.succinct.util.Source;
import edu.berkeley.cs.succinct.util.SuccinctConstants;
import edu.berkeley.cs.succinct.util.container.Range;
import edu.berkeley.cs.succinct.util.iterator.SearchIterator;
import edu.berkeley.cs.succinct.util.iterator.SearchRecordIterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.Path;

import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

/**
 * Stream-backed Succinct file that additionally carries a table of record start offsets,
 * allowing lookups, extraction and search at record granularity.
 *
 * <p>The offset table is read from the underlying file starting at the position where the
 * parent {@link SuccinctFileStream} data ends ({@code endOfFileStream}): first an {@code int}
 * count, then that many {@code int} offsets.
 */
public class SuccinctIndexedFileStream extends SuccinctFileStream implements SuccinctIndexedFile {

  /** Start offset of each record, indexed by record id. */
  protected transient int[] offsets;

  /** Stream position immediately after the serialized offset table. */
  protected transient long endOfIndexedFileStream;

  /**
   * Constructor to map a file containing Succinct data structures via stream.
   *
   * @param filePath Path of the file.
   * @param conf     Configuration for the filesystem.
   * @throws IOException If the file cannot be opened or the offset table cannot be read.
   */
  public SuccinctIndexedFileStream(Path filePath, Configuration conf) throws IOException {
    super(filePath, conf);
    // try-with-resources: the stream must be released even when a read fails part-way through.
    try (FSDataInputStream is = getStream(filePath)) {
      is.seek(endOfFileStream);
      int len = is.readInt();
      offsets = new int[len];
      for (int i = 0; i < len; i++) {
        offsets[i] = is.readInt();
      }
      endOfIndexedFileStream = is.getPos();
    }
  }

  /**
   * Constructor to map a file containing Succinct data structures via stream, using a default
   * {@link Configuration}.
   *
   * @param filePath Path of the file.
   * @throws IOException If the file cannot be opened or the offset table cannot be read.
   */
  public SuccinctIndexedFileStream(Path filePath) throws IOException {
    this(filePath, new Configuration());
  }

  /**
   * Get the compressed size of the indexed file.
   *
   * @return The parent's compressed size plus the size of the offset table.
   */
  @Override
  public int getCompressedSize() {
    // NOTE(review): the fixed 12-byte overhead is inherited from the original code; what it
    // accounts for is not visible in this class -- confirm against the serialization format.
    return super.getCompressedSize() + (12 + offsets.length * SuccinctConstants.INT_SIZE_BYTES);
  }

  /**
   * Map an offset in the original input to the id of the record containing it.
   *
   * <p>Performs a binary search over the record offsets, which therefore must be sorted in
   * ascending order.
   *
   * @param pos Offset into the original input.
   * @return The index of the last record whose start offset is {@code <= pos}; {@code -1} if
   *         {@code pos} precedes the first record offset or there are no records.
   */
  public int offsetToRecordId(int pos) {
    int sp = 0;
    int ep = offsets.length - 1;
    while (sp <= ep) {
      // Unsigned shift keeps the midpoint correct even if sp + ep overflows int.
      int m = (sp + ep) >>> 1;
      if (offsets[m] == pos) {
        return m;
      } else if (pos < offsets[m]) {
        ep = m - 1;
      } else {
        sp = m + 1;
      }
    }
    return ep;
  }

  /**
   * Get the number of records.
   *
   * @return The number of entries in the record offset table.
   */
  public int getNumRecords() {
    return offsets.length;
  }

  /**
   * Get the start offset of a record.
   *
   * @param recordId The record id.
   * @return The offset at which the record starts.
   * @throws ArrayIndexOutOfBoundsException If no record with the given id exists.
   */
  @Override
  public int getRecordOffset(int recordId) {
    checkRecordId(recordId);
    return offsets[recordId];
  }

  /**
   * Get the raw bytes of the record for a given recordId.
   *
   * @param recordId The record id.
   * @return The bytes of the corresponding record.
   * @throws ArrayIndexOutOfBoundsException If no record with the given id exists.
   */
  @Override
  public byte[] getRecordBytes(int recordId) {
    checkRecordId(recordId);
    int begOffset = offsets[recordId];
    int len = recordEndOffset(recordId) - begOffset - 1;
    return extractBytes(begOffset, len);
  }

  /**
   * Get random access into a record, returning raw bytes.
   *
   * <p>The requested length is clamped so the result does not run past the end of the record.
   * NOTE(review): {@code offset} is not validated; an offset beyond the record yields a negative
   * clamped length that is passed to {@code extractBytes} as-is.
   *
   * @param recordId The record id.
   * @param offset   Offset into record.
   * @param length   Number of bytes to fetch.
   * @return The extracted bytes; an empty array if {@code length} is 0.
   * @throws ArrayIndexOutOfBoundsException If no record with the given id exists.
   */
  @Override
  public byte[] extractRecordBytes(int recordId, int offset, int length) {
    checkRecordId(recordId);
    if (length == 0) {
      return new byte[0];
    }
    int begOffset = offsets[recordId] + offset;
    length = Math.min(recordEndOffset(recordId) - begOffset - 1, length);
    return extractBytes(begOffset, length);
  }

  /**
   * Get the record for a given recordId.
   *
   * @param recordId The record id.
   * @return The corresponding record.
   * @throws ArrayIndexOutOfBoundsException If no record with the given id exists.
   */
  @Override
  public String getRecord(int recordId) {
    checkRecordId(recordId);
    int begOffset = offsets[recordId];
    int len = recordEndOffset(recordId) - begOffset - 1;
    return extract(begOffset, len);
  }

  /**
   * Get random access into record.
   *
   * <p>The requested length is clamped so the result does not run past the end of the record.
   * NOTE(review): {@code offset} is not validated; an offset beyond the record yields a negative
   * clamped length that is passed to {@code extract} as-is.
   *
   * @param recordId The record id.
   * @param offset   Offset into record.
   * @param length   Number of bytes to fetch.
   * @return The extracted data; an empty string if {@code length} is 0.
   * @throws ArrayIndexOutOfBoundsException If no record with the given id exists.
   */
  @Override
  public String extractRecord(int recordId, int offset, int length) {
    checkRecordId(recordId);
    if (length == 0) {
      return "";
    }
    int begOffset = offsets[recordId] + offset;
    length = Math.min(recordEndOffset(recordId) - begOffset - 1, length);
    return extract(begOffset, length);
  }

  /**
   * Search for an input query and return ids of all matching records.
   *
   * @param query Input query.
   * @return Ids of all matching records (distinct, in no particular order).
   */
  @Override
  public Integer[] recordSearchIds(Source query) {
    Range range = bwdSearch(query);
    long sp = range.first;
    long ep = range.second;
    long numMatches = ep - sp + 1;
    if (numMatches <= 0) {
      return new Integer[0];
    }

    // A set collapses multiple matches that fall inside the same record.
    Set<Integer> results = new HashSet<>();
    for (long i = 0; i < numMatches; i++) {
      results.add(offsetToRecordId((int) lookupSA(sp + i)));
    }
    return results.toArray(new Integer[results.size()]);
  }

  /**
   * Search for an input query and return ids of all matching records.
   *
   * @param query Input query.
   * @return Ids of all matching records.
   */
  @Override
  public Integer[] recordSearchIds(final byte[] query) {
    return recordSearchIds(asSource(query));
  }

  /**
   * Search for an input query and return ids of all matching records.
   *
   * @param query Input query.
   * @return Ids of all matching records.
   */
  @Override
  public Integer[] recordSearchIds(final char[] query) {
    return recordSearchIds(asSource(query));
  }

  /**
   * Search for an input query and return an iterator over ids of all matching records.
   *
   * @param query Input query.
   * @return Iterator over ids of all matching records
   */
  @Override
  public Iterator<Integer> recordSearchIdIterator(Source query) {
    SearchIterator searchIterator = (SearchIterator) searchIterator(query);
    return new SearchRecordIterator(searchIterator, this);
  }

  /**
   * Search for an input query and return an iterator over ids of all matching records.
   *
   * @param query Input query.
   * @return Iterator over ids of all matching records
   */
  @Override
  public Iterator<Integer> recordSearchIdIterator(final byte[] query) {
    return recordSearchIdIterator(asSource(query));
  }

  /**
   * Search for an input query and return an iterator over ids of all matching records.
   *
   * @param query Input query.
   * @return Iterator over ids of all matching records
   */
  @Override
  public Iterator<Integer> recordSearchIdIterator(final char[] query) {
    return recordSearchIdIterator(asSource(query));
  }

  /**
   * Check if the two offsets belong to the same record.
   *
   * @param firstOffset  The first offset.
   * @param secondOffset The second offset.
   * @return True if the two offsets belong to the same record, false otherwise.
   */
  @Override
  public boolean sameRecord(long firstOffset, long secondOffset) {
    return offsetToRecordId((int) firstOffset) == offsetToRecordId((int) secondOffset);
  }

  /**
   * Search for all records that contain a particular regular expression.
   *
   * @param query The regular expression (UTF-8 encoded).
   * @return The records ids for records that contain the regular search expression.
   * @throws RegExParsingException If the regular expression cannot be parsed.
   */
  @Override
  public Integer[] recordSearchRegexIds(String query) throws RegExParsingException {
    Set<RegExMatch> regexOffsetResults = regexSearch(query);
    Set<Integer> recordIds = new HashSet<>();
    for (RegExMatch m : regexOffsetResults) {
      // Set.add already ignores duplicates, so no contains() pre-check is needed.
      recordIds.add(offsetToRecordId((int) m.getOffset()));
    }
    return recordIds.toArray(new Integer[recordIds.size()]);
  }

  /** Throws if {@code recordId} does not index an entry of the offset table. */
  private void checkRecordId(int recordId) {
    if (recordId >= offsets.length || recordId < 0) {
      throw new ArrayIndexOutOfBoundsException("Record does not exist: recordId = " + recordId);
    }
  }

  /**
   * Returns the upper boundary used for record length computations: the next record's start
   * offset, or {@code getOriginalSize() - 1} for the last record.
   */
  private int recordEndOffset(int recordId) {
    return (recordId == offsets.length - 1) ? getOriginalSize() - 1 : offsets[recordId + 1];
  }

  /** Wraps a byte array as a {@link Source}; each byte is widened to {@code int} as-is. */
  private static Source asSource(final byte[] query) {
    return new Source() {
      @Override
      public int length() {
        return query.length;
      }

      @Override
      public int get(int i) {
        return query[i];
      }
    };
  }

  /** Wraps a char array as a {@link Source}; each char is widened to {@code int} as-is. */
  private static Source asSource(final char[] query) {
    return new Source() {
      @Override
      public int length() {
        return query.length;
      }

      @Override
      public int get(int i) {
        return query[i];
      }
    };
  }
}