/*
* Copyright 2016 MiLaboratory.com
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.milaboratory.core.io.sequence.fasta;
import com.milaboratory.primitivio.PrimitivI;
import com.milaboratory.primitivio.PrimitivO;
import com.milaboratory.util.LongProcess;
import com.milaboratory.util.LongProcessReporter;
import gnu.trove.list.array.TIntArrayList;
import gnu.trove.list.array.TLongArrayList;
import java.io.*;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import static java.lang.Long.bitCount;
import static java.lang.Long.numberOfLeadingZeros;
public final class RandomAccessFastaIndex {
public static final String INDEX_SUFFIX = ".mifdx";
/**
* Magic integer written before serialized index
*/
public static int MAGIC = 0xDEC730C5;
/**
* Default index step = the most common HDD cluster size
*/
public static final int DEFAULT_INDEX_STEP = 4096;
/**
* Bit mask for encoded file position. Masks "letters to skip" field.
*/
public static final long SKIP_MASK = 0xFFFFFL;
/**
* Bit-shift that should be applied to encoded file position to get "file position" field.
*/
public static final int FILE_POSITION_OFFSET = 20;
/**
* Indexing step
*/
private final int indexStep;
/**
* Indexed file positions
*/
private final long[] indexArray;
/**
* Records
*/
private final IndexRecord[] records;
/**
* Pseudo-record used in String->record index to denote multiple hits
*/
private final IndexRecord MULTI_HITS_RECORD = new IndexRecord(-1, "", 0, 0);
/**
* "String id -> record" index
*/
private final Map<String, IndexRecord> idIndex = new TreeMap<>();
RandomAccessFastaIndex(InputStream is) {
try {
PrimitivI pi = new PrimitivI(is);
int magic = pi.readInt();
if (magic != MAGIC)
throw new IllegalArgumentException("Wrong stream format.");
this.indexStep = pi.readVarInt();
this.indexArray = new long[pi.readVarInt()];
if (indexArray.length == 0) {
this.records = new IndexRecord[0];
return;
}
GZIPInputStream input = new GZIPInputStream(is);
pi = new PrimitivI(input);
this.indexArray[0] = pi.readVarLong();
for (int i = 1; i < this.indexArray.length; i++)
this.indexArray[i] = this.indexArray[i - 1] + pi.readVarLong();
this.records = new IndexRecord[pi.readVarInt()];
for (int i = 0; i < records.length; i++) {
int indexStart = pi.readVarInt();
long length = pi.readVarLong();
String description = pi.readUTF();
records[i] = new IndexRecord(i, description, length, indexStart);
}
} catch (IOException e) {
throw new RuntimeException();
}
// Scanning records and filling idIndex
fillIdIndex();
}
RandomAccessFastaIndex(IndexBuilder builder) {
if (builder.lengths.size() != builder.descriptions.size())
throw new IllegalStateException();
this.indexStep = builder.indexStep;
this.indexArray = builder.index.toArray();
this.records = new IndexRecord[builder.indexIndex.size()];
for (int i = 0; i < this.records.length; i++)
records[i] = new IndexRecord(i, builder.descriptions.get(i),
builder.lengths.get(i),
builder.indexIndex.get(i));
// Scanning records and filling idIndex
fillIdIndex();
}
public int getIndexStep() {
return indexStep;
}
public int size() {
return records.length;
}
public IndexRecord getRecordByIndex(int i) {
return records[i];
}
public IndexRecord getRecordById(String id) {
id = id.trim();
IndexRecord rec = idIndex.get(id);
if (rec == MULTI_HITS_RECORD)
throw new MultipleMatchingRecordsException("Multiple matching records for \"" + id + "\".");
return rec;
}
public IndexRecord getRecordByIdCheck(String id) {
id = id.trim();
IndexRecord rec = idIndex.get(id);
if (rec == MULTI_HITS_RECORD)
throw new MultipleMatchingRecordsException("Multiple matching records for \"" + id + "\".");
if (rec == null)
throw new NoSuchRecordException("No records with id: " + id);
return rec;
}
public void write(OutputStream stream) {
try {
PrimitivO po = new PrimitivO(stream);
po.writeInt(MAGIC);
po.writeVarInt(indexStep);
po.writeVarInt(indexArray.length);
if (indexArray.length == 0)
return;
GZIPOutputStream gzipped = new GZIPOutputStream(stream);
po = new PrimitivO(gzipped);
po.writeVarLong(indexArray[0]);
for (int i = 1; i < indexArray.length; i++)
po.writeVarLong(indexArray[i] - indexArray[i - 1]);
po.writeVarInt(records.length);
for (IndexRecord record : records) {
po.writeVarInt(record.indexStart);
po.writeVarLong(record.length);
po.writeUTF(record.description);
}
gzipped.finish();
gzipped.flush();
} catch (IOException e) {
throw new RuntimeException(e);
}
}
private void fillIdIndex() {
Set<String> ids = new HashSet<>();
for (IndexRecord record : records) {
ids.clear();
extractIds(record.description, ids);
record.ids.addAll(ids);
for (String id : ids)
if (idIndex.containsKey(id))
idIndex.put(id, MULTI_HITS_RECORD);
else
idIndex.put(id, record);
}
}
private static final Pattern[] specialIds = new Pattern[]{
Pattern.compile("(lcl|bbs|gi|gb|emb|dbj|sp|pdb|pat|gnl|ref)\\|[^\\|]+"),
Pattern.compile("(gb|emb|dbj|sp|pdb|pat|gnl|ref)\\|[^\\|]+\\|[^\\|]+"),
Pattern.compile("(pir|prf)\\|\\|[^\\|]+"),
Pattern.compile("^\\S+")
};
private static void extractIds(String descriptionLine, Set<String> ids) {
// Adding full sequence description as id
ids.add(descriptionLine.trim());
// Adding all ids separated by |
for (String s : descriptionLine.split("\\|"))
ids.add(s.trim());
// Detecting and adding special ids
for (Pattern pattern : specialIds) {
Matcher matcher = pattern.matcher(descriptionLine);
while (matcher.find())
ids.add(matcher.group().trim());
}
}
public static RandomAccessFastaIndex read(InputStream stream) {
return new RandomAccessFastaIndex(stream);
}
/**
* Extracts file position part from value returned by {@link RandomAccessFastaIndex.IndexRecord#queryPosition(long)}
*
* @param p value returned by {@link RandomAccessFastaIndex.IndexRecord#queryPosition(long)}
* @return file positions
*/
public static long extractFilePosition(long p) {
return p >>> FILE_POSITION_OFFSET;
}
/**
* Extracts number of letters to skip from value returned by {@link RandomAccessFastaIndex.IndexRecord#queryPosition(long)}
*
* @param p value returned by {@link RandomAccessFastaIndex.IndexRecord#queryPosition(long)}
* @return umber of letters to skip
*/
public static int extractSkipLetters(long p) {
return (int) (p & SKIP_MASK);
}
/**
* Index fasta file with automatic step selection or load previously created index
*
* @param file file to index
* @return index
*/
public static RandomAccessFastaIndex index(Path file) {
return index(file, false);
}
/**
* Index fasta file with automatic step selection or load previously created index
*
* @param file file to index
* @param save whether to save index to {input_file_name}.mifdx file
* @return index
*/
public static RandomAccessFastaIndex index(Path file, boolean save) {
return index(file, save, LongProcessReporter.DefaultLongProcessReporter.INSTANCE);
}
/**
* Index fasta file with automatic step selection or load previously created index
*
* @param file file to index
* @param save whether to save index to {input_file_name}.mifdx file
* @param reporter indexing reporter
* @return index
*/
public static RandomAccessFastaIndex index(Path file, boolean save, LongProcessReporter reporter) {
try {
// This calculates indexStep so the final index size will not exceed 1Mb
// (approximately)
long size = Files.size(file);
long step = size / 131072;
if (step < 128)
step = 128;
int iStep = 1 << (64 - numberOfLeadingZeros(step) + (bitCount(step) > 1 ? 1 : 0));
// Index file
return index(file, iStep, save, reporter);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
/**
* Index fasta file or loads previously created index
*
* @param file file to index
* @param indexStep index step
* @param save whether to save index to {input_file_name}.mifdx file
* @return index
*/
public static RandomAccessFastaIndex index(Path file, int indexStep, boolean save) {
return index(file, indexStep, save, LongProcessReporter.DefaultLongProcessReporter.INSTANCE);
}
/**
* Index fasta file or loads previously created index
*
* @param file file to index
* @param indexStep index step
* @param save whether to save index to {input_file_name}.mifdx file
* @param reporter reporter
* @return index
*/
public static RandomAccessFastaIndex index(Path file, int indexStep, boolean save, LongProcessReporter reporter) {
Path indexFile = file.resolveSibling(file.getFileName() + INDEX_SUFFIX);
if (Files.exists(indexFile))
try (FileInputStream fis = new FileInputStream(indexFile.toFile())) {
RandomAccessFastaIndex index = RandomAccessFastaIndex.read(new BufferedInputStream(fis));
if (index.getIndexStep() != indexStep)
throw new IllegalArgumentException("Mismatched index step in " + indexFile + ". Remove the file to recreate the index.");
return index;
} catch (IOException e) {
throw new RuntimeException(e);
}
try (LongProcess lp = reporter.start("Indexing " + file.getFileName());
FileChannel fc = FileChannel.open(file, StandardOpenOption.READ)) {
// Requesting file size to correctly report status
long size = Files.size(file);
// Allocating buffer
ByteBuffer buffer = ByteBuffer.allocate(65536);
// Extracting backing byte array
byte[] bufferArray = buffer.array();
// Creating builder
StreamIndexBuilder builder = new StreamIndexBuilder(indexStep);
// Indexing file
int read;
long done = 0;
while ((read = fc.read((ByteBuffer) buffer.clear())) > 0) {
builder.processBuffer(bufferArray, 0, read);
lp.reportStatus(1.0 * (done += read) / size);
}
// Build index
RandomAccessFastaIndex index = builder.build();
if (save)
try (FileOutputStream fos = new FileOutputStream(indexFile.toFile())) {
index.write(new BufferedOutputStream(fos));
}
return index;
} catch (IOException ioe) {
throw new RuntimeException(ioe);
}
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (!(o instanceof RandomAccessFastaIndex)) return false;
RandomAccessFastaIndex that = (RandomAccessFastaIndex) o;
if (indexStep != that.indexStep) return false;
if (!Arrays.equals(indexArray, that.indexArray)) return false;
// Probably incorrect - comparing Object[] arrays with Arrays.equals
return Arrays.equals(records, that.records);
}
@Override
public int hashCode() {
int result = indexStep;
result = 31 * result + Arrays.hashCode(indexArray);
result = 31 * result + Arrays.hashCode(records);
return result;
}
public final class IndexRecord {
/**
* Sequential index
*/
private final int index;
/**
* Description line
*/
private final String description;
/**
* Sequence length
*/
private final long length;
/**
* Index of first position in indexArray
*/
private final int indexStart;
/**
* List of record ids
*/
private final List<String> ids = new ArrayList<>();
public IndexRecord(int index, String description, long length, int indexStart) {
this.index = index;
this.description = description;
this.length = length;
this.indexStart = indexStart;
}
public List<String> getIds() {
return Collections.unmodifiableList(ids);
}
/**
* Returns fasta record description line. Line that goes after '>' in record header.
*
* @return fasta record description line; line that goes after '>' in record header
*/
public String getDescription() {
return description;
}
/**
* Returns record length in letters.
*
* @return record length in letters
*/
public long getLength() {
return length;
}
/**
* Returns encoded position {@code (positionInFile << 20) | (numberOfLettersToSkip)}. Operation takes {@code
* O(1)}.
*
* @param offset sequence offset in letters
* @return encoded position {@code (positionInFile << 20) | (numberOfLettersToSkip)}
*/
public long queryPosition(long offset) {
if (offset < 0 || offset >= length)
throw new IndexOutOfBoundsException();
int indexOffset = (int) (offset / indexStep);
return (indexArray[indexStart + indexOffset] << FILE_POSITION_OFFSET) | (offset - (long) (indexOffset) * indexStep);
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (!(o instanceof IndexRecord)) return false;
IndexRecord that = (IndexRecord) o;
if (length != that.length) return false;
if (indexStart != that.indexStart) return false;
return description != null ? description.equals(that.description) : that.description == null;
}
@Override
public int hashCode() {
int result = description != null ? description.hashCode() : 0;
result = 31 * result + (int) (length ^ (length >>> 32));
result = 31 * result + indexStart;
return result;
}
}
public static final class StreamIndexBuilder {
/**
* Internal builder
*/
final IndexBuilder builder;
/**
* -1 = builder finished it's work and invalidated
*/
long currentStreamPosition = 0;
long lastNonLineBreakPosition = -1;
/**
* Counter of sequence position
*/
long currentSequencePosition = -1;
/**
* true if on first char of the line
*/
boolean onLineStart = true;
/**
* -1 = not on header
* >0 = on header
*/
int headerBufferPointer = -1;
/**
* Stores header string before record creation
*/
byte[] headerBuffer = new byte[32768];
public StreamIndexBuilder() {
this(DEFAULT_INDEX_STEP);
}
public StreamIndexBuilder(int indexStep) {
this(new IndexBuilder(indexStep), 0L);
}
public StreamIndexBuilder(int indexStep, long streamPosition) {
this(new IndexBuilder(indexStep), streamPosition);
}
public StreamIndexBuilder(IndexBuilder builder, long streamPosition) {
this.currentStreamPosition = streamPosition;
this.builder = builder;
}
public void processBuffer(String str) {
processBuffer(str.getBytes());
}
public void processBuffer(byte[] buffer) {
processBuffer(buffer, 0, buffer.length);
}
public void processBuffer(byte[] buffer, int offset, int length) {
// Throw exception if builder invalidated
if (currentStreamPosition == -1)
throw new IllegalStateException();
for (int i = 0; i < length; i++) {
long streamPosition = currentStreamPosition++;
byte b = buffer[offset + i];
// Processing line breaks
if (b == '\n' || b == '\r') {
if (!onLineStart)
lastNonLineBreakPosition = streamPosition - 1;
onLineStart = true;
continue;
}
// Detecting record header start
if (onLineStart && b == '>') {
// End of record detected
endOfRecord();
headerBufferPointer = 0;
onLineStart = false;
continue;
}
// End of record header
if (onLineStart && headerBufferPointer >= 0) {
builder.addRecord(new String(headerBuffer, 0, headerBufferPointer), streamPosition);
// We left header
headerBufferPointer = -1;
// We are at the very first letter of sequence
currentSequencePosition = 0;
}
if (headerBufferPointer == -1) {
long sequencePosition = currentSequencePosition++;
if (sequencePosition != 0 && sequencePosition % builder.indexStep == 0)
builder.addIndexPoint(streamPosition);
} else
headerBuffer[headerBufferPointer++] = b;
}
}
private void endOfRecord() {
if (builder.isOnRecord())
builder.setLastRecordLength(currentSequencePosition);
}
public RandomAccessFastaIndex build() {
endOfRecord();
currentStreamPosition = -1;
return builder.build();
}
}
public static final class IndexBuilder {
private final int indexStep;
private final List<String> descriptions = new ArrayList<>();
private final TLongArrayList lengths = new TLongArrayList();
private final TIntArrayList indexIndex = new TIntArrayList();
private final TLongArrayList index = new TLongArrayList();
public IndexBuilder() {
this(DEFAULT_INDEX_STEP);
}
public IndexBuilder(int indexStep) {
if (indexStep <= 0)
throw new IllegalArgumentException();
this.indexStep = indexStep;
}
public boolean isOnRecord() {
return lengths.size() + 1 == descriptions.size();
}
public void addRecord(String description, long position) {
if (lengths.size() != descriptions.size())
throw new IllegalStateException();
if (!index.isEmpty() && index.get(index.size() - 1) >= position)
throw new IllegalArgumentException();
descriptions.add(description);
indexIndex.add(index.size());
index.add(position);
}
public void addIndexPoint(long position) {
index.add(position);
}
public void setLastRecordLength(long length) {
if (lengths.size() + 1 != descriptions.size())
throw new IllegalStateException();
if ((index.size() - indexIndex.get(indexIndex.size() - 1) - 1) * indexStep > length)
throw new IllegalArgumentException();
lengths.add(length);
}
public RandomAccessFastaIndex build() {
return new RandomAccessFastaIndex(this);
}
}
public static final class MultipleMatchingRecordsException extends RuntimeException {
public MultipleMatchingRecordsException(String message) {
super(message);
}
}
public static final class NoSuchRecordException extends RuntimeException {
public NoSuchRecordException(String message) {
super(message);
}
}
}