/*
* Copyright 2015-2016 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.opencb.opencga.storage.hadoop.variant.archive;
import com.google.protobuf.InvalidProtocolBufferException;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.opencb.biodata.models.core.Region;
import org.opencb.biodata.models.variant.StudyEntry;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.VariantSource;
import org.opencb.biodata.models.variant.protobuf.VcfMeta;
import org.opencb.biodata.models.variant.protobuf.VcfSliceProtos;
import org.opencb.biodata.tools.variant.converters.proto.VcfRecordProtoToVariantConverter;
import org.opencb.commons.datastore.core.QueryOptions;
import org.opencb.opencga.storage.core.variant.adaptors.VariantDBIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Collections;
import java.util.Iterator;
import java.util.NoSuchElementException;
/**
* Created on 04/11/15.
*
* @author Jacobo Coll <jacobo167@gmail.com>
*/
/**
 * Iterates over the {@link Variant}s of a single file stored in the HBase archive table.
 * <p>
 * Each HBase {@link Result} row holds one protobuf {@link VcfSliceProtos.VcfSlice} for the
 * queried file column; the iterator unpacks every slice lazily and converts its
 * {@link VcfSliceProtos.VcfRecord}s into {@link Variant}s, optionally filtered by a
 * {@link Region} and capped by a row limit.
 * <p>
 * Not thread-safe.
 */
public class VariantHadoopArchiveDBIterator extends VariantDBIterator implements AutoCloseable {

    private final Logger logger = LoggerFactory.getLogger(VariantHadoopArchiveDBIterator.class);
    private final VcfRecordProtoToVariantConverter converter;
    // Maximum number of variants to return. Defaults to "unlimited" so the constructor
    // that receives no QueryOptions produces a usable iterator (previously the field
    // defaulted to 0, making next() throw immediately for that constructor).
    private long limit = Long.MAX_VALUE;
    private long count = 0;
    private Iterator<VcfSliceProtos.VcfRecord> vcfRecordIterator = Collections.emptyIterator();
    // Slice currently being consumed; provides chromosome and base position for its records.
    private VcfSliceProtos.VcfSlice vcfSlice;
    private final Iterator<Result> iterator;
    private final byte[] columnFamily;
    private final byte[] fileIdBytes;
    private ResultScanner resultScanner;
    // Region filter bounds (inclusive), in absolute genomic coordinates.
    private int startPosition = 0;
    private int endPosition = Integer.MAX_VALUE;
    // Record fetched by hasNext() and not yet consumed by next(); null if none buffered.
    private VcfSliceProtos.VcfRecord nextVcfRecord = null;

    /**
     * @param resultScanner scanner over the archive table rows for the queried file
     * @param archiveHelper provides column family, file column and file metadata
     * @param options       may carry a {@code "limit"} entry; {@code null} or a
     *                      non-positive limit means "no limit"
     */
    public VariantHadoopArchiveDBIterator(ResultScanner resultScanner, ArchiveHelper archiveHelper, QueryOptions options) {
        this.resultScanner = resultScanner;
        this.iterator = this.resultScanner.iterator();
        this.columnFamily = archiveHelper.getColumnFamily();
        this.fileIdBytes = archiveHelper.getColumn();
        VariantSource variantSource = archiveHelper.getMeta().getVariantSource();
        converter = new VcfRecordProtoToVariantConverter(StudyEntry.sortSamplesPositionMap(variantSource.getSamplesPosition()),
                variantSource.getStudyId(), variantSource.getFileId());
        // Guard against null options; 0 is mapped to "no limit" by setLimit.
        setLimit(options == null ? 0 : options.getLong("limit"));
    }

    /**
     * @param resultScanner scanner over the archive table rows for the queried file
     * @param columnFamily  HBase column family holding the slices
     * @param fileIdBytes   column qualifier of the file to read
     * @param meta          file metadata used to build the record converter
     */
    public VariantHadoopArchiveDBIterator(ResultScanner resultScanner, byte[] columnFamily, byte[] fileIdBytes, VcfMeta meta) {
        this.resultScanner = resultScanner;
        this.iterator = this.resultScanner.iterator();
        this.columnFamily = columnFamily;
        this.fileIdBytes = fileIdBytes;
        VariantSource variantSource = meta.getVariantSource();
        converter = new VcfRecordProtoToVariantConverter(StudyEntry.sortSamplesPositionMap(variantSource.getSamplesPosition()),
                variantSource.getStudyId(), variantSource.getFileId());
    }

    @Override
    public boolean hasNext() {
        if (count >= limit) {
            // Honour the Iterator contract: once the limit is reached, hasNext()
            // must report false rather than letting next() throw.
            return false;
        }
        if (nextVcfRecord == null) {
            nextVcfRecord = nextVcfRecord();
        }
        return nextVcfRecord != null;
    }

    @Override
    public Variant next() {
        if (count >= limit) {
            throw new NoSuchElementException("Limit reached");
        }
        // Consume the record buffered by hasNext(), or fetch a fresh one.
        final VcfSliceProtos.VcfRecord vcfRecord;
        if (nextVcfRecord != null) {
            vcfRecord = nextVcfRecord;
            nextVcfRecord = null;
        } else {
            vcfRecord = nextVcfRecord();
        }
        if (vcfRecord == null) {
            throw new NoSuchElementException("Limit reached");
        }
        Variant variant;
        try {
            count++;
            variant = convert(() -> converter.convert(vcfRecord, vcfSlice.getChromosome(), vcfSlice.getPosition()));
        } catch (IllegalArgumentException e) {
            // Conversion failed: log the context and fall back to a bare Variant
            // built directly from the record coordinates.
            logger.error("Error converting VcfRecord at slice position {} (relativeStart={}, relativeEnd={})",
                    vcfSlice.getPosition(), vcfRecord.getRelativeStart(), vcfRecord.getRelativeEnd(), e);
            variant = new Variant(vcfSlice.getChromosome(), vcfRecord.getRelativeStart() + vcfSlice.getPosition(),
                    vcfRecord.getReference(), vcfRecord.getAlternate());
            logger.debug("variant: {}", variant);
        }
        return variant;
    }

    /**
     * Advances to the next VcfRecord that passes the region filter, pulling and
     * parsing new slices from the scanner as the current one is exhausted.
     *
     * @return the next matching record, or {@code null} when the scanner is exhausted
     */
    private VcfSliceProtos.VcfRecord nextVcfRecord() {
        VcfSliceProtos.VcfRecord vcfRecord;
        int variantStart;
        do {
            if (!vcfRecordIterator.hasNext()) {
                if (!iterator.hasNext()) {
                    return null;
                }
                Result result = fetch(iterator::next);
                try {
                    byte[] value = result.getValue(columnFamily, fileIdBytes);
                    vcfSlice = convert(() -> VcfSliceProtos.VcfSlice.parseFrom(value));
                    vcfRecordIterator = vcfSlice.getRecordsList().iterator();
                    converter.setFields(vcfSlice.getFields());
                } catch (InvalidProtocolBufferException e) {
                    throw new RuntimeException(e);
                }
            }
            vcfRecord = vcfRecordIterator.next();
            variantStart = vcfSlice.getPosition() + vcfRecord.getRelativeStart();
            // Negative relativeStart marks a duplicated variant from a previous slice: skip it.
            // Also skip records outside the [startPosition, endPosition] region filter.
        } while (vcfRecord.getRelativeStart() < 0 || variantStart < this.startPosition || variantStart > this.endPosition);
        return vcfRecord;
    }

    @Override
    public void close() throws Exception {
        super.close();
        logger.debug("Close variant iterator. Fetch = {}ms, Convert = {}ms", getTimeFetching() / 1000000.0,
                getTimeConverting() / 1000000.0);
        resultScanner.close();
    }

    /**
     * Sets the maximum number of variants to return. Non-positive values mean "no limit".
     */
    protected VariantHadoopArchiveDBIterator setLimit(long limit) {
        this.limit = limit <= 0 ? Long.MAX_VALUE : limit;
        return this;
    }

    /**
     * Restricts the iterator to variants whose start falls within the given region.
     * A {@code null} region leaves the iterator unfiltered.
     */
    public VariantHadoopArchiveDBIterator setRegion(Region region) {
        if (region == null) {
            return this;
        }
        this.startPosition = region.getStart();
        this.endPosition = region.getEnd();
        return this;
    }

    public ResultScanner getResultScanner() {
        return resultScanner;
    }
}