/*
* Copyright 2015-2016 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.opencb.opencga.storage.hadoop.variant.transform;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.opencb.biodata.formats.variant.io.VariantReader;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.commons.io.DataReader;
import org.opencb.opencga.storage.hadoop.variant.archive.VariantHbaseTransformTask;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.*;
/**
* Created on 18/08/16.
*
* Group variants from the input VariantReader and make groups of variants contained in regions of size "chunkSize".
* If a variant is contained in two or more regions, it will be emitted in all of them.
*
* @author Jacobo Coll <jacobo167@gmail.com>
*/
public class VariantSliceReader implements DataReader<ImmutablePair<Long, List<Variant>>> {

    /** Number of variants requested from the underlying reader on each inner read call. */
    private static final int INNER_READ_BATCH_SIZE = 10;
    /** Stop pre-fetching from the underlying reader once this many slices are buffered. */
    private static final int MAX_BUFFERED_SLICES = 100;

    protected final Logger logger = LoggerFactory.getLogger(VariantSliceReader.class);

    private final int chunkSize;
    private final VariantReader reader;
    // chromosome -> slice start position -> variants in that slice
    // LinkedHashMap will preserve the reading order for the chromosomes
    private final LinkedHashMap<String, TreeMap<Long, List<Variant>>> bufferTree;
    // Chromosome whose buffered slices are currently being emitted. Set when the first variant is read.
    private String currentChromosome = null;
    // Number of non-empty slice lists currently buffered across all chromosomes.
    private int numSlices;

    /**
     * @param chunkSize Size of each genomic region (slice); variants are grouped by {@code start / chunkSize}
     * @param reader    Source of variants. Slices are emitted in chromosome reading order
     */
    public VariantSliceReader(int chunkSize, VariantReader reader) {
        this.chunkSize = chunkSize;
        this.reader = reader;
        bufferTree = new LinkedHashMap<>();
    }

    @Override
    public boolean open() {
        return reader.open();
    }

    @Override
    public boolean pre() {
        return reader.pre();
    }

    @Override
    public boolean post() {
        return reader.post();
    }

    @Override
    public boolean close() {
        return reader.close();
    }

    @Override
    public List<ImmutablePair<Long, List<Variant>>> read() {
        return read(1);
    }

    /**
     * Reads up to {@code batchSize} slices, each a pair of (slice start position, variants in the slice).
     * Pre-fetches variants from the underlying reader until it is exhausted or enough slices are
     * buffered, then emits the lowest-positioned buffered slice of the current chromosome.
     *
     * @param batchSize Maximum number of slices to return
     * @return Up to {@code batchSize} slices; fewer (possibly empty) when the reader is exhausted
     */
    @Override
    public List<ImmutablePair<Long, List<Variant>>> read(int batchSize) {
        List<ImmutablePair<Long, List<Variant>>> slices = new ArrayList<>(batchSize);
        while (slices.size() < batchSize) {
            // Fill the buffer. Always attempt at least one read so progress is made each iteration.
            List<Variant> read;
            do {
                read = reader.read(INNER_READ_BATCH_SIZE);
                for (Variant variant : read) {
                    addVariant(variant);
                }
            } while (!read.isEmpty() && numSlices < MAX_BUFFERED_SLICES);

            if (numSlices == 0) {
                // Nothing buffered and nothing left to read. Empty reader.
                return slices;
            }

            // If the current chromosome is exhausted, advance to the next chromosome (in reading
            // order) that still has buffered slices. numSlices > 0 guarantees one exists.
            if (bufferTree.get(currentChromosome).isEmpty()) {
                for (Map.Entry<String, TreeMap<Long, List<Variant>>> entry : bufferTree.entrySet()) {
                    if (!entry.getValue().isEmpty()) {
                        currentChromosome = entry.getKey();
                        break;
                    }
                }
            }

            // Emit the slice with the lowest start position in the current chromosome.
            TreeMap<Long, List<Variant>> map = bufferTree.get(currentChromosome);
            Long slicePosition = map.firstKey();
            List<Variant> variants = map.remove(slicePosition);
            numSlices--;
            slices.add(new ImmutablePair<>(slicePosition, variants));
        }
        return slices;
    }

    /** Buffers the variant in every slice that its [start, end] region overlaps. */
    private void addVariant(Variant variant) {
        String chromosome = variant.getChromosome();
        for (long slicePos : getCoveredSlicePositions(variant)) {
            addVariant(variant, chromosome, slicePos);
        }
    }

    private void addVariant(Variant variant, String chromosome, long slicePos) {
        // TreeMap keeps slices sorted by start position (natural Long ordering).
        TreeMap<Long, List<Variant>> positionMap = bufferTree.computeIfAbsent(chromosome, c -> new TreeMap<>());
        List<Variant> list = positionMap.computeIfAbsent(slicePos, pos -> new LinkedList<>());
        if (list.isEmpty()) {
            // New list, new slice
            numSlices++;
        }
        list.add(variant);
        if (currentChromosome == null) {
            // First variant seen: start emitting from its chromosome.
            currentChromosome = chromosome;
        }
    }

    /** Delegates the slice-position computation so grouping matches the archive writer's chunking. */
    private long[] getCoveredSlicePositions(Variant var) {
        return VariantHbaseTransformTask.getCoveredSlicePositions(var.getChromosome(), var.getStart(), var.getEnd(), chunkSize);
    }
}