/* * Copyright 2015-2016 OpenCB * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Converter to convert Archive Result to Variants */ package org.opencb.opencga.storage.hadoop.variant.archive; import com.google.protobuf.InvalidProtocolBufferException; import org.apache.hadoop.hbase.Cell; import org.apache.hadoop.hbase.CellUtil; import org.apache.hadoop.hbase.client.Result; import org.apache.hadoop.hbase.util.Bytes; import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.models.variant.protobuf.VcfSliceProtos.VcfSlice; import org.opencb.biodata.tools.variant.converters.proto.VcfSliceToVariantListConverter; import org.opencb.opencga.storage.core.metadata.StudyConfiguration; import org.opencb.opencga.storage.hadoop.variant.GenomeHelper; import org.opencb.opencga.storage.hadoop.variant.archive.mr.VariantLocalConflictResolver; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.*; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.CopyOnWriteArrayList; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicReference; import java.util.function.Function; import java.util.function.Predicate; import java.util.stream.Collector; import java.util.stream.Collectors; import java.util.stream.Stream; /** * @author Matthias Haimel mh719+git@cam.ac.uk * */ public class ArchiveResultToVariantConverter { private final Logger LOG = LoggerFactory.getLogger(ArchiveResultToVariantConverter.class); private final int studyId; private final AtomicReference<StudyConfiguration> sc = new AtomicReference<>(); private byte[] columnFamily; private volatile ConcurrentHashMap<Integer, VcfSliceToVariantListConverter> fileidToConverter = new ConcurrentHashMap<>(); private final AtomicBoolean parallel = new AtomicBoolean(false); public ArchiveResultToVariantConverter(int studyId, byte[] columnFamily, StudyConfiguration sc) { this.studyId = studyId; this.columnFamily = columnFamily; this.sc.set(sc); } public void setParallel(boolean parallel) { this.parallel.set(parallel); } public boolean isParallel() { return parallel.get(); } public StudyConfiguration getSc() { return sc.get(); } public List<Variant> convert(Result value, Long start, Long end, boolean resolveConflict) throws IllegalStateException { return convert(value, resolveConflict, var -> variantCoveringRegion(var, start, end, true)); } public static boolean variantCoveringRegion(Variant v, Long start, Long end, boolean inclusive) { int iStart = start.intValue(); int iEnd = end.intValue(); if (inclusive) { return iEnd >= v.getStart() && iStart <= v.getEnd(); } else { return iEnd > v.getStart() && iStart < v.getEnd(); } } public List<Variant> convert(Result value, boolean resolveConflict) throws IllegalStateException { return convert(value, resolveConflict, Variant -> true); // Default -> use all } public List<Variant> convert(Result value, boolean resolveConflict, Predicate<Variant> positionFilter) throws IllegalStateException { Stream<Cell> cellStream = Arrays.stream(value.rawCells()).filter(c -> Bytes.equals(CellUtil.cloneFamily(c), columnFamily)) .filter(c -> !Bytes.startsWith(CellUtil.cloneQualifier(c), GenomeHelper.VARIANT_COLUMN_B_PREFIX)); Function<Cell, Stream<? extends Variant>> cellStreamFunction = c -> { try { final List<Variant> variants = archiveCellToVariants( CellUtil.cloneQualifier(c), CellUtil.cloneValue(c)); if (resolveConflict) { return resolveConflicts(variants).stream().filter(positionFilter); } return variants.stream().filter(positionFilter); } catch (InvalidProtocolBufferException e) { throw new IllegalStateException(e); } }; Collector<Variant, ?, List<Variant>> toList = Collectors.toCollection(CopyOnWriteArrayList::new); if (this.isParallel()) { // if parallel return cellStream.parallel().flatMap(cellStreamFunction).collect(toList); } return cellStream.flatMap(cellStreamFunction).collect(toList); } private List<Variant> archiveCellToVariants(byte[] key, byte[] value) throws InvalidProtocolBufferException { int fileId = ArchiveHelper.getFileIdFromColumnName(key); VcfSliceToVariantListConverter converter = loadConverter(fileId); VcfSlice vcfSlice = VcfSlice.parseFrom(value); return converter.convert(vcfSlice); } private VcfSliceToVariantListConverter loadConverter(int fileId) { return fileidToConverter.computeIfAbsent(fileId, key -> { LinkedHashSet<Integer> sampleIds = getSc().getSamplesInFiles().get(fileId); Map<String, Integer> thisFileSamplePositions = new LinkedHashMap<>(); for (Integer sampleId : sampleIds) { String sampleName = getSc().getSampleIds().inverse().get(sampleId); thisFileSamplePositions.put(sampleName, thisFileSamplePositions.size()); } VcfSliceToVariantListConverter converter = new VcfSliceToVariantListConverter( thisFileSamplePositions, Integer.toString(fileId), Integer.toString(studyId)); return converter; }); } /** * Resolve Conflict per file. * @param variants sorted list of variants * @return Valid set of variants without conflicts (each position only represented once) */ public List<Variant> resolveConflicts(List<Variant> variants) { return new VariantLocalConflictResolver().resolveConflicts(variants); } }