/*
 * Copyright 2015-2016 OpenCB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 *
 */
package org.opencb.opencga.storage.hadoop.variant.index;

import com.google.common.base.Objects;
import com.google.protobuf.InvalidProtocolBufferException;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.phoenix.schema.types.PUnsignedIntArray;
import org.apache.phoenix.schema.types.PhoenixArray;
import org.opencb.biodata.models.feature.Genotype;
import org.opencb.biodata.models.variant.StudyEntry;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.protobuf.VariantProto;
import org.opencb.biodata.models.variant.protobuf.VariantProto.AlternateCoordinate;
import org.opencb.biodata.models.variant.protobuf.VariantProto.VariantType;
import org.opencb.biodata.tools.variant.merge.VariantMerger;
import org.opencb.opencga.storage.hadoop.variant.GenomeHelper;
import org.opencb.opencga.storage.hadoop.variant.index.phoenix.PhoenixHelper;
import org.opencb.opencga.storage.hadoop.variant.index.phoenix.VariantPhoenixHelper;
import org.opencb.opencga.storage.hadoop.variant.models.protobuf.*;
import org.opencb.opencga.storage.hadoop.variant.models.protobuf.ComplexFilter.Builder;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.sql.Array;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.sql.SQLException;
import java.util.*;
import java.util.Map.Entry;
import java.util.function.Function;
import java.util.stream.Collectors;

import static org.opencb.biodata.tools.variant.merge.VariantMerger.GT_KEY;

/**
 * One variant of one study as stored in the variant table: counters (HOM_REF, PASS, CALL), the sample ids per
 * genotype column, the "complex" genotypes together with the secondary alternates, and the samples per non-PASS
 * filter value.
 *
 * <p>Columns are named {@code <studyId>_<column>} (see {@link #buildColumnKey(Integer, String)}).</p>
 *
 * @author Matthias Haimel mh719+git@cam.ac.uk
 */
public class VariantTableStudyRow {

    public static final String NOCALL = ".";
    public static final String HOM_REF = "0/0";
    public static final String HET_REF = "0/1";
    public static final String HOM_VAR = "1/1";
    public static final String OTHER = "?";
    public static final String COMPLEX = "X";
    public static final String PASS_CNT = "P";
    public static final String FILTER_OTHER = "F";
    public static final String CALL_CNT = "C";

    /** All column suffixes written per study; used by {@link #createDelete(VariantTableHelper)}. */
    public static final List<String> STUDY_COLUMNS = Collections.unmodifiableList(
            Arrays.asList(NOCALL, HOM_REF, HET_REF, HOM_VAR, OTHER, COMPLEX, PASS_CNT, CALL_CNT, FILTER_OTHER));
    public static final List<String> GENOTYPE_COLUMNS = Collections.unmodifiableList(Arrays.asList(NOCALL, HET_REF, HOM_VAR, OTHER));

    public static final char COLUMN_KEY_SEPARATOR = '_';

    private Integer studyId;
    private Integer homRefCount = 0;
    private Integer passCount = 0;
    private Integer callCount = 0;
    private final String chromosome;
    private final int pos;
    private final String ref;
    private final String alt;
    private final org.opencb.biodata.models.variant.avro.VariantType type;

    /** Genotype column (e.g. {@link #HET_REF}) to the sample ids having it. */
    private Map<String, Set<Integer>> callMap = new HashMap<>();
    /** Sample id to its genotype string, for samples stored in the {@link #COMPLEX} column. */
    private Map<Integer, String> sampleToGenotype = new HashMap<>();
    /** Non-PASS filter value to the sample ids having it. */
    private Map<String, Set<Integer>> filterToSamples = new HashMap<>();
    private List<AlternateCoordinate> secAlternate = new ArrayList<>();

    /**
     * Creates an empty row (all counters 0, no samples) for the given variant coordinates.
     *
     * @param studyId Study id
     * @param chr     Chromosome
     * @param pos     Start position
     * @param ref     Reference allele
     * @param alt     Alternate allele
     * @param type    Variant type
     */
    public VariantTableStudyRow(Integer studyId, String chr, int pos, String ref, String alt,
                                org.opencb.biodata.models.variant.avro.VariantType type) {
        this.studyId = studyId;
        this.chromosome = chr;
        this.pos = pos;
        this.ref = ref;
        this.alt = alt;
        this.type = type;
    }

    /**
     * Copy constructor. The sample id sets are copied, so adding samples to the copy does not change {@code row}.
     *
     * @param row Row to copy
     */
    public VariantTableStudyRow(VariantTableStudyRow row) {
        this(row.studyId, row.chromosome, row.pos, row.ref, row.alt, row.type);
        this.homRefCount = row.homRefCount;
        this.callCount = row.callCount;
        this.passCount = row.passCount;
        for (Entry<String, Set<Integer>> entry : row.callMap.entrySet()) {
            this.callMap.put(entry.getKey(), new HashSet<>(entry.getValue()));
        }
        this.secAlternate.addAll(row.secAlternate != null ? row.secAlternate : Collections.emptyList());
        this.sampleToGenotype.putAll(row.sampleToGenotype != null ? row.sampleToGenotype : Collections.emptyMap());
        // The filter information is part of the row state as well; without it the copy would lose all non-PASS filters.
        if (row.filterToSamples != null) {
            for (Entry<String, Set<Integer>> entry : row.filterToSamples.entrySet()) {
                this.filterToSamples.put(entry.getKey(), new HashSet<>(entry.getValue()));
            }
        }
    }

    /**
     * Creates a row from its protobuf representation (see {@link #toProto()}).
     *
     * @param proto      Protobuf row
     * @param chromosome Chromosome (not part of the protobuf row)
     * @param studyId    Study id (not part of the protobuf row)
     */
    public VariantTableStudyRow(VariantTableStudyRowProto proto, String chromosome, Integer studyId) {
        this.studyId = studyId;
        this.chromosome = chromosome;
        this.pos = proto.getStart();
        this.ref = proto.getReference();
        this.alt = proto.getAlternate();
        this.type = toAvro(proto.getType());
        this.callCount = proto.getCallCount();
        this.passCount = proto.getPassCount();
        this.homRefCount = proto.getHomRefCount();
        this.callMap = new HashMap<>(4);
        callMap.put(HOM_VAR, new HashSet<>(proto.getHomVarList()));
        callMap.put(HET_REF, new HashSet<>(proto.getHetList()));
        callMap.put(NOCALL, new HashSet<>(proto.getNocallList()));
        callMap.put(OTHER, new HashSet<>(proto.getOtherList()));
        for (Map.Entry<String, SampleList> entry : proto.getOtherGt().entrySet()) {
            String gt = entry.getKey();
            for (Integer sid : entry.getValue().getSampleIdsList()) {
                sampleToGenotype.put(sid, gt);
            }
        }
        this.filterToSamples = proto.getFilterNonPass().entrySet().stream()
                .collect(Collectors.toMap(Entry::getKey, e -> new HashSet<>(e.getValue().getSampleIdsList())));
        // Copy into a mutable list: repeated protobuf fields are exposed as unmodifiable lists, and
        // setComplexVariant(..) appends to this list.
        this.secAlternate = new ArrayList<>(proto.getSecondaryAlternateList());
    }

    /**
     * Calls {@link #VariantTableStudyRow(Integer, String, int, String, String,
     * org.opencb.biodata.models.variant.avro.VariantType)} using the Variant information.
     *
     * @param studyId Study id
     * @param variant Variant to extract the region from
     */
    public VariantTableStudyRow(Integer studyId, Variant variant) {
        this(studyId, variant.getChromosome(), variant.getStart(), variant.getReference(), variant.getAlternate(), variant.getType());
    }

    public int getPos() {
        return pos;
    }

    /**
     * @return Protobuf object holding the samples per non-PASS filter value (content of the {@link #FILTER_OTHER} column)
     */
    public ComplexFilter getComplexFilter() {
        Builder b = ComplexFilter.newBuilder();
        Map<String, SampleList> map = toSampleListMap(this.filterToSamples);
        b.putAllFilterNonPass(map);
        return b.build();
    }

    /**
     * Adds the filter values of {@code cf} to this row. An existing entry for the same filter value is replaced.
     *
     * @param cf Filter information to add
     */
    private void setComplexFilter(ComplexFilter cf) {
        Map<String, Set<Integer>> map = cf.getFilterNonPass().entrySet().stream().collect(
                Collectors.toMap(Entry::getKey, e -> new HashSet<>(e.getValue().getSampleIdsList())));
        this.filterToSamples.putAll(map);
    }

    /**
     * @return Protobuf object holding the sample-to-genotype map and the secondary alternates (content of the
     * {@link #COMPLEX} column)
     */
    public ComplexVariant getComplexVariant() {
        return ComplexVariant.newBuilder()
                .putAllSampleToGenotype(this.sampleToGenotype)
                .addAllSecondaryAlternates(this.secAlternate).build();
    }

    /**
     * Adds the sample genotypes and secondary alternates of {@code complexVariant} to this row.
     *
     * @param complexVariant Complex variant information to add
     */
    public void setComplexVariant(ComplexVariant complexVariant) {
        Map<Integer, String> map = complexVariant.getSampleToGenotype();
        if (map != null && map.size() > 0) {
            this.sampleToGenotype.putAll(map);
        }
        List<AlternateCoordinate> secAlt = complexVariant.getSecondaryAlternatesList();
        if (secAlt != null && !secAlt.isEmpty()) {
            this.secAlternate.addAll(secAlt);
        }
    }

    /**
     * @return Genotype columns currently registered in this row (a view on the internal map keys)
     */
    public Set<String> getGenotypes() {
        return callMap.keySet();
    }

    /**
     * @param gt Genotype column
     * @return Sample ids registered for {@code gt}, or an empty set if there are none
     */
    public Set<Integer> getSampleIds(String gt) {
        Set<Integer> set = this.callMap.get(gt);
        if (null == set) {
            return Collections.emptySet();
        }
        return set;
    }

    public Set<Integer> getSampleIds(Genotype gt) {
        return getSampleIds(gt.toString());
    }

    public Integer getStudyId() {
        return studyId;
    }

    /**
     * Adds all sample ids to the given genotype. In contrast to {@link #addSampleId(String, Integer)}, sample ids
     * which are already registered for the genotype are silently ignored.
     *
     * @param gt        Genotype code for the samples
     * @param sampleIds Sample numeric codes
     */
    public void addSampleId(String gt, Collection<Integer> sampleIds) {
        this.callMap.computeIfAbsent(gt, key -> new HashSet<>()).addAll(sampleIds);
    }

    /**
     * Adds one sample id to the given genotype.
     *
     * @param gt       Genotype code for the sample
     * @param sampleId Sample numeric code
     * @throws IllegalStateException in case the sample already exists in the collection
     */
    public void addSampleId(String gt, Integer sampleId) {
        Set<Integer> set = this.callMap.computeIfAbsent(gt, key -> new HashSet<>());
        if (!set.add(sampleId)) {
            throw new IllegalStateException(String.format("Sample id %s already in gt set %s", sampleId, gt));
        }
    }

    public byte[] generateRowKey(VariantTableHelper helper) {
        return helper.generateVariantRowKey(this.chromosome, this.pos, this.ref, this.alt);
    }

    public void addHomeRefCount(Integer cnt) {
        this.homRefCount += cnt;
    }

    public Integer getHomRefCount() {
        return homRefCount;
    }

    public void addPassCount(Integer cnt) {
        passCount += cnt;
    }

    public Integer getPassCount() {
        return passCount;
    }

    public void setPassCount(Integer passCount) {
        this.passCount = passCount;
    }

    public void addCallCount(Integer cnt) {
        callCount += cnt;
    }

    public void setCallCount(Integer callCount) {
        this.callCount = callCount;
    }

    public Integer getCallCount() {
        return callCount;
    }

    public void setHomRefCount(Integer homRefCount) {
        this.homRefCount = homRefCount;
    }

    public String getAlt() {
        return alt;
    }

    public String getRef() {
        return ref;
    }

    public String getChromosome() {
        return chromosome;
    }

    public VariantTableStudyRow setStudyId(Integer studyId) {
        this.studyId = studyId;
        return this;
    }

    /**
     * Fills only changed columns of a PUT object. If no column changed, returns NULL.
     *
     * @param helper       VariantTableHelper
     * @param newSampleIds Sample IDs which are newly loaded, i.e. which were not in the original variant
     * @return NULL if no changes, else PUT object with changed columns
     * @throws IllegalStateException if sample ids are registered under the {@link #HOM_REF} genotype
     */
    public Put createSpecificPut(VariantTableHelper helper, Set<Integer> newSampleIds) {
        boolean doPut = false;
        byte[] generateRowKey = generateRowKey(helper);
        byte[] cf = helper.getColumnFamily();
        Integer sid = helper.getStudyId();
        Put put = new Put(generateRowKey);
        // Candidates for HOM_REF: every new sample found in another column is removed below
        Set<Integer> newHomRef = new HashSet<>(newSampleIds);

        /***** Complex GT *****/
        Set<Integer> foundIds = this.sampleToGenotype.keySet().stream()
                .filter(newSampleIds::contains)
                .collect(Collectors.toSet());

        /***** Secondary Alt list *****/
        // newRow.secAlternate // not needed to filter down //TODO check if new alternate is referenced
        // Function to extract index list of all alleles
        Function<Map.Entry<Integer, String>, Set<Integer>> function = (e) -> Genotype.parse(e.getValue()).stream()
                .flatMap(g -> g.toProtobuf().getAllelesIdxList().stream()).collect(Collectors.toSet());
        Set<Integer> oldIdx = this.sampleToGenotype.entrySet().stream().filter(e -> !newSampleIds.contains(e.getKey()))
                .map(function).flatMap(Set::stream).collect(Collectors.toSet());
        Set<Integer> newIdx = this.sampleToGenotype.entrySet().stream().filter(e -> newSampleIds.contains(e.getKey()))
                .map(function).flatMap(Set::stream).collect(Collectors.toSet());
        newIdx.removeAll(oldIdx);

        if (!newIdx.isEmpty() || !foundIds.isEmpty()) {
            doPut = true;
            put.addColumn(cf, Bytes.toBytes(buildColumnKey(sid, COMPLEX)), this.getComplexVariant().toByteArray());
            newHomRef.removeAll(foundIds);
        }

        /***** Filter *****/
        boolean filterChanged = this.filterToSamples.values().stream()
                .anyMatch(samples -> !Collections.disjoint(samples, newSampleIds));
        if (filterChanged) {
            doPut = true;
            put.addColumn(cf, Bytes.toBytes(buildColumnKey(sid, FILTER_OTHER)), this.getComplexFilter().toByteArray());
        }

        /**** PASS CNT ***/
        // New samples without any non-PASS filter entry
        Set<Integer> newPassIds = new HashSet<>(newSampleIds);
        this.filterToSamples.values().forEach(newPassIds::removeAll);
        if (!newPassIds.isEmpty()) {
            doPut = true;
            put.addColumn(cf, Bytes.toBytes(buildColumnKey(sid, PASS_CNT)), Bytes.toBytes(this.passCount));
        }

        /**** GT ***/
        Set<Integer> newCalls = new HashSet<>(newSampleIds);
        for (Entry<String, Set<Integer>> entry : this.callMap.entrySet()) {
            byte[] column = Bytes.toBytes(buildColumnKey(sid, entry.getKey()));
            boolean disjoint = Collections.disjoint(entry.getValue(), newSampleIds);
            if (!disjoint) {
                doPut = true;
                List<Integer> value = new ArrayList<>(entry.getValue());
                Collections.sort(value);
                byte[] bytesArray = PhoenixHelper.toBytes(value, PUnsignedIntArray.INSTANCE);
                put.addColumn(cf, column, bytesArray);
                newHomRef.removeAll(value);
                if (StringUtils.equals(entry.getKey(), NOCALL)) {
                    newCalls.removeAll(value);
                }
            }
        }

        if (!newHomRef.isEmpty()) {
            doPut = true;
            put.addColumn(cf, Bytes.toBytes(buildColumnKey(sid, HOM_REF)), Bytes.toBytes(this.homRefCount));
        }

        if (!this.callCount.equals(newCalls.size())) {
            doPut = true;
            put.addColumn(cf, Bytes.toBytes(buildColumnKey(sid, CALL_CNT)), Bytes.toBytes(this.callCount));
        }

        checkNoHomRefSamples(generateRowKey);

        if (doPut) {
            return put;
        }
        return null;
    }

    /**
     * Creates a PUT object with the variant type, the three counters and all non-empty columns of this row.
     *
     * @param helper VariantTableHelper
     * @return PUT object for this row
     * @throws IllegalStateException if sample ids are registered under the {@link #HOM_REF} genotype
     */
    public Put createPut(VariantTableHelper helper) {
        byte[] generateRowKey = generateRowKey(helper);
        checkNoHomRefSamples(generateRowKey);
        byte[] cf = helper.getColumnFamily();
        Integer sid = helper.getStudyId();
        Put put = new Put(generateRowKey);
        put.addColumn(cf, VariantPhoenixHelper.VariantColumn.TYPE.bytes(), Bytes.toBytes(this.type.toString()));
        put.addColumn(cf, Bytes.toBytes(buildColumnKey(sid, HOM_REF)), Bytes.toBytes(this.homRefCount));
        put.addColumn(cf, Bytes.toBytes(buildColumnKey(sid, PASS_CNT)), Bytes.toBytes(this.passCount));
        put.addColumn(cf, Bytes.toBytes(buildColumnKey(sid, CALL_CNT)), Bytes.toBytes(this.callCount));
        if (!this.secAlternate.isEmpty() || this.sampleToGenotype.size() > 0) {
            // add complex genotype column if required
            put.addColumn(cf, Bytes.toBytes(buildColumnKey(sid, COMPLEX)), this.getComplexVariant().toByteArray());
        }
        if (!this.filterToSamples.isEmpty()) {
            put.addColumn(cf, Bytes.toBytes(buildColumnKey(sid, FILTER_OTHER)), this.getComplexFilter().toByteArray());
        }
        for (Entry<String, Set<Integer>> entry : this.callMap.entrySet()) {
            byte[] column = Bytes.toBytes(buildColumnKey(sid, entry.getKey()));
            List<Integer> value = new ArrayList<>(entry.getValue());
            if (!value.isEmpty()) {
                Collections.sort(value);
                byte[] bytesArray = PhoenixHelper.toBytes(value, PUnsignedIntArray.INSTANCE);
                put.addColumn(cf, column, bytesArray);
            }
        }
        return put;
    }

    /**
     * HOM_REF is stored as a counter only, so sample ids must never be registered under the {@link #HOM_REF} genotype.
     *
     * @param rowKey Row key of this row, used for the error message only
     * @throws IllegalStateException if {@link #HOM_REF} is a key of the genotype map
     */
    private void checkNoHomRefSamples(byte[] rowKey) {
        if (this.callMap.containsKey(HOM_REF)) {
            throw new IllegalStateException(
                    String.format("HOM_REF data found for row %s for sample IDs %s",
                            Arrays.toString(rowKey), StringUtils.join(this.callMap.get(HOM_REF), ",")));
        }
    }

    /**
     * Creates a DELETE object for all {@link #STUDY_COLUMNS} of the study of {@code helper}.
     *
     * @param helper VariantTableHelper
     * @return DELETE object for this row
     */
    public Delete createDelete(VariantTableHelper helper) {
        byte[] generateRowKey = generateRowKey(helper);
        byte[] cf = helper.getColumnFamily();
        Integer sid = helper.getStudyId();
        Delete delete = new Delete(generateRowKey);
        for (String key : STUDY_COLUMNS) {
            delete.addColumn(cf, Bytes.toBytes(buildColumnKey(sid, key)));
        }
        return delete;
    }

    /**
     * Converts a list of rows into one protobuf message.
     *
     * @param rows      Rows to convert
     * @param timeStamp Timestamp to store in the message
     * @return Protobuf message with all rows
     */
    public static VariantTableStudyRowsProto toProto(List<VariantTableStudyRow> rows, long timeStamp) {
        return VariantTableStudyRowsProto.newBuilder()
                .addAllRows(rows.stream().map(VariantTableStudyRow::toProto).collect(Collectors.toList()))
                .setTimestamp(timeStamp)
                .build();
    }

    /**
     * Converts this row into its protobuf representation. Chromosome and study id are not part of the message.
     *
     * @return Protobuf row
     */
    public VariantTableStudyRowProto toProto() {
        // Invert sample -> genotype into genotype -> samples
        Map<String, List<Integer>> otherGt = new HashMap<>();
        for (Entry<Integer, String> entry : sampleToGenotype.entrySet()) {
            otherGt.computeIfAbsent(entry.getValue(), key -> new ArrayList<>()).add(entry.getKey());
        }
        return VariantTableStudyRowProto.newBuilder()
                .setStart(pos)
                .setReference(ref)
                .setAlternate(alt)
                .setType(toProto(type))
                .setCallCount(callCount)
                .setPassCount(passCount)
                .setHomRefCount(homRefCount)
                .addAllHomVar(callMap.getOrDefault(HOM_VAR, Collections.emptySet()))
                .addAllHet(callMap.getOrDefault(HET_REF, Collections.emptySet()))
                .addAllNocall(callMap.getOrDefault(NOCALL, Collections.emptySet()))
                .addAllOther(callMap.getOrDefault(OTHER, Collections.emptySet()))
                .addAllSecondaryAlternate(secAlternate)
                .putAllOtherGt(toSampleListMap(otherGt))
                .putAllFilterNonPass(toSampleListMap(this.filterToSamples))
                .build();
    }

    /**
     * @param type Avro variant type
     * @return Protobuf variant type with the same name
     */
    public VariantType toProto(org.opencb.biodata.models.variant.avro.VariantType type) {
        return VariantType.valueOf(type.toString());
    }

    /**
     * @param type Protobuf variant type
     * @return Avro variant type with the same name
     */
    public org.opencb.biodata.models.variant.avro.VariantType toAvro(VariantType type) {
        return org.opencb.biodata.models.variant.avro.VariantType.valueOf(type.toString());
    }

    private Map<String, SampleList> toSampleListMap(Map<String, ? extends Collection<Integer>> map) {
        return map.entrySet().stream()
                .collect(Collectors.toMap(
                        Entry::getKey,
                        entry -> SampleList.newBuilder().addAllSampleIds(entry.getValue()).build()));
    }

    /**
     * Parses an HBase result into one row per study found in the column names.
     *
     * @param result HBase result
     * @param helper Helper providing the column family and the row key decoding
     * @return One row per study id with non-empty data
     * @throws IllegalStateException if no study column with data exists
     */
    public static List<VariantTableStudyRow> parse(Result result, GenomeHelper helper) {
        NavigableMap<byte[], byte[]> familyMap = result.getFamilyMap(helper.getColumnFamily());
        Set<Integer> studyIds;
        if (familyMap == null) {
            // No data for the column family at all: same situation as "no study column found"
            studyIds = Collections.emptySet();
        } else {
            studyIds = familyMap.entrySet().stream()
                    .filter(entry -> entry.getValue() != null && entry.getValue().length > 0)
                    .map(entry -> extractStudyId(Bytes.toString(entry.getKey()), false))
                    .filter(integer -> integer != null)
                    .collect(Collectors.toSet());
        }
        if (studyIds.isEmpty()) {
            throw new IllegalStateException("No studies found!!!");
        }
        // The row key is the same for all studies and the rows only read from the variant: decode it once
        Variant variant = helper.extractVariantFromVariantRowKey(result.getRow());
        List<VariantTableStudyRow> rows = new ArrayList<>(studyIds.size());
        for (Integer studyId : studyIds) {
            rows.add(new VariantTableStudyRow(variant, studyId, familyMap, true));
        }
        return rows;
    }

    /**
     * Parses the columns of one study out of an HBase family map. Columns without data keep the default values.
     *
     * @param variant          Variant to create {@link VariantTableStudyRow#VariantTableStudyRow(Integer, Variant)} with
     * @param studyId          Study id
     * @param familyMap        Column name to value
     * @param skipOtherStudies If true, columns of other studies are skipped; otherwise they cause an exception
     * @throws IllegalStateException if a column of another study is found and {@code skipOtherStudies} is false, or if a
     *                               genotype column can not be parsed
     * @throws UncheckedIOException  if a protobuf column can not be parsed
     */
    public VariantTableStudyRow(Variant variant, Integer studyId, NavigableMap<byte[], byte[]> familyMap, boolean skipOtherStudies) {
        this(studyId, variant);
        for (Entry<byte[], byte[]> entry : familyMap.entrySet()) {
            if (entry.getValue() == null || entry.getValue().length == 0) {
                continue; // use default values, if no data for column exist
            }
            String colStr = Bytes.toString(entry.getKey());
            String[] colSplit = colStr.split("_", 2);
            if (!colSplit[0].equals(studyId.toString())) { // check study ID for consistency check
                if (skipOtherStudies) {
                    continue;
                } else {
                    throw new IllegalStateException(String.format("Expected study id %s, but found %s in row %s",
                            studyId.toString(), colSplit[0], colStr));
                }
            }
            if (colSplit.length < 2) {
                continue; // column name without separator: not one of the study columns
            }
            String gt = colSplit[1];
            switch (gt) {
                case HOM_REF:
                    homRefCount = parseCount(entry.getValue());
                    break;
                case CALL_CNT:
                    callCount = parseCount(entry.getValue());
                    break;
                case PASS_CNT:
                    passCount = parseCount(entry.getValue());
                    break;
                case COMPLEX:
                    try {
                        ComplexVariant complexVariant = ComplexVariant.parseFrom(entry.getValue());
                        setComplexVariant(complexVariant);
                    } catch (InvalidProtocolBufferException e) {
                        throw new UncheckedIOException(e);
                    }
                    break;
                case FILTER_OTHER:
                    try {
                        ComplexFilter complexFilter = ComplexFilter.parseFrom(entry.getValue());
                        setComplexFilter(complexFilter);
                    } catch (InvalidProtocolBufferException e) {
                        throw new UncheckedIOException(e);
                    }
                    break;
                case NOCALL:
                case HET_REF:
                case HOM_VAR:
                case OTHER:
                    try {
                        PhoenixArray phoenixArray = (PhoenixArray) PUnsignedIntArray.INSTANCE.toObject(entry.getValue());
                        HashSet<Integer> value = new HashSet<>();
                        if (phoenixArray.getArray() != null) {
                            int[] array = (int[]) phoenixArray.getArray();
                            for (int i : array) {
                                value.add(i);
                            }
                        }
                        callMap.put(gt, value);
                    } catch (Exception e) {
                        //possible!!!
                        throw new IllegalStateException(
                                "Issue parsing " + gt + "(" + colStr + ")" + " for " + variant
                                        + "; hexstring:[" + Bytes.toHex(entry.getValue()) + "]", e);
                    }
                    break;
                default:
                    // ignore otherwise
                    break;
            }
        }
    }

    /**
     * Parses the current row of a Phoenix result set into one row per study found in the column names.
     *
     * @param variant   Variant to create the rows with
     * @param resultSet Phoenix result set, positioned on the row to parse
     * @param helper    Not used by this method
     * @return One row per study id having at least one non-null column
     * @throws SQLException Problems accessing data in {@link ResultSet}
     */
    public static List<VariantTableStudyRow> parse(Variant variant, ResultSet resultSet, GenomeHelper helper) throws SQLException {
        ResultSetMetaData metaData = resultSet.getMetaData();
        Set<Integer> studyIds = new HashSet<>();
        for (int i = 0; i < metaData.getColumnCount(); i++) {
            String columnName = metaData.getColumnName(i + 1);
            if (columnName != null && !columnName.isEmpty()) {
                if (resultSet.getBytes(columnName) != null) {
                    Integer studyId = extractStudyId(columnName, false);
                    if (studyId != null) {
                        studyIds.add(studyId);
                    }
                }
            }
        }
        List<VariantTableStudyRow> rows = new ArrayList<>(studyIds.size());
        for (Integer studyId : studyIds) {
            rows.add(new VariantTableStudyRow(variant, resultSet, studyId));
        }
        return rows;
    }

    /**
     * Parse Phoenix ResultSet.
     *
     * @param variant   Variant to create {@link VariantTableStudyRow#VariantTableStudyRow(Integer, Variant)} with
     * @param resultSet Phoenix result set
     * @param studyId   Study id
     * @throws SQLException Problems accessing data in {@link ResultSet}
     */
    public VariantTableStudyRow(Variant variant, ResultSet resultSet, int studyId) throws SQLException {
        this(studyId, variant);
        homRefCount = resultSet.getInt(buildColumnKey(studyId, HOM_REF));
        callCount = resultSet.getInt(buildColumnKey(studyId, CALL_CNT));
        passCount = resultSet.getInt(buildColumnKey(studyId, PASS_CNT));
        byte[] xArr = resultSet.getBytes(buildColumnKey(studyId, COMPLEX));
        if (xArr != null && xArr.length > 0) {
            try {
                ComplexVariant complexVariant = ComplexVariant.parseFrom(xArr);
                setComplexVariant(complexVariant);
            } catch (IOException e) {
                throw new UncheckedIOException(e);
            }
        }
        byte[] fArr = resultSet.getBytes(buildColumnKey(studyId, FILTER_OTHER));
        if (fArr != null && fArr.length > 0) {
            try {
                ComplexFilter complexFilter = ComplexFilter.parseFrom(fArr);
                setComplexFilter(complexFilter);
            } catch (IOException e) {
                throw new UncheckedIOException(e);
            }
        }
        for (String gt : new String[]{HET_REF, HOM_VAR, OTHER, NOCALL}) {
            Array sqlArray = resultSet.getArray(buildColumnKey(studyId, gt));
            HashSet<Integer> value = new HashSet<>();
            if (sqlArray != null && sqlArray.getArray() != null) {
                int[] array = (int[]) sqlArray.getArray();
                for (int i : array) {
                    value.add(i);
                }
            }
            callMap.put(gt, value);
        }
    }

    /**
     * @param value Encoded counter, may be null or empty
     * @return Decoded counter, 0 for a null or empty value
     */
    private static Integer parseCount(byte[] value) {
        if (value == null || value.length == 0) {
            return 0;
        } else {
            return Bytes.toInt(value);
        }
    }

    /**
     * @param sid Study id
     * @param gt  Column suffix, e.g. {@link #HET_REF} or {@link #PASS_CNT}
     * @return Column name {@code <sid>_<gt>}
     */
    public static String buildColumnKey(Integer sid, String gt) {
        return String.valueOf(sid) + COLUMN_KEY_SEPARATOR + gt;
    }

    /**
     * Extracts the study id from a column name built by {@link #buildColumnKey(Integer, String)}.
     *
     * @param columnKey     Column name, may be null or blank
     * @param failOnMissing If true, throw an exception instead of returning null
     * @return Study id, or null if the column name does not start with a numeric study id and {@code failOnMissing} is false
     * @throws IllegalStateException if no study id can be extracted and {@code failOnMissing} is true
     */
    public static Integer extractStudyId(String columnKey, boolean failOnMissing) {
        // StringUtils.split returns null for null input and an empty array for "" or separator-only input,
        // so the first token must not be accessed blindly.
        String[] tokens = StringUtils.split(columnKey, COLUMN_KEY_SEPARATOR);
        String study = (tokens == null || tokens.length == 0) ? null : tokens[0];
        if (StringUtils.isNotBlank(columnKey)
                && Character.isDigit(columnKey.charAt(0))
                && StringUtils.isNumeric(study)) {
            return Integer.parseInt(study);
        } else {
            if (failOnMissing) {
                throw new IllegalStateException(String.format("Integer expected for study ID: extracted %s from %s ", study, columnKey));
            } else {
                return null;
            }
        }
    }

    /**
     * Creates a new VariantTableStudyRow from a single Variant object.
     *
     * <p>Every sample of the study entry is assigned to HOM_REF (counted only), {@link #HET_REF}, {@link #HOM_VAR},
     * {@link #NOCALL} or {@link #OTHER}; all but NOCALL increase the call count. Samples with filter "PASS" increase
     * the pass count, all others are registered by filter value (blank and "-" are stored as ".").</p>
     *
     * @param variant   The variant to convert
     * @param studyId   Study identifier
     * @param sampleIds Sample id mapping (sample name to numeric id)
     * @throws IllegalStateException if the variant has no study entry for {@code studyId}
     * @throws RuntimeException      wrapping any runtime problem while processing the samples, with the variant as JSON
     *                               in the message
     */
    public VariantTableStudyRow(Variant variant, Integer studyId, Map<String, Integer> sampleIds) {
        this(studyId, variant);
        int[] homRef = new Genotype("0/0").getAllelesIdx();
        int[] hetRef = new Genotype("0/1").getAllelesIdx();
        int[] hetRefOther = new Genotype("1|0").getAllelesIdx();
        int[] homVar = new Genotype("1/1").getAllelesIdx();
        int[] nocall = new Genotype(".").getAllelesIdx();
        int[] nocallBoth = new Genotype("./.").getAllelesIdx();

        Set<Integer> homref = new HashSet<>();
        StudyEntry se = variant.getStudy(studyId.toString());
        if (null == se) {
            throw new IllegalStateException("Study Entry of variant is null: " + variant);
        }
        try {
            Set<String> sampleSet = se.getSamplesName();

            // Create Secondary index
            if (null != se.getSecondaryAlternates() && se.getSecondaryAlternates().size() > 0) {
                List<VariantProto.AlternateCoordinate> arr = new ArrayList<>(se.getSecondaryAlternates().size());
                for (org.opencb.biodata.models.variant.avro.AlternateCoordinate altCoord : se.getSecondaryAlternates()) {
                    VariantProto.AlternateCoordinate.Builder ac = AlternateCoordinate.newBuilder();
                    ac.setChromosome(Objects.firstNonNull(altCoord.getChromosome(), ""))
                            .setStart(Objects.firstNonNull(altCoord.getStart(), 0))
                            .setEnd(Objects.firstNonNull(altCoord.getEnd(), 0))
                            .setReference(Objects.firstNonNull(altCoord.getReference(), ""))
                            .setAlternate(Objects.firstNonNull(altCoord.getAlternate(), ""));
                    VariantType vt = toProto(altCoord.getType());
                    ac.setType(vt);
                    arr.add(ac.build());
                }
                secAlternate = arr;
            }

            for (String sample : sampleSet) {
                Integer sid = sampleIds.get(sample);
                if (null == sid) {
                    throw new IllegalStateException("Sample id not found for " + sample);
                }

                // Work out Genotype
                String gtStr = se.getSampleData(sample, GT_KEY);
                List<Genotype> gtLst = Genotype.parse(gtStr);
                if (gtLst.isEmpty()) {
                    // No GT found for this individual
                    throw new IllegalStateException("No GT found for " + sample + ": " + variant.toJson());
                } else if (gtLst.size() == 1) {
                    Genotype gt = gtLst.get(0);
                    int[] alleleIdx = gt.getAllelesIdx();
                    if (Arrays.equals(alleleIdx, homRef)) {
                        addCallCount(1);
                        if (!homref.add(sid)) {
                            throw new IllegalStateException("Sample already exists as hom_ref " + sample);
                        }
                    } else if (Arrays.equals(alleleIdx, hetRef) || Arrays.equals(alleleIdx, hetRefOther)) {
                        addSampleId(HET_REF, sid);
                        addCallCount(1);
                    } else if (Arrays.equals(alleleIdx, homVar)) {
                        addSampleId(HOM_VAR, sid);
                        addCallCount(1);
                    } else if (Arrays.equals(alleleIdx, nocall) || Arrays.equals(alleleIdx, nocallBoth)) {
                        addSampleId(NOCALL, sid);
                    } else {
                        addSampleId(OTHER, sid);
                        addCallCount(1);
                        sampleToGenotype.put(sid, gtStr);
                    }
                } else {
                    // More than one genotype for the sample: keep the original genotype string
                    addSampleId(OTHER, sid);
                    addCallCount(1);
                    sampleToGenotype.put(sid, gtStr);
                }

                // Work out PASS / CALL count
                // Samples from Archive table have PASS/etc set. From Analysis table, the flag is empty (already counted)
                String filterString = se.getSampleData(sample, VariantMerger.VCF_FILTER);
                if (StringUtils.equals("PASS", filterString)) {
                    addPassCount(1);
                } else {
                    // Must count missing filter values!
                    if (StringUtils.isBlank(filterString) || StringUtils.equals("-", filterString)) {
                        filterString = "."; // Blank and '-' filters are saved together as missing
                    }
                    filterToSamples.computeIfAbsent(filterString, key -> new HashSet<>()).add(sid);
                }
            }
            addHomeRefCount(homref.size());
        } catch (RuntimeException e) {
            throw new RuntimeException("Problems with " + variant.toJson(), e);
        }
    }

    @Override
    public String toString() {
        return chromosome + ':' + pos + ':' + ref + ':' + alt;
    }

    /**
     * @return One-line summary with position, counters and the sample ids per genotype column
     */
    public String toSummaryString() {
        return String.format(
                "Submit %s: pass: %s; call: %s; hr: %s; 0/1: %s; 1/1: %s; ?: %s; .: %s",
                getPos(),
                getPassCount(),
                getCallCount(),
                getHomRefCount(),
                Arrays.toString(getSampleIds(HET_REF).toArray()),
                Arrays.toString(getSampleIds(HOM_VAR).toArray()),
                Arrays.toString(getSampleIds(OTHER).toArray()),
                Arrays.toString(getSampleIds(NOCALL).toArray())
        );
    }
}